### Load data from csv

In [1]:
# Load data from csv
import pandas as pd
import numpy as np
import sidetable
from matplotlib import pyplot as plt
%matplotlib inline

df = pd.read_csv('data.csv')
df.columns = df.columns.str.lower()


### Preprocess data

In [2]:
# Drop unnecessary columns and rows having missing / null values
columns_to_drop = [1, 3, 4, 5, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 87]
df.drop(labels=df.columns[columns_to_drop], axis=1, inplace=True)

# Turn columns to lower case
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.set_index('id', inplace=True)

In [3]:
# Remove currency symbols in Wage and Value columns, turn them into numeric type, and change the column names
df['wage'] = np.where(df['wage'].str[-1].isin('M K'.split()), df['wage'].str[1:-1], np.nan)
df['value'] = np.where(df['value'].str[-1].isin('M K'.split()), df['value'].str[1:-1], np.nan)
df['wage'] = df['wage'].astype(dtype='float')
df['value'] = df['value'].astype(dtype='float')

In [4]:
'''
    Convert work_rate into numeric type using the following map:
        Low = 1 point, Medium = 2 points, High = 3 points    
'''
def work_rate(x):
    if x in ['Low/ Low']:
        return 2
    elif x in ['Low/ Medium', 'Medium/ Low']:
        return 3
    elif x in ['Low/ High', 'Medium/ Medium', 'High/ Low']:
        return 4
    elif x in ['High/ Medium', 'Medium/ High']:
        return 5
    elif x in ['High/ High']:
        return 6

df['work_rate'] = df['work_rate'].apply(work_rate)

In [5]:
# This function is to remove the plus sign and add the two numbers together in columns ['ls': 'rb']
def clean_plus_sign(x):
    if isinstance(x, str):
        x = x.split('+')
        x = list(map(int, x))
        return sum(x)
    
for i in df.loc[:,'ls': 'rb'].columns:
    df[i] = df[i].apply(clean_plus_sign)

In [6]:
# Convert these columns to float since Keras does not support int type
for col in df.select_dtypes(include='int64').columns:
    df[col] = df[col].astype('float')

In [7]:
X = df.drop(columns='value')
y = df.value.copy()

In [8]:
# Retrive num_cols and cat_cols
num_cols = list(X._get_numeric_data().columns) # Another way: X.select_dtypes(include='number').columns
cat_cols = list(set(X.columns) - set(num_cols)) # Another way: X.select_dtypes(exclude='number').columns

In [9]:
my_cols = X.columns

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector

cat_pipe = make_pipeline(
            (SimpleImputer(strategy='most_frequent')),
            (OneHotEncoder(drop='first'))
            )
num_pipe = make_pipeline(
            (KNNImputer()),
            (StandardScaler())
            )            
preprocess_pipeline = make_column_transformer(
            (cat_pipe, cat_cols),
            (num_pipe, num_cols)
            )

In [11]:
processor = preprocess_pipeline.fit(X)
X = processor.transform(X)
X = pd.DataFrame(data=X, columns=my_cols)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1)

In [13]:
'''
Credit source: 
    https://www.tensorflow.org/tutorials/keras/keras_tuner
    https://github.com/keras-team/keras-tuner/blob/master/examples/cifar10.py
'''

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Activation, Dense, Flatten, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam

# Create the keras tuner model.
def build_model(hp):
    hp_drop_out = hp.Float('dropout', 0, 0.5, step=0.1)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model = Sequential()
    for i in range(hp.Int('num_layers', 2, 20)):
        model.add(Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=512, step=32), activation=tf.nn.leaky_relu, kernel_regularizer=keras.regularizers.l1_l2(l1=1e-5, l2=1e-4)))
        model.add(BatchNormalization())
        model.add(Dropout(hp_drop_out))
    model.add(Dense(1, activation='linear', kernel_regularizer='l2'))
    opt = Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4]))
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                loss='mse',
                metrics=['mse'])
    return model

import kerastuner as kt

tuner = kt.Hyperband(build_model,
                     objective='val_mse', 
                     max_epochs=16,
                     overwrite=False)

INFO:tensorflow:Reloading Oracle from existing project .\untitled_project\oracle.json
INFO:tensorflow:Reloading Tuner from .\untitled_project\tuner0.json


In [14]:
es = EarlyStopping(monitor='mse', verbose=0, patience=10, min_delta=1e-3)
tuner.search(X_train, y_train, epochs=64, batch_size=128, validation_data=(X_val, y_val), callbacks=[es])

INFO:tensorflow:Oracle triggered exit


In [15]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=64, validation_data=(X_val, y_val))

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


In [16]:
val_acc_per_epoch = history.history['val_mse']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Best epoch: 1


In [17]:
# Re-instantiate the hypermodel and train it with the optimal number of epochs from above.
hypermodel = tuner.hypermodel.build(best_hps)
# Retrain the model
hypermodel.fit(X_train, y_train, epochs=best_epoch)



<tensorflow.python.keras.callbacks.History at 0x2691d8bf088>

In [18]:
eval_result = hypermodel.evaluate(X_val, y_val)
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [nan, nan]


In [19]:
# Retrain the model
hypermodel.fit(X_train, y_train, epochs=best_epoch)



<tensorflow.python.keras.callbacks.History at 0x2692a25e908>

In [20]:
# from keras.models import save_model
# hypermodel.save('best_model.h5')

In [21]:
# Load the model and predict
# from keras.models import load_model
# hypermodel = load_model('best_model.h5')
y_pred = hypermodel.predict(X_val)
y_pred = y_pred.astype(int)

In [22]:
y_pred = pd.DataFrame(y_pred, columns=['value'], index=X_val.index)
# saving the dataframe 
y_pred.to_csv('Predictions.csv') 