In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from keras import layers
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

### Baseline results - No Preprocessing

In [10]:
# Train and Test variables
x_train = np.array(train.drop(['X10','Y'],axis=1))
y_train = np.array(train['Y'])

x_test = np.array(test.drop(['X10','Y'],axis=1))
y_test = np.array(test['Y'])

# Model creation
model = tf.keras.Sequential([
    tf.keras.Input(shape=(9,)),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(64, activation=tf.nn.relu,),
    tf.keras.layers.Dense(32, activation=tf.nn.relu),
    tf.keras.layers.Dense(16, activation=tf.nn.relu),
    tf.keras.layers.Dense(8, activation=tf.nn.relu),
    tf.keras.layers.Dense(4, activation=tf.nn.relu),
    tf.keras.layers.Dense(1),
])

model.compile(optimizer='adam',loss='mean_absolute_error',metrics=['mean_absolute_percentage_error'])
history = model.fit(x_train, y_train, batch_size=32, epochs=20, validation_split = 0.2,verbose=1)
model.evaluate(x_test, y_test, verbose=2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
32/32 - 0s - loss: 54.6448 - mean_absolute_percentage_error: 6.3325 - 119ms/epoch - 4ms/step


[54.6447639465332, 6.332549095153809]

## Actual Exploration

### Data Processing

In [661]:
# Linear Scaling - Help Gradient descent converge quicker towards minima
train_x1_x9 = train.drop(['X10','Y'],axis=1)
train_x1_x9 = (train_x1_x9 - train_x1_x9.min()) / (train_x1_x9.max() - train_x1_x9.min())

test_x1_x9 = test.drop(['X10','Y'],axis=1)
test_x1_x9 = (test_x1_x9 - test_x1_x9.min()) / (test_x1_x9.max() - test_x1_x9.min())

# One Hot Encoding for categorical column 
train_x10 = pd.get_dummies(train['X10'],dtype=int)
test_x10 = pd.get_dummies(test['X10'],dtype=int)

x_train = np.array(pd.concat([train_x1_x9,train_x10],axis=1))
y_train = np.array(train['Y'])

x_test = np.array(pd.concat([test_x1_x9,test_x10],axis=1))
y_test = np.array(test['Y'])

### Hyperparameter Tuning - Grid Search

In [None]:
# Model creation
def create_model(optimizer,learning_rate):
    model = tf.keras.Sequential([
        keras.Input(shape=(15,)),
        layers.Dense(128,activation=tf.nn.relu),
        layers.Dense(64, activation=tf.nn.relu,),
        layers.Dense(128, activation=tf.nn.relu),
        layers.Dense(64, activation=tf.nn.relu),
        layers.Dense(32, activation=tf.nn.relu),
        layers.Dense(4, activation=tf.nn.relu),
        layers.Dense(1,activation=tf.nn.relu)
    ])
    opt = keras.optimizers.Adam(learning_rate=learning_rate) if optimizer == 'adam' else keras.optimizers.SGD(learning_rate=learning_rate)
    model.compile(optimizer=opt,loss='mean_absolute_error',metrics=['mean_absolute_percentage_error'])
    return model

model = KerasRegressor(model=create_model,epochs=20,batch_size=128)

param_grid = {
    'model__optimizer': ['adam','sgd'],
    'model__learning_rate': [0.005,0.01,0.05],
    'batch_size': [32,64,128],
    'epochs': [20,30,40]
}

grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_absolute_error',cv=5,verbose=2)
grid_result = grid.fit(x_train,y_train)

In [678]:
print(f"Best Negative MAE: {grid_result.best_score_} using {grid_result.best_params_}")

Best Negative MAE: -9.651480784693238 using {'batch_size': 64, 'epochs': 30, 'model__learning_rate': 0.005, 'model__optimizer': 'adam'}


In [None]:
# Due to Random nature of NN, need to run a few times to get optimal results
best_model = {'model_mae': 9999}
for i in range(30):
    model = tf.keras.Sequential([
        keras.Input(shape=(15,)),
        layers.Dense(128,activation=tf.nn.relu),
        layers.Dense(64, activation=tf.nn.relu,),
        layers.Dense(128, activation=tf.nn.relu),
        layers.Dense(64, activation=tf.nn.relu),
        layers.Dense(32, activation=tf.nn.relu),
        layers.Dense(4, activation=tf.nn.relu),
        layers.Dense(1,activation=tf.nn.relu)
    ])
    opt = keras.optimizers.Adam(learning_rate=0.005)
    model.compile(optimizer=opt,loss='mean_absolute_error',metrics=['mean_absolute_percentage_error'])
    model.fit(x_train, y_train, batch_size=64, epochs=30, validation_split = 0.2,verbose=0)
    
    model_mae, model_mape = model.evaluate(x_test, y_test, verbose=1)

    if model_mae < best_model['model_mae']:
        best_model['model'] = model
        best_model['model_mae'] = model_mae


In [677]:
print(f"MAE of best model {best_model['model_mae']}")

MAE of best model 6.389989852905273


In [676]:
# Save predictions
preds = pd.DataFrame(best_model['model'].predict(x_test),columns=['y_pred'])
pd.concat([test,preds],axis=1).to_csv('test_pred.csv',index=False)



In [652]:
# Save best result
best_model['model'].save('best_model.keras')

### Extra Exploration - Random Forest and XGBoost

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [502]:
regr = RandomForestRegressor(n_estimators=100,random_state=42,criterion='absolute_error')
regr.fit(x_train,y_train)

In [665]:
mean_absolute_error(y_test,regr.predict(x_test))

13.999135957161648

In [663]:


clf = xgb.XGBRegressor(n_estimators=1000,learning_rate=0.01,random_state=42,eval_metric='mae')
clf.fit(x_train,y_train)


In [664]:
mean_absolute_error(y_test,clf.predict(x_test))

10.68182660248541