In [2]:
import csv
import json
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import tensorflow as tf
from collections import defaultdict
np.random.seed(90501)

In [7]:
from tensorflow.keras.utils import to_categorical

# train set
train_set = pd.read_csv('../data/final/train_reconstructed.csv')
X_train = train_set.iloc[:,:-1].values
y_train = train_set['edge'].values

# dev set
test_set = pd.read_csv('../data/final/dev-test.csv')
X_test = test_set.iloc[:,:-1].values
y_test = test_set['edge'].values

X_train = X_train.reshape(-1, X_train.shape[1], 1)
X_test = X_test.reshape(-1, X_test.shape[1], 1)

y_train = to_categorical(y_train, 2)
y_test = to_categorical(y_test, 2)

X_flatten_train = [instance.flatten() for instance in X_train]
X_flatten_test = [instance.flatten() for instance in X_test]

Just using one layer and seeing what happens gives us a good idea of what the simplest model could give us in terms of accuracy (around 72% - 78%)

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import SGD

nnmodel_1 = Sequential()
nnmodel_1.add(Flatten(input_shape= X_train.shape[1:]))
nnmodel_1.add(Dense(2, activation='softmax'))

nnmodel_1.compile(loss='categorical_crossentropy', optimizer = SGD(lr=0.01, decay =2e-6, momentum =0.9, nesterov = True),
                 metrics = ['AUC'])

nnmodel_1.summary()
nnmodel_1.fit(X_train, y_train, epochs = 20, batch_size = 128)

score = nnmodel_1.evaluate(X_test, y_test, batch_size = 128)

print("\nOn the test set: the loss is {} and the AUC is {}".format(score[0], score[1]))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 10)                0         
_________________________________________________________________
dense (Dense)                (None, 2)                 22        
Total params: 22
Trainable params: 22
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

On the test set: the loss is 1.1942864656448364 and the AUC is 0.7699442505836487


We now set a more complex model testing these activation functions: **['relu', 'tanh', 'sigmoid', 'selu', 'elu']**. Using two layers of 11 neurons each. We get that usually the best activation function is **tanh** with nearly **78% - 80%** of the accuracy in the test set.

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import SGD

max_accuracy = 0
for item in ['relu', 'tanh', 'sigmoid', 'selu', 'elu']:
    nnmodel_2 = Sequential()
    nnmodel_2.add(Flatten(input_shape = X_train.shape[1:]))
    nnmodel_2.add(Dense(11, activation = item))
    nnmodel_2.add(Dense(11, activation = item))
    nnmodel_2.add(Dense(2, activation = 'softmax'))

    nnmodel_2.compile(loss='categorical_crossentropy',
             optimizer = SGD(lr = 0.01, decay = 1e-6, momentum=0.9, nesterov = True),
             metrics = ['AUC'])

    nnmodel_2.fit(X_train, y_train, epochs = 20, batch_size = 256)

    score = nnmodel_2.evaluate(X_test, y_test, batch_size = 256)
    print(score)
    print("On the test set: the loss is {} and the AUC is {}".format(score[0], score[1]))
    accuracy = score[1]
    if accuracy >= max_accuracy:
        max_accuracy = accuracy
        iteration_max = item
    print(f"Max accuracy so far {max_accuracy} in iteration {iteration_max}")
    nnmodel_2.summary()

Epoch 1/20
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.7196690440177917, 0.8168138265609741]
On the test set: the loss is 0.7196690440177917 and the accuracy is 0.8168138265609741
Max accuracy so far 0.8168138265609741 in iteration relu
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 15)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 11)                176       
_________________________________________________________________
dense_2 (Dense)              (None, 11)                132       
___________________

In this case, we decide to tune the hyperparameters using KJerasClassifier and GridSearchCV

In [87]:
# pip install keras

In [182]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.layers import Activation, Dense, Dropout
import itertools

In this function, the model is created according to the specifications that we give as an input. This is very handy because we now can use the function GridSearchCV in order to get the "best" hyperparameters out of the set that we provided.

In [189]:
def create_model(layers, activation):
    model = Sequential()
    for i, nodes in enumerate(layers):
        if i == 0:
            model.add(Dense(nodes, input_dim = X_train.shape[1]))
            model.add(Activation(activation))
            model.add(Dropout(0.3))
        else:
            model.add(Dense(nodes))
            model.add(Activation(activation))
            model.add(Dropout(0.3))
    model.add(Dense(1))
    model.compile(loss='binary_crossentropy',
             optimizer = 'adam',
             metrics = ['AUC'])
    return model

model = KerasClassifier(build_fn = create_model, verbose = 0)

In [201]:
layers = []
for i in range(1, 4):
    sublayer = []
    sublayer.append([list(i) for i in itertools.combinations_with_replacement(range(8,14), i)])
    for item in sublayer[0]:
        layers.append(item)

In [202]:
activations = ['relu', 'tanh', 'sigmoid']
param_grid = dict(layers=layers, activation = activations, batch_size = [128], epochs = [20])
grid = GridSearchCV(estimator = model, param_grid = param_grid)

In [203]:
grid_result = grid.fit(X_train, y_train)

After running the previous chunck of code, we get that the most accurare parameters are the following. Nevertheless, it is not better at fitting the training data than the one selected in previous steps (0.96 accuracy vs 0.95 this time).

In [204]:
[grid_result.best_score_, grid_result.best_params_]
#[0.9504482170173003, {'activation': 'tanh', 'batch_size': 128, 'epochs': 20, 'layers': [8, 8, 8]}]

[0.9504482170173003,
 {'activation': 'tanh', 'batch_size': 128, 'epochs': 20, 'layers': [8, 8, 8]}]

Using values above, we now can get the "best fit" of those hyperparameters tested:

In [50]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.optimizers import SGD

nnmodel_3 = Sequential()
nnmodel_3.add(Flatten(input_shape = X_train.shape[1:]))
nnmodel_3.add(Dense(8, activation = "tanh"))
nnmodel_3.add(Dropout(0.3))
nnmodel_3.add(Dense(8, activation = "tanh"))
nnmodel_3.add(Dropout(0.3))
nnmodel_3.add(Dense(8, activation = "tanh"))
nnmodel_3.add(Dropout(0.3))
nnmodel_3.add(Dense(2, activation = "softmax"))

nnmodel_3.compile(loss='binary_crossentropy',
         optimizer = SGD(lr = 0.01, decay = 1e-6, momentum=0.9, nesterov = True),
         metrics = ['AUC'])

nnmodel_3.fit(X_train, y_train, epochs = 20, batch_size = 256)

score = nnmodel_3.evaluate(X_test, y_test, batch_size = 256)

print("On the test set: the loss is {} and the AUC is {}".format(score[0], score[1]))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
On the test set: the loss is 0.7103415727615356 and the AUC is 0.8569408655166626


In [51]:
score

[0.7103415727615356, 0.8569408655166626]

In [52]:
from sklearn import metrics

y_test_raw = test_set['edge'].values
y_test_predict = nnmodel_3.predict(X_test)

# get roc fpr and tpr
fpr, tpr, thresholds = metrics.roc_curve(y_test_raw, y_test_predict[:,1])

# results
print("AUC_test:{:.4f}".format(metrics.auc(fpr, tpr)))

AUC_test:0.8683


After runing the test againg, as stated previously, this model does not now an improvement against nnmodel_2 and that is the best NN model explored at the moment.

# Make prediction with full dataset

In [53]:
from tensorflow.keras.utils import to_categorical

# train set
train_set = pd.read_csv('../data/final/train_reconstructed.csv')
X_train = train_set.iloc[:,:-1].values
y_train = train_set['edge'].values

# dev set
test_set = pd.read_csv('../data/final/dev-test.csv')
X_test = test_set.iloc[:,:-1].values
y_test = test_set['edge'].values

X_big = np.concatenate((X_train,X_test),0)
y_big = np.concatenate((y_train,y_test),0)


#X_big = X_big.reshape(-1, X_big.shape[1], 1)
y_big = to_categorical(y_big, 2)
X_flatten_big = [instance.flatten() for instance in X_big]

In [55]:
# Training full model
test_final = pd.read_csv('../data/final/test-final.csv')
test_final = test_final.values

In [64]:
nnmodel_3 = Sequential()
nnmodel_3.add(Flatten(input_shape = X_big.shape[1:])) # here
nnmodel_3.add(Dense(8, activation = "tanh"))
nnmodel_3.add(Dropout(0.3))
nnmodel_3.add(Dense(8, activation = "tanh"))
nnmodel_3.add(Dropout(0.3))
nnmodel_3.add(Dense(8, activation = "tanh"))
nnmodel_3.add(Dropout(0.3))
nnmodel_3.add(Dense(2, activation = "softmax"))

nnmodel_3.compile(loss='binary_crossentropy',
         optimizer = SGD(lr = 0.01, decay = 1e-6, momentum=0.9, nesterov = True),
         metrics = ['AUC'])

nnmodel_3.fit(X_big, y_big, epochs = 20, batch_size = 256)  # here

score = nnmodel_3.evaluate(X_big, y_big, batch_size = 256)   # here

print("On the test set: the loss is {} and the AUC is {}".format(score[0], score[1]))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
On the test set: the loss is 0.15855763852596283 and the AUC is 0.9827247262001038


In [65]:
pred = nnmodel_3.predict(test_final)[:,1]
submission = {
    'Id': range(1,len(pred)+1),
    'Predicted': pred
}

submission_df = pd.DataFrame(data=submission)
submission_df.to_csv('../data/final/sub.csv', index=False)