In [1]:
import itertools
import numpy as np
import pandas as pd
# for data scaling and splitting
from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import train_test_split
# for neural net
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# for evaluation
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

Using TensorFlow backend.


In [2]:
data = pd.read_csv("data/combined_expression.csv")
data.head()

Unnamed: 0,CELL_LINE_NAME,classification,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,...,COL15A1,C6orf10,TMEM225,NOTCH4,PBX2,AGER,RNF5,AGPAT1,DFNB59,PRRT1
0,1240121,1,6.419526,3.182094,9.320548,3.759654,3.802619,3.215753,4.698729,7.873672,...,3.245454,2.953508,3.543429,3.352022,4.67231,3.641128,3.13531,3.737072,3.450927,3.1688
1,1240122,2,7.646494,2.626819,10.153853,3.564755,3.942749,3.29076,3.551675,8.252413,...,2.786709,3.077382,3.728232,3.208882,4.58684,3.395654,3.5868,3.519128,3.115323,3.051645
2,1240123,1,8.319417,3.111183,9.643558,4.757258,3.919757,3.602185,3.329644,9.07695,...,3.459089,3.085394,3.462811,3.33903,4.614897,3.395845,3.419193,3.971646,3.72931,3.320022
3,1240124,1,9.006994,3.028173,9.6867,4.280504,3.147646,3.188881,3.293807,8.67879,...,2.835403,2.960303,3.415083,3.290171,4.770123,3.400821,3.383734,3.798107,2.822404,3.297547
4,1240127,1,7.985676,2.694729,10.676134,4.159685,3.804637,3.481942,3.111261,7.555407,...,2.896523,2.849899,3.480114,3.226128,5.83271,3.612179,3.347095,4.457963,5.198524,4.553586


In [3]:
selected_genes = pd.read_csv('cleaned/boruta-99-25-0.01.csv')
selected_genes = selected_genes.values.tolist()
selected_genes = list(itertools.chain(*selected_genes))

In [4]:
# retrieving proper columns
X = data.loc[:, selected_genes]
y = data['classification'].values
# scaling the data
scalar = MinMaxScaler()
x_scaled = scalar.fit_transform(X)
# splitting data (20% test, 80% train)
X_train, X_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=0)

# Gridsearch for Input and Output Layer (no hidden layers)

## Optimizing Epochs and Batches

In [5]:
def create_model():
    model = Sequential()
    # adding layers
    model.add(Dense(len(selected_genes), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compiling
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [15]:
model = KerasClassifier(build_fn=create_model)
epochs = [10, 25, 50, 100]
batches = [16, 32, 64, 128]
param_grid = dict(epochs=epochs, batch_size=batches)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=0, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [17]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.754386 using {'batch_size': 64, 'epochs': 50}


In [43]:
grid_result.cv_results_

{'mean_fit_time': array([ 8.89025078, 12.75803318, 19.26391606, 34.30394907,  8.20030737,
        11.34873967, 13.6889308 , 18.04424515,  6.45221438,  8.43450785,
         9.72793574, 14.60181489,  7.19966426,  6.71270518,  6.27273889,
         8.398774  ]),
 'std_fit_time': array([0.16444237, 0.12779544, 0.54525051, 0.20913776, 0.52294618,
        0.51423753, 1.31642935, 0.08993733, 0.33960881, 0.32237099,
        0.4168923 , 0.24949539, 0.47613226, 0.29489586, 0.36781995,
        0.1170051 ]),
 'mean_score_time': array([0.94311066, 0.66564546, 1.10282264, 0.70391574, 0.9941628 ,
        1.18192902, 0.82030702, 0.78884726, 0.9586884 , 0.68544936,
        0.96039133, 0.6519465 , 0.95165958, 0.60001125, 0.47210975,
        0.18315883]),
 'std_score_time': array([0.03113768, 0.02067126, 0.14116664, 0.15304386, 0.22508128,
        0.21197436, 0.17951082, 0.15242896, 0.07337433, 0.09686909,
        0.27478934, 0.0482765 , 0.19803076, 0.03923923, 0.16044839,
        0.04926153]),
 'param_ba

## Tuning the Training Optimization Algorithm

In [47]:
def create_model2(optimizer='adam'):
    model = Sequential()
    # adding layers
    model.add(Dense(len(selected_genes), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compiling
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [48]:
model = KerasClassifier(build_fn=create_model2, epochs=50, batch_size=64)
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(optimizer=optimizer)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)
grid_result = grid.fit(X_train, y_train)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [49]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.734893 using {'optimizer': 'Adamax'}


# Testing the Model

In [None]:
model = KerasClassifier(build_fn=create_model, epochs=50, batch_size=64)
kfold = KFold(n_splits=5, shuffle=True)
results = cross_val_score(model, X_train, y_train, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
model.fit(X_train, y_train)
test_predictions = model.predict(X_test)

In [None]:
print(classification_report(y_test, test_predictions))

In [None]:
print(confusion_matrix(y_test,test_predictions))

In [None]:
model.save('model/model_1.h5')