In [2]:
import itertools
import numpy as np
import pandas as pd
# for data scaling and splitting
from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import train_test_split
# for neural net
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# for evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
data = pd.read_csv("data/combined_expression.csv")
data.head()

Unnamed: 0,CELL_LINE_NAME,cluster,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,...,C6orf10,TMEM225,NOTCH4,PBX2,AGER,RNF5,AGPAT1,DFNB59,PRRT1,FKBPL
0,1240123,2,8.319417,3.111183,9.643558,4.757258,3.919757,3.602185,3.329644,9.07695,...,3.085394,3.462811,3.33903,4.614897,3.395845,3.419193,3.971646,3.72931,3.320022,6.447316
1,1240131,3,7.611268,2.704739,10.276079,3.650299,3.481567,3.145538,3.565127,7.861068,...,2.801456,2.985889,3.180068,5.415729,3.299858,3.028414,3.877889,3.911516,3.379405,4.729557
2,1240132,3,7.678658,2.845781,10.180954,3.573048,3.431235,3.090781,4.116643,8.12119,...,2.934962,2.952937,3.164655,5.707506,3.434295,2.961345,4.272194,3.085696,3.002557,5.653588
3,1240134,3,3.265063,3.063746,10.490285,3.340791,3.676912,3.512821,3.873922,8.790851,...,3.041839,3.398847,3.10671,5.773963,3.412641,3.13611,4.422262,3.522122,3.509437,5.953242
4,1240140,3,7.090138,2.988043,10.264692,4.119555,3.432585,3.308033,3.318371,6.927761,...,3.028787,3.225982,3.27582,5.334283,3.864678,3.259242,3.840581,5.809553,3.674587,5.577503


In [4]:
data.shape

(541, 16384)

In [5]:
selected_genes = pd.read_csv('cleaned/boruta.csv')
selected_genes = selected_genes.values.tolist()
selected_genes = list(itertools.chain(*selected_genes))

In [6]:
# retrieving proper columns
X = data.loc[:, selected_genes]
y = data['cluster'].values

# scaling the data
scalar = MinMaxScaler()
x_scaled = scalar.fit_transform(X)

# splitting data (20% test, 80% train)
X_train, X_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=0)

# Gridsearch for Input and Output Layer

In [7]:
def create_model(optimizer='rmsprop', init='glorot_uniform', dropout=0.3):
    model = Sequential()
    # adding layers and adding droplayers to avoid overfitting
    hidden_layers = len(selected_genes)

    # first hidden layer
    model.add(Dense(hidden_layers, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    # second hidden layer
    model.add(Dense((hidden_layers*1.5), activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    # third hidden layer
    model.add(Dense((hidden_layers), activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    # fourth hidden layer
    model.add(Dense((hidden_layers*0.25), activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))

    model.add(Dense(3, activation='softmax'))
    # compiling
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [8]:
model = KerasClassifier(build_fn=create_model)
# parameters
epochs = [50, 75, 100, 150]
batches = [16, 32, 64, 128]
optimizers = ['SGD', 'RMSprop', 'Adagrad', 'Adam', 'Adamax']
init = ['glorot_uniform', 'normal', 'uniform']
# grid search
param_grid = dict(epochs=epochs, batch_size=batches,optimizer=optimizers,init=init)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 24.1min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 36.2min finished


Train on 432 samples
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


In [9]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.717593 using {'batch_size': 64, 'epochs': 75, 'init': 'normal', 'optimizer': 'Adagrad'}


In [10]:
grid_result.cv_results_

{'mean_fit_time': array([29.82347385, 37.74450461, 35.22818104, 33.04543535, 24.96807607,
        21.58027943, 32.26096543, 30.20425502, 29.15370663, 30.0019455 ,
        27.36595496, 34.31726933, 32.10289224, 29.20354907, 29.91525769,
        33.77772776, 45.74009275, 40.60837603, 39.95916279, 42.66015267,
        29.5885663 , 42.77599263, 38.91595801, 40.21461463, 38.98714749,
        31.22201371, 41.63639577, 41.12591434, 39.31136401, 41.82191714,
        42.72427487, 57.80267223, 51.43229238, 51.02590537, 51.8656443 ,
        42.83516129, 60.21969334, 56.94429874, 55.70062669, 52.36726491,
        42.6482842 , 54.9538513 , 51.80941931, 49.99627304, 50.91359949,
        61.86388334, 81.02049271, 78.52006475, 72.58623608, 67.39916913,
        58.84941808, 80.84872238, 74.53146919, 72.15816601, 69.82273014,
        59.62388563, 80.52583996, 75.09343799, 69.35380491, 68.72098303,
        24.83556604, 29.24116723, 23.31435672, 22.01664575, 21.57369796,
        18.20750999, 22.82201401, 