In [4]:
import itertools
import numpy as np
import pandas as pd
# for data scaling and splitting
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# for neural net
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import regularizers
# for evaluation
from sklearn.model_selection import GridSearchCV

In [5]:
# Read the combined expression table from disk and preview its first rows.
data = pd.read_csv(filepath_or_buffer="data/combined_expression.csv")
data.head(n=5)

Unnamed: 0,CELL_LINE_NAME,cluster,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,...,C6orf10,TMEM225,NOTCH4,PBX2,AGER,RNF5,AGPAT1,DFNB59,PRRT1,FKBPL
0,1240123,2,8.319417,3.111183,9.643558,4.757258,3.919757,3.602185,3.329644,9.07695,...,3.085394,3.462811,3.33903,4.614897,3.395845,3.419193,3.971646,3.72931,3.320022,6.447316
1,1240131,1,7.611268,2.704739,10.276079,3.650299,3.481567,3.145538,3.565127,7.861068,...,2.801456,2.985889,3.180068,5.415729,3.299858,3.028414,3.877889,3.911516,3.379405,4.729557
2,1240132,1,7.678658,2.845781,10.180954,3.573048,3.431235,3.090781,4.116643,8.12119,...,2.934962,2.952937,3.164655,5.707506,3.434295,2.961345,4.272194,3.085696,3.002557,5.653588
3,1240134,1,3.265063,3.063746,10.490285,3.340791,3.676912,3.512821,3.873922,8.790851,...,3.041839,3.398847,3.10671,5.773963,3.412641,3.13611,4.422262,3.522122,3.509437,5.953242
4,1240140,1,7.090138,2.988043,10.264692,4.119555,3.432585,3.308033,3.318371,6.927761,...,3.028787,3.225982,3.27582,5.334283,3.864678,3.259242,3.840581,5.809553,3.674587,5.577503


In [6]:
# (number of rows, number of columns) of the loaded expression table
data.shape

(541, 16384)

In [7]:
# Read the Boruta selection file and flatten its rows (row by row, left to
# right) into a single flat list of values used below as column labels.
selected_genes = list(
    itertools.chain.from_iterable(
        pd.read_csv('cleaned/boruta.csv').values.tolist()
    )
)

In [8]:
# retrieving proper columns: features are the selected genes, target is 'cluster'
X = data.loc[:, selected_genes]
y = data['cluster'].values

# splitting data (20% test, 80% train) BEFORE scaling, so that the scaler never
# sees the held-out rows (fitting it on the full table leaks test-set min/max
# into the training features)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# scaling the data: learn the per-feature min/max on the training rows only,
# then apply that same transformation to the test rows
scalar = MinMaxScaler()
X_train = scalar.fit_transform(X_train_raw)
X_test = scalar.transform(X_test_raw)

# kept for backward compatibility with cells that read `x_scaled`; values of
# rows outside the training split may fall slightly outside [0, 1]
x_scaled = scalar.transform(X)

# Gridsearch

## 1 Hidden Layer

In [9]:
def one_layer(optimizer='sgd', init='normal', dropout=0.3, neurons=100):
    """Build and compile a binary classifier with a single hidden block.

    The hidden block is Dense(relu, L2 1e-4) -> BatchNormalization -> Dropout,
    followed by a one-unit sigmoid output layer.

    Parameters
    ----------
    optimizer : optimizer passed straight to ``model.compile``.
    init : accepted for signature compatibility; not used by the body.
    dropout : rate given to the ``Dropout`` layer.
    neurons : number of units in the hidden ``Dense`` layer.

    Returns
    -------
    The compiled ``Sequential`` model (binary cross-entropy loss, accuracy metric).
    """
    l2_penalty = regularizers.l2(1e-4)
    network = Sequential([
        # hidden block
        Dense(neurons, activation='relu', kernel_regularizer=l2_penalty),
        BatchNormalization(),
        Dropout(dropout),
        # output unit
        Dense(1, activation='sigmoid'),
    ])
    network.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return network

In [10]:
# Wrap the one-hidden-layer builder so scikit-learn can drive it as an estimator.
model = KerasClassifier(build_fn=one_layer)

# Candidate values for each tuned hyper-parameter.
epochs = [25, 50, 75]
batches = [16, 32, 64]
optimizers = ['sgd', 'adagrad', 'adam']
neurons = [100, 200, 300]

# Exhaustive search over all 3 * 3 * 3 * 3 = 81 combinations, 3-fold CV each,
# using every available core.
param_grid = {
    'epochs': epochs,
    'batch_size': batches,
    'optimizer': optimizers,
    'neurons': neurons,
}
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=3,
    verbose=1,
)
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed:  4.7min finished


Train on 432 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [11]:
# Report the best mean cross-validated score and the parameters that achieved it.
print(
    "Best: %f using %s for 1 hidden layer"
    % (grid_result.best_score_, grid_result.best_params_)
)

Best: 0.877315 using {'batch_size': 32, 'epochs': 25, 'neurons': 100, 'optimizer': 'sgd'} for 1 hidden layer


## 5 Hidden Layers

In [20]:
def five_layers(optimizer='sgd', init='normal', dropout=0.3, neurons=100):
    """Build and compile a binary classifier with five identical hidden blocks.

    Each hidden block is Dense(relu, L2 1e-4) -> BatchNormalization -> Dropout;
    the stack is followed by a one-unit sigmoid output layer.

    Parameters
    ----------
    optimizer : optimizer passed straight to ``model.compile``.
    init : accepted for signature compatibility; not used by the body.
    dropout : rate given to every ``Dropout`` layer.
    neurons : number of units in every hidden ``Dense`` layer.

    Returns
    -------
    The compiled ``Sequential`` model (binary cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    # add 5 hidden blocks; the loop index itself is not needed
    for _ in range(5):
        model.add(Dense(neurons, activation='relu', kernel_regularizer=regularizers.l2(1e-4)))
        model.add(BatchNormalization())
        model.add(Dropout(dropout))

    # output layer
    model.add(Dense(1, activation='sigmoid'))
    # compiling
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [13]:
# Wrap the five-hidden-layer builder so scikit-learn can drive it as an estimator.
model = KerasClassifier(build_fn=five_layers)

# Candidate values for each tuned hyper-parameter.
epochs = [25, 50, 75]
batches = [16, 32, 64]
optimizers = ['sgd', 'adagrad', 'adam']
neurons = [100, 200, 300]

# Exhaustive search over all 3 * 3 * 3 * 3 = 81 combinations, 3-fold CV each,
# using every available core.
param_grid = {
    'epochs': epochs,
    'batch_size': batches,
    'optimizer': optimizers,
    'neurons': neurons,
}
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=3,
    verbose=1,
)
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed: 16.6min finished
Train on 432 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [14]:
# Report the best mean cross-validated score and the parameters that achieved it.
print(
    "Best: %f using %s for 5 hidden layers"
    % (grid_result.best_score_, grid_result.best_params_)
)

Best: 0.884259 using {'batch_size': 16, 'epochs': 25, 'neurons': 300, 'optimizer': 'sgd'} for 5 hidden layers
