> This notebook goes through a similar process as the previous one to train models on the UCI dataset, this time with deep learning models. Required libraries and the dataset are first imported.

In [1]:
import numpy as np
import pandas as pd
import pickle
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, auc, confusion_matrix, f1_score, classification_report
from sklearn.metrics import make_scorer, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.constraints import max_norm
from keras.wrappers.scikit_learn import KerasClassifier

In [2]:
df = pd.read_csv('EEG_UCI_dataset_powers.csv')
df = df.rename(columns={'Unnamed: 0': 'subject'})
df['status'] = (df['subject'].str.slice(start=3, stop=4) == "a").astype(int)
df.tail()

Unnamed: 0,subject,Fp1a delta,Fp1a theta,Fp1a alpha,Fp1a beta,Fp1a gamma,Fp2a delta,Fp2a theta,Fp2a alpha,Fp2a beta,...,P3/P4 theta,P3/P4 alpha,P3/P4 beta,P3/P4 gamma,O1/O2 delta,O1/O2 theta,O1/O2 alpha,O1/O2 beta,O1/O2 gamma,status
117,co3a0000458,5.034276,1.297364,2.77622,7.503791,1.389794,6.732382,1.791624,3.072971,8.030404,...,0.5543,0.502656,0.517526,0.541235,0.461837,0.480084,0.494039,0.503561,0.577265,1
118,co3a0000459,4.925952,1.284041,3.818269,4.602541,1.256179,4.730081,1.297648,3.812826,8.845018,...,0.476234,0.598763,0.56812,0.557745,0.508269,0.549875,0.526056,0.553384,0.539971,1
119,co3a0000460,5.413555,2.577622,2.853991,3.19214,0.737959,5.674537,2.723768,2.88598,3.208463,...,0.545288,0.59522,0.53965,0.551718,0.521833,0.565657,0.51636,0.473023,0.439638,1
120,co3a0000461,7.399629,1.507018,1.856139,2.528717,0.567038,5.505387,1.312715,2.250694,2.52448,...,0.605074,0.520692,0.566241,0.594854,0.525616,0.526378,0.529584,0.537405,0.588164,1
121,co3c0000402,5.284542,1.946691,1.135617,2.015422,0.492179,5.092513,1.986647,1.3372,2.166617,...,0.497312,0.492201,0.49275,0.493309,0.498716,0.516931,0.544294,0.517307,0.497229,0


In [3]:
print("Alcoholic subjects:", len(df.status.loc[df.status ==1]))
print("Control subjects:", len(df.status.loc[df.status == 0]))
print("Proportion of alcoholic subjects:", round(len(df.status.loc[df.status == 1]) / (len(df.status.loc[df.status == 1]) + len(df.status.loc[df.status == 0])), 3))

Alcoholic subjects: 77
Control subjects: 45
Proportion of alcoholic subjects: 0.631


> The dataset is split into training and test sets, then the KerasClassifier module is used in order to set up a pipeline for a grid search across hyperparameters using cross validation.

In [4]:
y = df.status
X = df.drop(['subject', 'status'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1234, stratify=df.status)
print(len(X_train), len(X_test), len(y_train), len(y_test))

91 31 91 31


In [5]:
def create_model(neurons1=100, neurons2=30, dropout1=0, dropout2=0, constraint=0):
    model = Sequential()
    model.add(Dense(neurons1, input_dim=150, activation='relu', kernel_constraint=max_norm(constraint)))
    model.add(Dropout(dropout1))
    model.add(Dense(neurons2, activation='relu', kernel_constraint=max_norm(constraint)))
    model.add(Dropout(dropout2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [6]:
pipeline = Pipeline([('scaler', StandardScaler()), ('nn', KerasClassifier(build_fn=create_model, verbose=0))])

> A number of models are then created using GridSearchCV or RandomizedSearchCV and tested on the test set. The choice of hyperparameters is then refined each time and the most favourable model is saved. (Note that this process is quite time-consuming and was therefore run over a number of sessions, as is apparent from the output numbers.)

In [18]:
# MODEL 1
np.random.seed(12)
batch_size = [10, 20, 40]
epochs = [10, 20, 40, 70, 100]
neurons1 = [50, 100, 150, 200, 250]
neurons2 = [10, 20, 30, 40, 50]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2)
clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
clf.fit(X_train, y_train)

Fitting 3 folds for each of 375 candidates, totalling 1125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 1125 out of 1125 | elapsed:  7.8min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('nn',
                                        <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x0000015DB3E229C8>)],
                                verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'nn__batch_size': [10, 20, 40],
                         'nn__epochs': [10, 20, 40, 70, 100],
                         'nn__neurons1': [50, 100, 150, 200, 250],
                         'nn__neurons2': [10, 20, 30, 40, 50]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [20]:
y_pred = (clf.predict(X_test) > 0.5).astype(int)
print(clf.best_score_, clf.best_params_)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7591397960980734 {'nn__batch_size': 40, 'nn__epochs': 10, 'nn__neurons1': 50, 'nn__neurons2': 50}
[[ 8  3]
 [ 4 16]]
              precision    recall  f1-score   support

           0       0.67      0.73      0.70        11
           1       0.84      0.80      0.82        20

    accuracy                           0.77        31
   macro avg       0.75      0.76      0.76        31
weighted avg       0.78      0.77      0.78        31



In [22]:
# MODEL 2
np.random.seed(12)
batch_size = [10, 20, 40]
epochs = [10, 20, 40]
neurons1 = [50, 100, 150, 200, 250]
neurons2 = [30, 40, 50, 60]
dropout1 = [0, 0.1, 0.2]
dropout2 = [0, 0.1, 0.2]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2)
clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=6, verbose=1)
clf.fit(X_train, y_train)

Fitting 3 folds for each of 1620 candidates, totalling 4860 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   24.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  3.1min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  5.3min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  8.3min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed: 11.6min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed: 15.7min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed: 20.2min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 25.2min
[Parallel(n_jobs=6)]: Done 4860 out of 4860 | elapsed: 30.2min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('nn',
                                        <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x0000015DB3E229C8>)],
                                verbose=False),
             iid='deprecated', n_jobs=6,
             param_grid={'nn__batch_size': [10, 20, 40],
                         'nn__dropout1': [0, 0.1, 0.2],
                         'nn__dropout2': [0, 0.1, 0.2],
                         'nn__epochs': [10, 20, 40],
                         'nn__neurons1': [50, 100, 150, 200, 250],
                         'nn__neurons2': [30, 40, 50, 60]},
             pre_dispatch='2*n_jobs', re

In [23]:
y_pred = (clf.predict(X_test) > 0.5).astype(int)
print(clf.best_score_, clf.best_params_)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7802867293357849 {'nn__batch_size': 20, 'nn__dropout1': 0, 'nn__dropout2': 0.2, 'nn__epochs': 10, 'nn__neurons1': 50, 'nn__neurons2': 50}
[[ 8  3]
 [ 7 13]]
              precision    recall  f1-score   support

           0       0.53      0.73      0.62        11
           1       0.81      0.65      0.72        20

    accuracy                           0.68        31
   macro avg       0.67      0.69      0.67        31
weighted avg       0.71      0.68      0.68        31



In [15]:
# MODEL 3
np.random.seed(12)
batch_size = [10, 20, 30, 40]
epochs = [8, 10, 12, 14, 16, 18, 20]
neurons1 = [50, 75, 100, 125, 150, 175, 200]
neurons2 = [30, 40, 50, 60]
dropout1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
dropout2 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
constraint = [0, 1, 2, 3, 4, 5]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
clf = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, cv=3, n_jobs=3, verbose=1, n_iter=1000)
clf.fit(X_train, y_train)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   16.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   51.4s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.2min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.9min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  7.1min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.6min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.8min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('scaler',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('nn',
                                              <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001F116E1F108>)],
                                      verbose=False),
                   iid='deprecated', n_iter=1000, n_jobs=3,
                   param_distributions={'nn__batch_size': [10, 20, 30, 40],
                                        'nn__constraint': [0, 1, 2, 3, 4, 5],
                                        'nn__dropout1': [0, 0.1, 0.2, 0.3, 0.4,
                                                         0.5],
                   

In [16]:
y_pred = (clf.predict(X_test) > 0.5).astype(int)
print(clf.best_score_, clf.best_params_)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
0.7806451519330343 {'nn__neurons2': 30, 'nn__neurons1': 100, 'nn__epochs': 8, 'nn__dropout2': 0.4, 'nn__dropout1': 0.4, 'nn__constraint': 4, 'nn__batch_size': 40}
[[ 8  3]
 [ 5 15]]
              precision    recall  f1-score   support

           0       0.62      0.73      0.67        11
           1       0.83      0.75      0.79        20

    accuracy                           0.74        31
   macro avg       0.72      0.74      0.73        31
weighted avg       0.76      0.74      0.75        31



In [17]:
# MODEL 4
np.random.seed(12)
batch_size = [30, 35, 40, 45, 50]
epochs = [6, 7, 8, 9, 10, 11, 12]
neurons1 = [25, 37, 50, 75, 100]
neurons2 = [30, 40, 50, 60]
dropout1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
dropout2 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
constraint = [0, 1, 2, 3, 4, 5]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
clf = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, cv=3, n_jobs=3, verbose=1, n_iter=1000, random_state=1)
clf.fit(X_train, y_train)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   11.6s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   44.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.1min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  5.1min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  7.3min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.7min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.9min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('scaler',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('nn',
                                              <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001F116E1F108>)],
                                      verbose=False),
                   iid='deprecated', n_iter=1000, n_jobs=3,
                   param_distributions={'nn__batch_size': [30, 35, 40, 45, 50],
                                        'nn__constraint': [0, 1, 2, 3, 4, 5],
                                        'nn__dropout1': [0, 0.1, 0.2, 0.3, 0.4,
                                                         0.5],
               

In [18]:
y_pred = (clf.predict(X_test) > 0.5).astype(int)
print(clf.best_score_, clf.best_params_)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8139784932136536 {'nn__neurons2': 50, 'nn__neurons1': 75, 'nn__epochs': 11, 'nn__dropout2': 0.4, 'nn__dropout1': 0.5, 'nn__constraint': 3, 'nn__batch_size': 45}
[[ 8  3]
 [ 4 16]]
              precision    recall  f1-score   support

           0       0.67      0.73      0.70        11
           1       0.84      0.80      0.82        20

    accuracy                           0.77        31
   macro avg       0.75      0.76      0.76        31
weighted avg       0.78      0.77      0.78        31



In [19]:
fitted_dl_model = clf.best_estimator_

In [20]:
# MODEL 5
np.random.seed(12)
batch_size = [40, 45, 50]
epochs = [10, 11, 12]
neurons1 = [67, 75, 83]
neurons2 = [45, 50, 55]
dropout1 = [0.4, 0.5]
dropout2 = [0.4, 0.5]
constraint = [3, 4]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=6, verbose=1)
clf.fit(X_train, y_train)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   11.4s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   33.6s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  2.4min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  3.9min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:  5.5min
[Parallel(n_jobs=6)]: Done 1944 out of 1944 | elapsed:  5.9min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('nn',
                                        <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001F116E1F108>)],
                                verbose=False),
             iid='deprecated', n_jobs=6,
             param_grid={'nn__batch_size': [40, 45, 50],
                         'nn__constraint': [3, 4], 'nn__dropout1': [0.4, 0.5],
                         'nn__dropout2': [0.4, 0.5], 'nn__epochs': [10, 11, 12],
                         'nn__neurons1': [67, 75, 83],
                         'nn__neurons2': [45, 50, 55]},
             pre_dispatch='2*n_jobs', refit=True, return_trai

In [21]:
y_pred = (clf.predict(X_test) > 0.5).astype(int)
print(clf.best_score_, clf.best_params_)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8035842378934225 {'nn__batch_size': 45, 'nn__constraint': 3, 'nn__dropout1': 0.5, 'nn__dropout2': 0.4, 'nn__epochs': 10, 'nn__neurons1': 75, 'nn__neurons2': 55}
[[ 6  5]
 [ 3 17]]
              precision    recall  f1-score   support

           0       0.67      0.55      0.60        11
           1       0.77      0.85      0.81        20

    accuracy                           0.74        31
   macro avg       0.72      0.70      0.70        31
weighted avg       0.74      0.74      0.74        31



> Model 1 and model 4 have produced the best results, so as model 4 has already been stored in a variable this will be saved for later use.
> 
> The entire model is not suitable for saving in a single file with either the Pickle or Joblib libraries, so it is split into two parts and saved as two files. The process for saving it this way and subsequently retrieving it is modified from https://prodevsblog.com/questions/125081/how-to-save-a-scikit-learn-pipline-with-keras-regressor-inside-to-disk/.

In [30]:
fitted_dl_model.named_steps['nn'].model.save('eeg_dl_model.h5')
fitted_dl_model.named_steps['nn'].model = None
joblib.dump(fitted_dl_model, 'eeg_dl_model.pkl')

['eeg_dl_model.pkl']

> **Nested cross validation model**
>
> Nested cross validation is applied, as per the previous notebook.

In [7]:
# PROCESS 1
np.random.seed(12)
batch_size = [30, 35, 40, 45, 50]
epochs = [6, 7, 8, 9, 10, 11, 12]
neurons1 = [25, 37, 50, 75, 100]
neurons2 = [30, 40, 50, 60]
dropout1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
dropout2 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
constraint = [0, 1, 2, 3, 4, 5]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
cv_outer = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
outer_results = list()
for train_ix, test_ix in cv_outer.split(X, y):
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    clf = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, cv=3, n_jobs=3, verbose=1, n_iter=1000, random_state=1)
    clf.fit(X_train, y_train)
    y_pred = (clf.predict(X_test) > 0.5).astype(int)
    print("\n", clf.best_score_, clf.best_params_)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    outer_results.append({'f1': f1_score(y_test, y_pred), 'sensitivity': recall_score(y_test, y_pred),
                          'specificity': recall_score(y_test, y_pred, pos_label=0), 'PPV': precision_score(y_test, y_pred),
                          'NPV': precision_score(y_test, y_pred, pos_label=0)})
print("\nMean scores:")
for score in ['f1', 'sensitivity', 'specificity', 'PPV', 'NPV']:
    print(" ", score, ":", np.array([dict[score] for dict in outer_results]).mean())

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   15.1s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   47.8s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.6min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.6min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.0min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.0min finished


Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).

 0.7154121994972229 {'nn__neurons2': 50, 'nn__neurons1': 25, 'nn__epochs': 7, 'nn__dropout2': 0, 'nn__dropout1': 0.5, 'nn__constraint': 5, 'nn__batch_size': 45}
[[ 4  7]
 [ 2 18]]
              precision    recall  f1-score   support

           0       0.67      0.36      0.47        11
           1       0.72      0.90      0.80        20

    accuracy                           0.71        31
   macro avg       0.69      0.63      0.64        31
weighted avg       0.70      0.71      0.68        31

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    8.3s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   41.0s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  2.8min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.6min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.7min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.2min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.4min finished



 0.7448028723398844 {'nn__neurons2': 40, 'nn__neurons1': 50, 'nn__epochs': 6, 'nn__dropout2': 0.3, 'nn__dropout1': 0.5, 'nn__constraint': 2, 'nn__batch_size': 45}
[[ 8  4]
 [ 1 18]]
              precision    recall  f1-score   support

           0       0.89      0.67      0.76        12
           1       0.82      0.95      0.88        19

    accuracy                           0.84        31
   macro avg       0.85      0.81      0.82        31
weighted avg       0.85      0.84      0.83        31

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   10.7s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   44.7s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.1min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.8min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.9min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.4min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.5min finished



 0.7279569904009501 {'nn__neurons2': 30, 'nn__neurons1': 25, 'nn__epochs': 9, 'nn__dropout2': 0, 'nn__dropout1': 0.1, 'nn__constraint': 4, 'nn__batch_size': 30}
[[ 4  7]
 [ 3 16]]
              precision    recall  f1-score   support

           0       0.57      0.36      0.44        11
           1       0.70      0.84      0.76        19

    accuracy                           0.67        30
   macro avg       0.63      0.60      0.60        30
weighted avg       0.65      0.67      0.65        30

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   10.2s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   45.6s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.1min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.9min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.9min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.3min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.5min finished



 0.7935483853022257 {'nn__neurons2': 60, 'nn__neurons1': 25, 'nn__epochs': 8, 'nn__dropout2': 0.4, 'nn__dropout1': 0, 'nn__constraint': 1, 'nn__batch_size': 35}
[[ 6  5]
 [ 6 13]]
              precision    recall  f1-score   support

           0       0.50      0.55      0.52        11
           1       0.72      0.68      0.70        19

    accuracy                           0.63        30
   macro avg       0.61      0.61      0.61        30
weighted avg       0.64      0.63      0.64        30


Mean scores:
  f1 : 0.7856640612738174
  sensitivity : 0.8434210526315788
  specificity : 0.48484848484848486
  PPV : 0.739014053579271
  NPV : 0.6567460317460316


In [8]:
# PROCESS 2
np.random.seed(12)
batch_size = [30, 35, 40, 45]
epochs = [6, 7, 8, 9]
neurons1 = [25, 30, 35, 40, 45, 50]
neurons2 = [30, 40, 50, 60]
dropout1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
dropout2 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
constraint = [0, 1, 2, 3, 4, 5]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
cv_outer = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
outer_results = list()
for train_ix, test_ix in cv_outer.split(X, y):
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    clf = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, cv=3, n_jobs=3, verbose=1, n_iter=1000, random_state=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("\n", clf.best_score_, clf.best_params_)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    outer_results.append({'f1': f1_score(y_test, y_pred), 'sensitivity': recall_score(y_test, y_pred),
                          'specificity': recall_score(y_test, y_pred, pos_label=0), 'PPV': precision_score(y_test, y_pred),
                          'NPV': precision_score(y_test, y_pred, pos_label=0)})
print("\nMean scores:")
for score in ['f1', 'sensitivity', 'specificity', 'PPV', 'NPV']:
    print(" ", score, ":", np.array([dict[score] for dict in outer_results]).mean())

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   12.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   46.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.1min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.8min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.9min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.3min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.4min finished



 0.7580645084381104 {'nn__neurons2': 50, 'nn__neurons1': 35, 'nn__epochs': 8, 'nn__dropout2': 0.3, 'nn__dropout1': 0.3, 'nn__constraint': 2, 'nn__batch_size': 45}
[[ 7  4]
 [ 7 13]]
              precision    recall  f1-score   support

           0       0.50      0.64      0.56        11
           1       0.76      0.65      0.70        20

    accuracy                           0.65        31
   macro avg       0.63      0.64      0.63        31
weighted avg       0.67      0.65      0.65        31

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    8.7s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   42.7s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.6min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.7min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.1min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.2min finished



 0.7240143418312073 {'nn__neurons2': 40, 'nn__neurons1': 35, 'nn__epochs': 9, 'nn__dropout2': 0.3, 'nn__dropout1': 0.4, 'nn__constraint': 4, 'nn__batch_size': 40}
[[ 6  6]
 [ 0 19]]
              precision    recall  f1-score   support

           0       1.00      0.50      0.67        12
           1       0.76      1.00      0.86        19

    accuracy                           0.81        31
   macro avg       0.88      0.75      0.77        31
weighted avg       0.85      0.81      0.79        31

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    9.9s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   47.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.1min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.9min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  7.0min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.7min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.8min finished



 0.7390681107838949 {'nn__neurons2': 40, 'nn__neurons1': 40, 'nn__epochs': 8, 'nn__dropout2': 0.5, 'nn__dropout1': 0.3, 'nn__constraint': 4, 'nn__batch_size': 45}
[[ 6  5]
 [ 3 16]]
              precision    recall  f1-score   support

           0       0.67      0.55      0.60        11
           1       0.76      0.84      0.80        19

    accuracy                           0.73        30
   macro avg       0.71      0.69      0.70        30
weighted avg       0.73      0.73      0.73        30

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   11.2s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   49.6s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.1min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.8min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.8min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.2min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.3min finished



 0.8146953384081522 {'nn__neurons2': 30, 'nn__neurons1': 35, 'nn__epochs': 6, 'nn__dropout2': 0, 'nn__dropout1': 0.1, 'nn__constraint': 2, 'nn__batch_size': 45}
[[ 6  5]
 [ 4 15]]
              precision    recall  f1-score   support

           0       0.60      0.55      0.57        11
           1       0.75      0.79      0.77        19

    accuracy                           0.70        30
   macro avg       0.68      0.67      0.67        30
weighted avg       0.70      0.70      0.70        30


Mean scores:
  f1 : 0.783892458892459
  sensitivity : 0.8203947368421052
  specificity : 0.5568181818181818
  PPV : 0.7591526610644257
  NPV : 0.6916666666666667


In [9]:
# PROCESS 3
np.random.seed(12)
batch_size = [40, 45]
epochs = [6, 7, 8, 9]
neurons1 = [35, 40]
neurons2 = [30, 40, 50]
dropout1 = [0.1, 0.2, 0.3, 0.4]
dropout2 = [0, 0.3, 0.5]
constraint = [2, 3, 4]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
cv_outer = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
outer_results = list()
for train_ix, test_ix in cv_outer.split(X, y):
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=6, verbose=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("\n", clf.best_score_, clf.best_params_)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    outer_results.append({'f1': f1_score(y_test, y_pred), 'sensitivity': recall_score(y_test, y_pred),
                          'specificity': recall_score(y_test, y_pred, pos_label=0), 'PPV': precision_score(y_test, y_pred),
                          'NPV': precision_score(y_test, y_pred, pos_label=0)})
print("\nMean scores:")
for score in ['f1', 'sensitivity', 'specificity', 'PPV', 'NPV']:
    print(" ", score, ":", np.array([dict[score] for dict in outer_results]).mean())

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   10.8s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   32.3s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  2.2min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  3.6min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:  5.1min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:  6.9min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed:  9.1min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 11.4min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 14.1min
[Parallel(n_jobs=6)]: Done 5184 out of 5184 | elapsed: 14.6min finished



 0.7684587637583414 {'nn__batch_size': 45, 'nn__constraint': 4, 'nn__dropout1': 0.4, 'nn__dropout2': 0.3, 'nn__epochs': 6, 'nn__neurons1': 40, 'nn__neurons2': 50}
[[ 6  5]
 [ 5 15]]
              precision    recall  f1-score   support

           0       0.55      0.55      0.55        11
           1       0.75      0.75      0.75        20

    accuracy                           0.68        31
   macro avg       0.65      0.65      0.65        31
weighted avg       0.68      0.68      0.68        31

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.2s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   30.4s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  2.2min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  3.6min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:  5.2min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:  7.0min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed:  9.1min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 11.5min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 14.2min
[Parallel(n_jobs=6)]: Done 5184 out of 5184 | elapsed: 14.8min finished



 0.8333333333333334 {'nn__batch_size': 45, 'nn__constraint': 3, 'nn__dropout1': 0.3, 'nn__dropout2': 0, 'nn__epochs': 7, 'nn__neurons1': 35, 'nn__neurons2': 30}
[[ 6  6]
 [ 1 18]]
              precision    recall  f1-score   support

           0       0.86      0.50      0.63        12
           1       0.75      0.95      0.84        19

    accuracy                           0.77        31
   macro avg       0.80      0.72      0.73        31
weighted avg       0.79      0.77      0.76        31

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    8.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   34.4s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  2.3min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  3.5min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:  5.1min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:  7.0min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed:  9.1min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 11.6min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 14.2min
[Parallel(n_jobs=6)]: Done 5184 out of 5184 | elapsed: 14.8min finished



 0.792831540107727 {'nn__batch_size': 45, 'nn__constraint': 4, 'nn__dropout1': 0.2, 'nn__dropout2': 0.3, 'nn__epochs': 6, 'nn__neurons1': 35, 'nn__neurons2': 30}
[[ 6  5]
 [ 9 10]]
              precision    recall  f1-score   support

           0       0.40      0.55      0.46        11
           1       0.67      0.53      0.59        19

    accuracy                           0.53        30
   macro avg       0.53      0.54      0.52        30
weighted avg       0.57      0.53      0.54        30

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.9s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   32.9s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  2.2min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  3.5min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:  5.0min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:  6.9min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed:  9.0min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed: 11.4min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed: 14.2min
[Parallel(n_jobs=6)]: Done 5184 out of 5184 | elapsed: 14.7min finished



 0.8043010830879211 {'nn__batch_size': 40, 'nn__constraint': 4, 'nn__dropout1': 0.4, 'nn__dropout2': 0.5, 'nn__epochs': 8, 'nn__neurons1': 40, 'nn__neurons2': 40}
[[ 6  5]
 [ 8 11]]
              precision    recall  f1-score   support

           0       0.43      0.55      0.48        11
           1       0.69      0.58      0.63        19

    accuracy                           0.57        30
   macro avg       0.56      0.56      0.55        30
weighted avg       0.59      0.57      0.57        30


Mean scores:
  f1 : 0.7010040062536643
  sensitivity : 0.700657894736842
  specificity : 0.5340909090909091
  PPV : 0.7135416666666666
  NPV : 0.5577922077922077


> Process 2 has produced the most helpful set of results, in particular the highest positive and negative predictive values (class 1 and 0 precision respectively), so this process is repeated on the full dataset using 3-fold cross validation. The resulting model is then saved as two files.

In [10]:
np.random.seed(12)
batch_size = [30, 35, 40, 45]
epochs = [6, 7, 8, 9]
neurons1 = [25, 30, 35, 40, 45, 50]
neurons2 = [30, 40, 50, 60]
dropout1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
dropout2 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
constraint = [0, 1, 2, 3, 4, 5]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
clf = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, cv=3, n_jobs=3, verbose=1, n_iter=1000, random_state=1)
clf.fit(X, y)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   11.8s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   45.7s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.8min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.9min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.3min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.4min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('scaler',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('nn',
                                              <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001FD7C2C1F08>)],
                                      verbose=False),
                   iid='deprecated', n_iter=1000, n_jobs=3,
                   param_distributions={'nn__batch_size': [30, 35, 40, 45],
                                        'nn__constraint': [0, 1, 2, 3, 4, 5],
                                        'nn__dropout1': [0, 0.1, 0.2, 0.3, 0.4,
                                                         0.5],
                   

In [11]:
print(clf.best_score_, clf.best_params_)
fitted_dl_model_ncv = clf.best_estimator_

0.7697154482205709 {'nn__neurons2': 30, 'nn__neurons1': 50, 'nn__epochs': 6, 'nn__dropout2': 0.5, 'nn__dropout1': 0, 'nn__constraint': 3, 'nn__batch_size': 45}


In [12]:
fitted_dl_model_ncv.named_steps['nn'].model.save('eeg_dl_model_ncv.h5')
fitted_dl_model_ncv.named_steps['nn'].model = None
joblib.dump(fitted_dl_model_ncv, 'eeg_dl_model_ncv.pkl')

['eeg_dl_model_ncv.pkl']

> **WITHOUT ABSOLUTE POWERS**
>
> A similar set of models is then produced using a version of the dataset without absolute powers.

In [5]:
y = df.status
X = df.drop(['subject', 'Fp1a delta', 'Fp1a theta', 'Fp1a alpha', 'Fp1a beta',
       'Fp1a gamma', 'Fp2a delta', 'Fp2a theta', 'Fp2a alpha',
       'Fp2a beta', 'Fp2a gamma', 'F3a delta', 'F3a theta', 'F3a alpha',
       'F3a beta', 'F3a gamma', 'F4a delta', 'F4a theta', 'F4a alpha',
       'F4a beta', 'F4a gamma', 'F7a delta', 'F7a theta', 'F7a alpha',
       'F7a beta', 'F7a gamma', 'F8a delta', 'F8a theta', 'F8a alpha',
       'F8a beta', 'F8a gamma', 'C3a delta', 'C3a theta', 'C3a alpha',
       'C3a beta', 'C3a gamma', 'C4a delta', 'C4a theta', 'C4a alpha',
       'C4a beta', 'C4a gamma', 'P3a delta', 'P3a theta', 'P3a alpha',
       'P3a beta', 'P3a gamma', 'P4a delta', 'P4a theta', 'P4a alpha',
       'P4a beta', 'P4a gamma', 'O1a delta', 'O1a theta', 'O1a alpha',
       'O1a beta', 'O1a gamma', 'O2a delta', 'O2a theta', 'O2a alpha',
       'O2a beta', 'O2a gamma', 'status'], axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1234, stratify=df.status)

In [7]:
def create_model(neurons1=100, neurons2=30, dropout1=0, dropout2=0, constraint=0):
    model = Sequential()
    model.add(Dense(neurons1, input_dim=90, activation='relu', kernel_constraint=max_norm(constraint)))
    model.add(Dropout(dropout1))
    model.add(Dense(neurons2, activation='relu', kernel_constraint=max_norm(constraint)))
    model.add(Dropout(dropout2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

pipeline = Pipeline([('scaler', StandardScaler()), ('nn', KerasClassifier(build_fn=create_model, verbose=0))])

In [19]:
# MODEL 1
np.random.seed(12)
batch_size = [30, 35, 40, 45, 50]
epochs = [6, 7, 8, 9, 10, 11, 12]
neurons1 = [25, 37, 50, 75, 100]
neurons2 = [30, 40, 50, 60]
dropout1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
dropout2 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
constraint = [0, 1, 2, 3, 4, 5]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
clf = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, cv=3, n_jobs=3, verbose=1, n_iter=1000)
clf.fit(X_train, y_train)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   12.2s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   44.7s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.7min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.8min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.3min
[Parallel(n_jobs=3)]: Done 3000 out of 3000 | elapsed: 11.4min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('scaler',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('nn',
                                              <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001FD7F81BA08>)],
                                      verbose=False),
                   iid='deprecated', n_iter=1000, n_jobs=3,
                   param_distributions={'nn__batch_size': [30, 35, 40, 45, 50],
                                        'nn__constraint': [0, 1, 2, 3, 4, 5],
                                        'nn__dropout1': [0, 0.1, 0.2, 0.3, 0.4,
                                                         0.5],
               

In [20]:
y_pred = (clf.predict(X_test) > 0.5).astype(int)
print(clf.best_score_, clf.best_params_)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7265233000119528 {'nn__neurons2': 50, 'nn__neurons1': 37, 'nn__epochs': 9, 'nn__dropout2': 0.1, 'nn__dropout1': 0.5, 'nn__constraint': 5, 'nn__batch_size': 30}
[[ 5  6]
 [ 3 17]]
              precision    recall  f1-score   support

           0       0.62      0.45      0.53        11
           1       0.74      0.85      0.79        20

    accuracy                           0.71        31
   macro avg       0.68      0.65      0.66        31
weighted avg       0.70      0.71      0.70        31



In [21]:
fitted_dl_model_no_ap = clf.best_estimator_

In [22]:
# MODEL 2
np.random.seed(12)
batch_size = [25, 30, 35]
epochs = [8, 9, 10]
neurons1 = [31, 37, 43]
neurons2 = [45, 50, 55]
dropout1 = [0.4, 0.5]
dropout2 = [0, 0.1, 0.2]
constraint = [4, 5]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=3, verbose=1)
clf.fit(X_train, y_train)

Fitting 3 folds for each of 972 candidates, totalling 2916 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   12.5s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   46.2s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.7min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.7min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.3min
[Parallel(n_jobs=3)]: Done 2916 out of 2916 | elapsed: 11.1min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('nn',
                                        <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001FD7F81BA08>)],
                                verbose=False),
             iid='deprecated', n_jobs=3,
             param_grid={'nn__batch_size': [25, 30, 35],
                         'nn__constraint': [4, 5], 'nn__dropout1': [0.4, 0.5],
                         'nn__dropout2': [0, 0.1, 0.2],
                         'nn__epochs': [8, 9, 10], 'nn__neurons1': [31, 37, 43],
                         'nn__neurons2': [45, 50, 55]},
             pre_dispatch='2*n_jobs', refit=True, return_tra

In [23]:
y_pred = (clf.predict(X_test) > 0.5).astype(int)
print(clf.best_score_, clf.best_params_)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7591397762298584 {'nn__batch_size': 35, 'nn__constraint': 4, 'nn__dropout1': 0.4, 'nn__dropout2': 0.1, 'nn__epochs': 8, 'nn__neurons1': 37, 'nn__neurons2': 45}
[[ 6  5]
 [ 6 14]]
              precision    recall  f1-score   support

           0       0.50      0.55      0.52        11
           1       0.74      0.70      0.72        20

    accuracy                           0.65        31
   macro avg       0.62      0.62      0.62        31
weighted avg       0.65      0.65      0.65        31



> Model 1 has produced the more clinically useful set of results so this one is saved.

In [24]:
fitted_dl_model_no_ap.named_steps['nn'].model.save('eeg_dl_model_no_ap.h5')
fitted_dl_model_no_ap.named_steps['nn'].model = None
joblib.dump(fitted_dl_model_no_ap, 'eeg_dl_model_no_ap.pkl')

['eeg_dl_model_no_ap.pkl']

> **Nested cross validation model**

In [28]:
# PROCESS 1
np.random.seed(12)
batch_size = [30, 35, 40, 45, 50]
epochs = [6, 7, 8, 9, 10, 11, 12]
neurons1 = [25, 37, 50, 75, 100]
neurons2 = [30, 40, 50, 60]
dropout1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
dropout2 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
constraint = [0, 1, 2, 3, 4, 5]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
cv_outer = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
outer_results = list()
for train_ix, test_ix in cv_outer.split(X, y):
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    clf = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, cv=3, n_jobs=5, verbose=1, n_iter=1000, random_state=1)
    clf.fit(X_train, y_train)
    y_pred = (clf.predict(X_test) > 0.5).astype(int)
    print("\n", clf.best_score_, clf.best_params_)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    outer_results.append({'f1': f1_score(y_test, y_pred), 'sensitivity': recall_score(y_test, y_pred),
                          'specificity': recall_score(y_test, y_pred, pos_label=0), 'PPV': precision_score(y_test, y_pred),
                          'NPV': precision_score(y_test, y_pred, pos_label=0)})
print("\nMean scores:")
for score in ['f1', 'sensitivity', 'specificity', 'PPV', 'NPV']:
    print(" ", score, ":", np.array([dict[score] for dict in outer_results]).mean())

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   10.6s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:   34.5s
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:  1.4min
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:  2.6min
[Parallel(n_jobs=5)]: Done 1240 tasks      | elapsed:  4.3min
[Parallel(n_jobs=5)]: Done 1790 tasks      | elapsed:  6.4min
[Parallel(n_jobs=5)]: Done 2440 tasks      | elapsed:  8.8min
[Parallel(n_jobs=5)]: Done 3000 out of 3000 | elapsed: 11.0min finished



 0.6594982147216797 {'nn__neurons2': 30, 'nn__neurons1': 25, 'nn__epochs': 6, 'nn__dropout2': 0.2, 'nn__dropout1': 0.3, 'nn__constraint': 3, 'nn__batch_size': 35}
[[ 4  7]
 [ 6 14]]
              precision    recall  f1-score   support

           0       0.40      0.36      0.38        11
           1       0.67      0.70      0.68        20

    accuracy                           0.58        31
   macro avg       0.53      0.53      0.53        31
weighted avg       0.57      0.58      0.58        31

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    7.8s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:   34.6s
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:  1.4min
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:  2.5min
[Parallel(n_jobs=5)]: Done 1240 tasks      | elapsed:  4.0min
[Parallel(n_jobs=5)]: Done 1790 tasks      | elapsed:  5.6min
[Parallel(n_jobs=5)]: Done 2440 tasks      | elapsed:  7.6min
[Parallel(n_jobs=5)]: Done 3000 out of 3000 | elapsed:  9.3min finished



 0.7347670197486877 {'nn__neurons2': 40, 'nn__neurons1': 25, 'nn__epochs': 7, 'nn__dropout2': 0.2, 'nn__dropout1': 0, 'nn__constraint': 2, 'nn__batch_size': 30}
[[ 2 10]
 [ 0 19]]
              precision    recall  f1-score   support

           0       1.00      0.17      0.29        12
           1       0.66      1.00      0.79        19

    accuracy                           0.68        31
   macro avg       0.83      0.58      0.54        31
weighted avg       0.79      0.68      0.60        31

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    8.7s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:   36.0s
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:  1.4min
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:  2.4min
[Parallel(n_jobs=5)]: Done 1240 tasks      | elapsed:  3.8min
[Parallel(n_jobs=5)]: Done 1790 tasks      | elapsed:  5.5min
[Parallel(n_jobs=5)]: Done 2440 tasks      | elapsed:  7.5min
[Parallel(n_jobs=5)]: Done 3000 out of 3000 | elapsed:  9.2min finished



 0.716487447420756 {'nn__neurons2': 30, 'nn__neurons1': 25, 'nn__epochs': 6, 'nn__dropout2': 0.5, 'nn__dropout1': 0, 'nn__constraint': 5, 'nn__batch_size': 30}
[[ 4  7]
 [ 6 13]]
              precision    recall  f1-score   support

           0       0.40      0.36      0.38        11
           1       0.65      0.68      0.67        19

    accuracy                           0.57        30
   macro avg       0.53      0.52      0.52        30
weighted avg       0.56      0.57      0.56        30

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    7.1s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:   35.1s
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:  1.3min
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:  2.4min
[Parallel(n_jobs=5)]: Done 1240 tasks      | elapsed:  3.8min
[Parallel(n_jobs=5)]: Done 1790 tasks      | elapsed:  5.4min
[Parallel(n_jobs=5)]: Done 2440 tasks      | elapsed:  7.3min
[Parallel(n_jobs=5)]: Done 3000 out of 3000 | elapsed:  9.2min finished



 0.6724014480908712 {'nn__neurons2': 30, 'nn__neurons1': 25, 'nn__epochs': 11, 'nn__dropout2': 0.2, 'nn__dropout1': 0.3, 'nn__constraint': 5, 'nn__batch_size': 45}
[[ 4  7]
 [ 9 10]]
              precision    recall  f1-score   support

           0       0.31      0.36      0.33        11
           1       0.59      0.53      0.56        19

    accuracy                           0.47        30
   macro avg       0.45      0.44      0.44        30
weighted avg       0.49      0.47      0.47        30


Mean scores:
  f1 : 0.6742039295392954
  sensitivity : 0.7276315789473684
  specificity : 0.3143939393939394
  PPV : 0.6400185936443543
  NPV : 0.5269230769230768


In [29]:
# PROCESS 2
batch_size = [30, 37, 45]
epochs = [6, 7]
neurons1 = [20, 25]
neurons2 = [20, 30, 40]
dropout1 = [0, 0.3]
dropout2 = [0.2, 0.5]
constraint = [2, 3, 5]
np.random.seed(12)
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
cv_outer = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
outer_results = list()
for train_ix, test_ix in cv_outer.split(X, y):
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=3, verbose=1)
    clf.fit(X_train, y_train)
    y_pred = (clf.predict(X_test) > 0.5).astype(int)
    print("\n", clf.best_score_, clf.best_params_)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    outer_results.append({'f1': f1_score(y_test, y_pred), 'sensitivity': recall_score(y_test, y_pred),
                          'specificity': recall_score(y_test, y_pred, pos_label=0), 'PPV': precision_score(y_test, y_pred),
                          'NPV': precision_score(y_test, y_pred, pos_label=0)})
print("\nMean scores:")
for score in ['f1', 'sensitivity', 'specificity', 'PPV', 'NPV']:
    print(" ", score, ":", np.array([dict[score] for dict in outer_results]).mean())

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   12.1s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   43.7s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.6min
[Parallel(n_jobs=3)]: Done 1296 out of 1296 | elapsed:  4.8min finished



 0.7792114814122518 {'nn__batch_size': 45, 'nn__constraint': 5, 'nn__dropout1': 0.3, 'nn__dropout2': 0.2, 'nn__epochs': 6, 'nn__neurons1': 20, 'nn__neurons2': 40}
[[ 8  3]
 [ 9 11]]
              precision    recall  f1-score   support

           0       0.47      0.73      0.57        11
           1       0.79      0.55      0.65        20

    accuracy                           0.61        31
   macro avg       0.63      0.64      0.61        31
weighted avg       0.67      0.61      0.62        31

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    8.4s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   40.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.6min
[Parallel(n_jobs=3)]: Done 1296 out of 1296 | elapsed:  4.8min finished



 0.767025093237559 {'nn__batch_size': 37, 'nn__constraint': 5, 'nn__dropout1': 0, 'nn__dropout2': 0.5, 'nn__epochs': 7, 'nn__neurons1': 25, 'nn__neurons2': 20}
[[ 2 10]
 [ 1 18]]
              precision    recall  f1-score   support

           0       0.67      0.17      0.27        12
           1       0.64      0.95      0.77        19

    accuracy                           0.65        31
   macro avg       0.65      0.56      0.52        31
weighted avg       0.65      0.65      0.57        31

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   11.7s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   44.7s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.7min
[Parallel(n_jobs=3)]: Done 1296 out of 1296 | elapsed:  4.9min finished



 0.7483870983123779 {'nn__batch_size': 37, 'nn__constraint': 2, 'nn__dropout1': 0, 'nn__dropout2': 0.5, 'nn__epochs': 6, 'nn__neurons1': 20, 'nn__neurons2': 20}
[[ 2  9]
 [ 3 16]]
              precision    recall  f1-score   support

           0       0.40      0.18      0.25        11
           1       0.64      0.84      0.73        19

    accuracy                           0.60        30
   macro avg       0.52      0.51      0.49        30
weighted avg       0.55      0.60      0.55        30

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    8.6s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   41.7s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.6min
[Parallel(n_jobs=3)]: Done 1296 out of 1296 | elapsed:  4.8min finished



 0.7931899627049764 {'nn__batch_size': 45, 'nn__constraint': 5, 'nn__dropout1': 0.3, 'nn__dropout2': 0.5, 'nn__epochs': 6, 'nn__neurons1': 20, 'nn__neurons2': 30}
[[ 5  6]
 [ 4 15]]
              precision    recall  f1-score   support

           0       0.56      0.45      0.50        11
           1       0.71      0.79      0.75        19

    accuracy                           0.67        30
   macro avg       0.63      0.62      0.62        30
weighted avg       0.66      0.67      0.66        30


Mean scores:
  f1 : 0.7225722494026624
  sensitivity : 0.7822368421052632
  specificity : 0.38257575757575757
  PPV : 0.6957142857142857
  NPV : 0.5232026143790849


In [30]:
# PROCESS 3
np.random.seed(12)
batch_size = [45]
epochs = [6, 7, 8, 9, 10, 11, 12]
neurons1 = [20, 25, 30, 35, 40]
neurons2 = [20, 30, 40]
dropout1 = [0.3]
dropout2 = [0.5]
constraint = [5]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
cv_outer = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
outer_results = list()
for train_ix, test_ix in cv_outer.split(X, y):
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=3, verbose=1)
    clf.fit(X_train, y_train)
    y_pred = (clf.predict(X_test) > 0.5).astype(int)
    print("\n", clf.best_score_, clf.best_params_)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    outer_results.append({'f1': f1_score(y_test, y_pred), 'sensitivity': recall_score(y_test, y_pred),
                          'specificity': recall_score(y_test, y_pred, pos_label=0), 'PPV': precision_score(y_test, y_pred),
                          'NPV': precision_score(y_test, y_pred, pos_label=0)})
print("\nMean scores:")
for score in ['f1', 'sensitivity', 'specificity', 'PPV', 'NPV']:
    print(" ", score, ":", np.array([dict[score] for dict in outer_results]).mean())

Fitting 3 folds for each of 105 candidates, totalling 315 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   12.2s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   45.8s
[Parallel(n_jobs=3)]: Done 315 out of 315 | elapsed:  1.2min finished



 0.7573476632436117 {'nn__batch_size': 45, 'nn__constraint': 5, 'nn__dropout1': 0.3, 'nn__dropout2': 0.5, 'nn__epochs': 7, 'nn__neurons1': 30, 'nn__neurons2': 20}
[[11  0]
 [17  3]]
              precision    recall  f1-score   support

           0       0.39      1.00      0.56        11
           1       1.00      0.15      0.26        20

    accuracy                           0.45        31
   macro avg       0.70      0.57      0.41        31
weighted avg       0.78      0.45      0.37        31

Fitting 3 folds for each of 105 candidates, totalling 315 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    8.6s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   41.8s
[Parallel(n_jobs=3)]: Done 315 out of 315 | elapsed:  1.2min finished



 0.7666666706403097 {'nn__batch_size': 45, 'nn__constraint': 5, 'nn__dropout1': 0.3, 'nn__dropout2': 0.5, 'nn__epochs': 6, 'nn__neurons1': 35, 'nn__neurons2': 40}
[[ 6  6]
 [ 4 15]]
              precision    recall  f1-score   support

           0       0.60      0.50      0.55        12
           1       0.71      0.79      0.75        19

    accuracy                           0.68        31
   macro avg       0.66      0.64      0.65        31
weighted avg       0.67      0.68      0.67        31

Fitting 3 folds for each of 105 candidates, totalling 315 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    9.9s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   43.2s
[Parallel(n_jobs=3)]: Done 315 out of 315 | elapsed:  1.2min finished



 0.6645161310831705 {'nn__batch_size': 45, 'nn__constraint': 5, 'nn__dropout1': 0.3, 'nn__dropout2': 0.5, 'nn__epochs': 7, 'nn__neurons1': 20, 'nn__neurons2': 40}
[[ 4  7]
 [ 5 14]]
              precision    recall  f1-score   support

           0       0.44      0.36      0.40        11
           1       0.67      0.74      0.70        19

    accuracy                           0.60        30
   macro avg       0.56      0.55      0.55        30
weighted avg       0.59      0.60      0.59        30

Fitting 3 folds for each of 105 candidates, totalling 315 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   11.4s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   45.1s
[Parallel(n_jobs=3)]: Done 315 out of 315 | elapsed:  1.3min finished



 0.7172042926152548 {'nn__batch_size': 45, 'nn__constraint': 5, 'nn__dropout1': 0.3, 'nn__dropout2': 0.5, 'nn__epochs': 8, 'nn__neurons1': 25, 'nn__neurons2': 30}
[[ 2  9]
 [ 4 15]]
              precision    recall  f1-score   support

           0       0.33      0.18      0.24        11
           1       0.62      0.79      0.70        19

    accuracy                           0.57        30
   macro avg       0.48      0.49      0.47        30
weighted avg       0.52      0.57      0.53        30


Mean scores:
  f1 : 0.6021359959555106
  sensitivity : 0.6164473684210526
  specificity : 0.5113636363636364
  PPV : 0.7514880952380952
  NPV : 0.44265873015873014


In [10]:
# PROCESS 4
np.random.seed(12)
batch_size = [30, 35, 40, 45]
epochs = [6, 7, 8, 9]
neurons1 = [20, 25, 30, 35]
neurons2 = [20, 30, 40, 50]
dropout1 = [0, 0.3]
dropout2 = [0.2, 0.5]
constraint = [2, 3, 5]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
cv_outer = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
outer_results = list()
for train_ix, test_ix in cv_outer.split(X, y):
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=3, verbose=1)
    clf.fit(X_train, y_train)
    y_pred = (clf.predict(X_test) > 0.5).astype(int)
    print("\n", clf.best_score_, clf.best_params_)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    outer_results.append({'f1': f1_score(y_test, y_pred), 'sensitivity': recall_score(y_test, y_pred),
                          'specificity': recall_score(y_test, y_pred, pos_label=0), 'PPV': precision_score(y_test, y_pred),
                          'NPV': precision_score(y_test, y_pred, pos_label=0)})
print("\nMean scores:")
for score in ['f1', 'sensitivity', 'specificity', 'PPV', 'NPV']:
    print(" ", score, ":", np.array([dict[score] for dict in outer_results]).mean())

Fitting 3 folds for each of 3072 candidates, totalling 9216 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   12.7s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   48.2s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.7min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.7min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.1min
[Parallel(n_jobs=3)]: Done 3194 tasks      | elapsed: 11.8min
[Parallel(n_jobs=3)]: Done 4044 tasks      | elapsed: 15.0min
[Parallel(n_jobs=3)]: Done 4994 tasks      | elapsed: 18.5min
[Parallel(n_jobs=3)]: Done 6044 tasks      | elapsed: 22.4min
[Parallel(n_jobs=3)]: Done 7194 tasks      | elapsed: 26.6min
[Parallel(n_jobs=3)]: Done 8444 tasks      | elapsed: 31.3min
[Parallel(n_jobs=3)]: Done 9216 out of 9216 | elapsed: 34.1min finished



 0.7777777910232544 {'nn__batch_size': 35, 'nn__constraint': 3, 'nn__dropout1': 0.3, 'nn__dropout2': 0.5, 'nn__epochs': 6, 'nn__neurons1': 20, 'nn__neurons2': 30}
[[ 2  9]
 [ 0 20]]
              precision    recall  f1-score   support

           0       1.00      0.18      0.31        11
           1       0.69      1.00      0.82        20

    accuracy                           0.71        31
   macro avg       0.84      0.59      0.56        31
weighted avg       0.80      0.71      0.64        31

Fitting 3 folds for each of 3072 candidates, totalling 9216 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   11.6s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   43.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.5min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.5min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  8.8min
[Parallel(n_jobs=3)]: Done 3194 tasks      | elapsed: 11.6min
[Parallel(n_jobs=3)]: Done 4044 tasks      | elapsed: 14.8min
[Parallel(n_jobs=3)]: Done 4994 tasks      | elapsed: 18.4min
[Parallel(n_jobs=3)]: Done 6044 tasks      | elapsed: 22.3min
[Parallel(n_jobs=3)]: Done 7194 tasks      | elapsed: 26.5min
[Parallel(n_jobs=3)]: Done 8444 tasks      | elapsed: 31.3min
[Parallel(n_jobs=3)]: Done 9216 out of 9216 | elapsed: 34.3min finished



 0.7781361937522888 {'nn__batch_size': 30, 'nn__constraint': 3, 'nn__dropout1': 0, 'nn__dropout2': 0.5, 'nn__epochs': 7, 'nn__neurons1': 25, 'nn__neurons2': 50}
[[ 3  9]
 [ 1 18]]
              precision    recall  f1-score   support

           0       0.75      0.25      0.38        12
           1       0.67      0.95      0.78        19

    accuracy                           0.68        31
   macro avg       0.71      0.60      0.58        31
weighted avg       0.70      0.68      0.62        31

Fitting 3 folds for each of 3072 candidates, totalling 9216 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    8.5s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   43.6s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.6min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  6.7min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.2min
[Parallel(n_jobs=3)]: Done 3194 tasks      | elapsed: 12.0min
[Parallel(n_jobs=3)]: Done 4044 tasks      | elapsed: 15.2min
[Parallel(n_jobs=3)]: Done 4994 tasks      | elapsed: 18.7min
[Parallel(n_jobs=3)]: Done 6044 tasks      | elapsed: 22.7min
[Parallel(n_jobs=3)]: Done 7194 tasks      | elapsed: 27.0min
[Parallel(n_jobs=3)]: Done 8444 tasks      | elapsed: 31.7min
[Parallel(n_jobs=3)]: Done 9216 out of 9216 | elapsed: 34.6min finished



 0.7720430095990499 {'nn__batch_size': 45, 'nn__constraint': 5, 'nn__dropout1': 0.3, 'nn__dropout2': 0.2, 'nn__epochs': 8, 'nn__neurons1': 20, 'nn__neurons2': 20}
[[ 7  4]
 [15  4]]
              precision    recall  f1-score   support

           0       0.32      0.64      0.42        11
           1       0.50      0.21      0.30        19

    accuracy                           0.37        30
   macro avg       0.41      0.42      0.36        30
weighted avg       0.43      0.37      0.34        30

Fitting 3 folds for each of 3072 candidates, totalling 9216 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   12.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   46.2s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.1min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.9min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  7.0min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed:  9.4min
[Parallel(n_jobs=3)]: Done 3194 tasks      | elapsed: 12.2min
[Parallel(n_jobs=3)]: Done 4044 tasks      | elapsed: 15.3min
[Parallel(n_jobs=3)]: Done 4994 tasks      | elapsed: 18.9min
[Parallel(n_jobs=3)]: Done 6044 tasks      | elapsed: 22.8min
[Parallel(n_jobs=3)]: Done 7194 tasks      | elapsed: 27.2min
[Parallel(n_jobs=3)]: Done 8444 tasks      | elapsed: 31.9min
[Parallel(n_jobs=3)]: Done 9216 out of 9216 | elapsed: 34.9min finished



 0.782437264919281 {'nn__batch_size': 45, 'nn__constraint': 3, 'nn__dropout1': 0, 'nn__dropout2': 0.5, 'nn__epochs': 9, 'nn__neurons1': 20, 'nn__neurons2': 20}
[[ 1 10]
 [ 3 16]]
              precision    recall  f1-score   support

           0       0.25      0.09      0.13        11
           1       0.62      0.84      0.71        19

    accuracy                           0.57        30
   macro avg       0.43      0.47      0.42        30
weighted avg       0.48      0.57      0.50        30


Mean scores:
  f1 : 0.6515856584179566
  sensitivity : 0.75
  specificity : 0.28977272727272724
  PPV : 0.6179266136162688
  NPV : 0.5795454545454546


> None of these results are suggesting a very strong predictor, but process 2 is the best of these so this is applied to the full dataset using 3-fold cross validation and the resulting model is saved.

In [11]:
np.random.seed(12)
batch_size = [30, 37, 45]
epochs = [6, 7]
neurons1 = [20, 25]
neurons2 = [20, 30, 40]
dropout1 = [0, 0.3]
dropout2 = [0.2, 0.5]
constraint = [2, 3, 5]
param_grid = dict(nn__batch_size=batch_size, nn__epochs=epochs, nn__neurons1=neurons1, nn__neurons2=neurons2,
                 nn__dropout1=dropout1, nn__dropout2=dropout2, nn__constraint=constraint)
clf = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, cv=3, n_jobs=3, verbose=1, n_iter=1000, random_state=1)
clf.fit(X, y)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   12.4s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   46.6s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  4.7min
[Parallel(n_jobs=3)]: Done 1296 out of 1296 | elapsed:  5.0min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('scaler',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('nn',
                                              <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001D3316534C8>)],
                                      verbose=False),
                   iid='deprecated', n_iter=1000, n_jobs=3,
                   param_distributions={'nn__batch_size': [30, 37, 45],
                                        'nn__constraint': [2, 3, 5],
                                        'nn__dropout1': [0, 0.3],
                                        'nn__dropout2': [0.2, 0.5],
                                        '

In [12]:
print(clf.best_score_, clf.best_params_)
fitted_dl_model_no_ap_ncv = clf.best_estimator_

0.6959349711736044 {'nn__neurons2': 20, 'nn__neurons1': 20, 'nn__epochs': 6, 'nn__dropout2': 0.5, 'nn__dropout1': 0.3, 'nn__constraint': 5, 'nn__batch_size': 30}


In [13]:
fitted_dl_model_no_ap_ncv.named_steps['nn'].model.save('eeg_dl_model_no_ap_ncv.h5')
fitted_dl_model_no_ap_ncv.named_steps['nn'].model = None
joblib.dump(fitted_dl_model_no_ap_ncv, 'eeg_dl_model_no_ap_ncv.pkl')

['eeg_dl_model_no_ap_ncv.pkl']