## Neural Networks

In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# load data
df_train = pd.read_csv("data/ADNIMERGE_train.csv")
df_test = pd.read_csv("data/ADNIMERGE_test.csv")
X_train = df_train.drop(['RID', 'DX_bl'], axis=1).copy()
y_train = df_train['DX_bl'].copy()
X_test = df_test.drop(['RID', 'DX_bl'], axis=1).copy()
y_test = df_test['DX_bl'].copy()

In [3]:
# function to help compare the accuracy of models
def score(model, X_train, y_train, X_test, y_test):
    train_acc = model.score(X_train,y_train)
    test_acc = model.score(X_test,y_test)
    test_class0 = model.score(X_test[y_test==0], y_test[y_test==0])
    test_class1 = model.score(X_test[y_test==1], y_test[y_test==1])
    test_class2 = model.score(X_test[y_test==2], y_test[y_test==2])
    return pd.Series([train_acc, test_acc, test_class0, test_class1, test_class2],
                    index = ['Train accuracy', 'Test accuracy', 
                             "Test accuracy CN", "Test accuracy CI", "Test accuracy AD"])

In [4]:
# normalization
cols_continuous = ['APOE4', 'CSF_ABETA', 'CSF_TAU',  'CSF_PTAU', 
                   'FDG', 'FDG_slope', 'AV45', 'AV45_slope',
                   'ADAS13', 'ADAS13_slope', 'MMSE', 'MMSE_slope',
                   'RAVLT_immediate', 'RAVLT_immediate_slope', 'RAVLT_learning',
                   'RAVLT_learning_slope', 'RAVLT_forgetting', 'RAVLT_forgetting_slope',
                   'RAVLT_perc_forgetting', 'RAVLT_perc_forgetting_slope', 'MOCA',
                   'MOCA_slope', 'EcogPtMem', 'EcogPtMem_slope', 'EcogPtLang',
                   'EcogPtLang_slope', 'EcogPtVisspat', 'EcogPtVisspat_slope',
                   'EcogPtPlan', 'EcogPtPlan_slope', 'EcogPtOrgan', 'EcogPtOrgan_slope',
                   'EcogPtDivatt', 'EcogPtDivatt_slope', 'EcogSPMem', 'EcogSPMem_slope',
                   'EcogSPLang', 'EcogSPLang_slope', 'EcogSPVisspat',
                   'EcogSPVisspat_slope', 'EcogSPPlan', 'EcogSPPlan_slope', 'EcogSPOrgan',
                   'EcogSPOrgan_slope', 'EcogSPDivatt', 'EcogSPDivatt_slope', 'FAQ',
                   'FAQ_slope', 'Ventricles', 'Ventricles_slope', 'Hippocampus',
                   'Hippocampus_slope', 'WholeBrain', 'WholeBrain_slope', 'Entorhinal',
                   'Entorhinal_slope', 'Fusiform', 'Fusiform_slope', 'MidTemp',
                   'MidTemp_slope', 'ICV', 'ICV_slope']

X_train_std = X_train.copy()
X_test_std = X_test.copy()
for i in cols_continuous:
    col_mean = np.mean(X_train_std[i])
    col_sd = np.std(X_train_std[i])
    if col_sd < 1e-10*col_mean:
        X_train_std.loc[i] = (X_train_std[i]-col_mean)/col_sd
        X_test_std.loc[i] = (X_test_std[i]-col_mean)/col_sd

In [5]:
# find the best parameters
parameters = {'alpha': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
              'hidden_layer_sizes': [(50, 3), (50, 10), (50, 25), (50, 50),
                                     (100, 3), (100, 10), (100, 25), (100, 50),
                                     (300, 3), (300, 10), (300, 25), (300, 50),
                                     (500, 3), (500, 10), (500, 25), (500, 50)]}
mlp = MLPClassifier(solver = 'lbfgs', activation='logistic', random_state=9001)
mlp_cv = GridSearchCV(mlp, parameters)
mlp_cv.fit(X_train_std, y_train)
best_score = np.argmax(mlp_cv.cv_results_['mean_test_score'])
result = mlp_cv.cv_results_['params'][best_score]
a = result['alpha']
hidden_layer = result['hidden_layer_sizes']
mlp = MLPClassifier(solver = 'lbfgs', activation='logistic', random_state=9001,
                    alpha = a, hidden_layer_sizes=hidden_layer)
mlp.fit(X_train_std, y_train)
print("Optimal parameters")
print("L2 penalty parameter: ", a)
print("Number of neurons in the hidden layer: ", hidden_layer)
print('\n-----------------\n')
print("Training accuracy: ", mlp.score(X_train_std, y_train))
print("Test accuracy: ", mlp.score(X_test_std, y_test))
nn_score = score(mlp, X_train_std, y_train, X_test_std, y_test)

Optimal parameters
L2 penalty parameter:  0.001
Number of neurons in the hidden layer:  (100, 25)

-----------------

Training accuracy:  0.573268921095
Test accuracy:  0.567901234568


In [6]:
# random forest to compare with
rf_best = RandomForestClassifier(n_estimators=32, max_depth=6, random_state=9001)
rf_best.fit(X_train, y_train)
rf_score = score(rf_best, X_train, y_train, X_test, y_test)

In [7]:
score_df = pd.DataFrame({"Neural Network": nn_score,
                         "Random Forest": rf_score})
score_df

Unnamed: 0,Neural Network,Random Forest
Train accuracy,0.573269,0.921095
Test accuracy,0.567901,0.783951
Test accuracy CN,0.0,0.5
Test accuracy CI,0.989247,0.892473
Test accuracy AD,0.0,0.851852
