## Neural Networks

In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the pre-split train and test tables from disk.
df_train = pd.read_csv("data/ADNIMERGE_train.csv")
df_test = pd.read_csv("data/ADNIMERGE_test.csv")

# 'RID' and 'DX_bl' are excluded from the feature matrices;
# 'DX_bl' alone is kept as the label vector.
X_train, y_train = (df_train.drop(columns=['RID', 'DX_bl']).copy(),
                    df_train['DX_bl'].copy())
X_test, y_test = (df_test.drop(columns=['RID', 'DX_bl']).copy(),
                  df_test['DX_bl'].copy())

In [3]:
# function to help compare the accuracy of models
def score(model, X_train, y_train, X_test, y_test):
    """Summarize a fitted classifier's accuracy overall and per class.

    Parameters
    ----------
    model : object exposing ``score(X, y)`` (e.g. a fitted sklearn classifier)
    X_train, y_train : training features and labels
    X_test, y_test : test features and labels; labels 0, 1 and 2 are
        reported as "CN", "CI" and "AD" respectively.

    Returns
    -------
    pandas.Series
        Train accuracy, test accuracy, and test accuracy restricted to each
        of the three classes. A class with no test samples yields NaN
        instead of calling ``model.score`` on an empty subset.
    """
    def _class_accuracy(label):
        # Boolean mask selecting the test rows whose true label is `label`.
        mask = (y_test == label)
        if not mask.any():
            # Estimators typically raise on empty input; report "undefined".
            return float('nan')
        return model.score(X_test[mask], y_test[mask])

    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    test_class0, test_class1, test_class2 = (
        _class_accuracy(label) for label in (0, 1, 2))
    return pd.Series([train_acc, test_acc, test_class0, test_class1, test_class2],
                    index = ['Train accuracy', 'Test accuracy', 
                             "Test accuracy CN", "Test accuracy CI", "Test accuracy AD"])

In [92]:
# Columns to z-score: every column whose name does not start with 'PT',
# plus 'PTEDUCAT'. (The other 'PT*' columns look like 0/1 indicators in the
# displayed head of the frame -- TODO confirm -- and are left unscaled.)
cols_standardize = [
    c for c in X_train.columns 
    if (not c.startswith('PT')) or (c=='PTEDUCAT')]

# Work on copies so the raw feature frames stay available to later cells.
X_train_std = X_train.copy()
X_test_std = X_test.copy()
for c in cols_standardize:
    # Statistics come from the training split only and are reused for the
    # test split, so no test information leaks into the scaling.
    col_mean = np.mean(X_train[c])
    col_sd = np.std(X_train[c])
    # Skip (near-)constant columns to avoid dividing by ~0. The tolerance is
    # relative to |mean|: without abs(), a negative mean makes the threshold
    # negative, so a zero-variance column would pass and be divided by zero.
    if col_sd > (1e-10)*abs(col_mean):
        X_train_std[c] = (X_train[c]-col_mean)/col_sd
        X_test_std[c] = (X_test[c]-col_mean)/col_sd

In [97]:
# Sanity check on the standardized training frame: dimensions, then first rows.
print(X_train_std.shape)
X_train_std.head(n=5)

(621, 74)


Unnamed: 0,PTGENDER,PTEDUCAT,PTRACCAT_Asian,PTRACCAT_Black,PTRACCAT_Hawaiian/Other_PI,PTRACCAT_More_than_one,PTRACCAT_Unknown,PTRACCAT_White,PTETHCAT_Not_Hisp/Latino,PTMARRY_Married,...,WholeBrain,WholeBrain_slope,Entorhinal,Entorhinal_slope,Fusiform,Fusiform_slope,MidTemp,MidTemp_slope,ICV,ICV_slope
0,0,-2.852257,0,0,0,0,0,1,1,0,...,-1.7615,-0.567555,-0.820814,-1.269796,-1.426968,0.156847,-2.102069,-0.192827,-1.574482,0.093937
1,1,1.376909,0,0,0,0,0,1,1,1,...,-0.134464,-0.028641,-0.070387,0.188014,0.721399,-0.067438,0.019784,0.506511,-0.489132,-0.265646
2,0,0.60797,0,0,0,0,0,1,1,1,...,-1.300396,0.31072,0.456478,-0.56084,0.292776,0.016824,-0.650452,0.22414,-1.239633,-0.014198
3,0,-0.16097,0,0,0,0,0,1,1,0,...,-9.4e-05,-0.003749,0.006635,-0.003683,0.010325,0.015345,0.018697,0.004091,-0.005136,0.004314
4,1,-0.16097,0,0,0,0,0,1,1,0,...,-9.4e-05,-0.003749,0.006635,-0.003683,0.010325,0.015345,0.018697,0.004091,1.652198,-0.047345


In [103]:
# find the best parameters
# Search grid: L2 penalty strength and hidden-layer layout. A bare int means a
# single hidden layer of that width (written without parentheses, since
# `(50)` is just the int 50, not a tuple); tuples give one width per layer.
parameters = {'alpha': [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2],
              'hidden_layer_sizes': [50, 100, 300,
                                     (50, 10), (50, 25), (50, 50), 
                                     (100, 10), (100, 25), (100, 50)]}
mlp = MLPClassifier(solver = 'lbfgs', activation='logistic', random_state=9001)
mlp_cv = GridSearchCV(mlp, parameters)
mlp_cv.fit(X_train_std, y_train)

# Use GridSearchCV's own selection instead of np.argmax over
# cv_results_['mean_test_score']: argmax would pick a NaN entry if any
# candidate's fit failed, whereas best_index_ ranks such candidates last.
# NOTE: despite its name, `best_score` holds the *index* of the best candidate.
best_score = mlp_cv.best_index_
result = mlp_cv.best_params_
a = result['alpha']
hidden_layer = result['hidden_layer_sizes']

# With the default refit=True, the search has already refit a clone of `mlp`
# with the best parameters on the full training set, so no manual refit is needed.
mlp = mlp_cv.best_estimator_

# Compute all accuracies once and reuse them for the printout below.
nn_score = score(mlp, X_train_std, y_train, X_test_std, y_test)

print("Optimal parameters")
print("L2 penalty parameter: ", a)
print("Number of neurons in the hidden layer: ", hidden_layer)
print('\n-----------------\n')
print("Training accuracy: ", nn_score['Train accuracy'])
print("Test accuracy: ", nn_score['Test accuracy'])

Optimal parameters
L2 penalty parameter:  10.0
Number of neurons in the hidden layer:  100

-----------------

Training accuracy:  0.82769726248
Test accuracy:  0.777777777778


In [104]:
# Baseline for comparison: a random forest trained on the unstandardized
# features, scored with the same helper as the neural network.
rf_best = RandomForestClassifier(
    n_estimators=32,
    max_depth=6,
    random_state=9001,
)
rf_best.fit(X_train, y_train)
rf_score = score(rf_best,
                 X_train, y_train,
                 X_test, y_test)

In [105]:
# Side-by-side comparison table: one column per model, one row per metric.
score_df = pd.DataFrame(
    data={
        "Neural Network": nn_score,
        "Random Forest": rf_score,
    }
)
score_df

Unnamed: 0,Neural Network,Random Forest
Train accuracy,0.827697,0.921095
Test accuracy,0.777778,0.783951
Test accuracy CN,0.547619,0.5
Test accuracy CI,0.849462,0.892473
Test accuracy AD,0.888889,0.851852
