In [3]:
from sklearn.metrics import accuracy_score, f1_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import numpy as np

%load_ext autoreload
%autoreload 2
# Bring utils package onto the path
import sys, os
sys.path.append(os.path.abspath(os.path.join('..')))
from utils import DataLoader, RunModel

In [1]:
def train_dataset(filename, label):
    data_loader = DataLoader()
    data_loader.load_data(f"data/{filename}.csv")
    X_train, X_test, y_train, y_test = data_loader.get_data_split(label=label)
    
    rf = RunModel()
    rf.run_model(X_train, X_test, y_train)
    rf.model_performance(X_test , y_test)
    return rf, X_train, X_test, y_train, y_test

### Single Modalities

In [4]:
print("MRI Data - MCI TO NC")
mri_rf, mri_X_train, mri_X_test, mri_y_train, mri_y_test = train_dataset('sMRI_MCI_NC', 'MCI')
# mri_rf.save_model('models/sMRI_RF', mri_X_train, mri_X_test, mri_y_train, mri_y_test)
print(f"Accuracy: {mri_rf.accuracy}. F1-Score: {mri_rf.f1}. AUC: {mri_rf.auc}")

MRI Data - MCI TO NC
Accuracy: 0.7466367713004485. F1-Score: 0.7167919799498748. AUC: 0.816115452787625


In [62]:
mri_rf.save_obj('models/mri_rf', mri_X_train, mri_X_test, mri_y_train, mri_y_test)

Saved.


In [5]:
print("Neuro Data - MCI TO NC")
neuro_rf, neuro_X_train, neuro_X_test, neuro_y_train, neuro_y_test = train_dataset('Neuro_MCI_NC', 'MCI')
# neuro_rf.save_model('models/neuro_RF', neuro_X_train, neuro_X_test, neuro_y_train, neuro_y_test)
print(f"Accuracy: {neuro_rf.accuracy}. F1-Score: {neuro_rf.f1}. AUC: {neuro_rf.auc}")

Neuro Data - MCI TO NC
Accuracy: 0.5044843049327354. F1-Score: 0.4609756097560976. AUC: 0.5150509587495972


In [60]:
neuro_rf.save_obj('models/neuro_rf', neuro_X_train, neuro_X_test, neuro_y_train, neuro_y_test)

Saved.


### Multimodal Dataset (Early Fusion)

In [6]:
print("Multimodal Data - MCI TO NC")
multimodal_rf, X_train, X_test, y_train, y_test = train_dataset('ADNI_MCI_NC', 'MCI')
print(f'Training data: {X_train.shape}')
print(f'Testing data: {X_test.shape}')
# multimodal_rf.save_model('models/ADNI_MCI_NC', X_train, X_test, y_train, y_test)
print(f"Accuracy: {multimodal_rf.accuracy}. F1-Score: {multimodal_rf.f1}. AUC: {multimodal_rf.auc}")

Multimodal Data - MCI TO NC
Training data: (2752, 40)
Testing data: (688, 40)
Accuracy: 0.8561046511627907. F1-Score: 0.8805790108564536. AUC: 0.9396420343788765


In [63]:
multimodal_rf.save_obj('models/fusion_rf', X_train, X_test, y_train, y_test)

Saved.


In [10]:
X_train.head()

Unnamed: 0,AD,Age,MMSE,ADAS11,ADAS13,TotalICVolume,3rdVentricle,4thVentricle,RightAccumbensArea,LeftAccumbensArea,...,LeftPutamen,RightThalamusProper,LeftThalamusProper,RightVentralDC,LeftVentralDC,LeftBasalForebrain,RightBasalForebrain,Right Cortex,Left Cortex,Cerebellar Vermis
1344,0,69.0,28.0,13.67,20.935679,1214750.869,3113.418157,2547.972993,189.373547,192.8015,...,2614.525463,6523.396124,7920.579936,4684.340702,5112.333278,270.641135,198.821322,233575.0881,217262.2083,9255.47532
27,0,76.817043,26.0,5.0,10.0,1214750.869,2349.333922,2110.129014,325.855666,237.984475,...,3988.375719,6496.365958,6500.027257,4198.29023,4593.710589,346.603031,380.77516,265928.7344,259863.1813,12406.92398
3046,0,71.088227,26.0,8.0,13.0,1214750.869,2736.065899,3091.119226,349.381549,325.56008,...,3864.7498,6552.038404,6614.427966,4309.417226,4434.196351,416.308534,497.982143,246751.8535,244934.6157,8782.181669
749,0,71.302053,28.0,4.0,5.0,1214750.869,1743.45009,2215.042768,270.051916,299.361987,...,3310.767529,5381.982244,4590.428861,4252.954709,4532.716153,323.862664,265.151781,244619.4867,254207.4187,8962.710946
507,0,83.4,27.0,8.0,16.0,1214750.869,2519.358378,2855.463364,235.21633,293.32972,...,3551.01396,5453.227185,5284.222014,4026.591083,4252.661699,363.542128,316.670344,243523.4958,241255.5493,8325.07662


#### Hyperparameter Tuning - Grid Search

In [11]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
# Number of features to consider at each split
max_features = ['auto', 'sqrt']
# Max depth in tree
max_depth = [None, 2, 4]
# Min num samples to split a node
min_samples_split = [2, 5]
# Min num samples at each leaf node
min_samples_leaf = [1, 2]
# Choice to boostrap for sample selection
bootstrap = [True, False]

In [12]:
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap
            }
print(param_grid)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_features': ['auto', 'sqrt'], 'max_depth': [None, 2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [50]:
import pickle
_, X_train, X_test, y_train, y_test = pickle.load(open('models/saved/ADNI_MCI_NC', 'rb'))

In [54]:
from sklearn.ensemble import RandomForestClassifier
rf_gr_model = RandomForestClassifier()
rf_gr = GridSearchCV(estimator=rf_gr_model, param_grid=param_grid, cv=10, verbose=2, n_jobs=4)
rf_gr.fit(X_train, y_train)

Fitting 10 folds for each of 48 candidates, totalling 480 fits
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90; total time=   0.9s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90; total time=   0.9s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90; total time=   0.9s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90; total time=   0.9s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90; total time=   0.9s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90; total time=   0.9s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=9

GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [None, 2, 4],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5], 'n_estimators': [90]},
             verbose=2)

In [56]:
rf_gr_pred = rf_gr.predict(X_test)
accuracy = accuracy_score(y_test, rf_gr_pred)
f1 = f1_score(y_test, rf_gr_pred)
rf_gr_probs = rf_gr.predict_proba(X_test)
rf_gr_probs = rf_gr_probs[:, 1]
rf_gr_auc = roc_auc_score(y_test, rf_gr_probs)
print(f"Accuracy: {accuracy}. F1-Score: {f1}. AUC: {rf_gr_auc}")

Accuracy: 0.8677325581395349. F1-Score: 0.8899637243047159. AUC: 0.9449760765550239


In [57]:
rf_gr.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 90}

In [58]:
pickle.dump([rf_gr, X_train, X_test, y_train, y_test], open('models/gridsearch_rf', 'wb'))
print("Saved.")

Saved.
