In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pickle

%load_ext autoreload
%autoreload 2
# Bring utils package onto the path
import sys, os
sys.path.append(os.path.abspath(os.path.join('..')))
from utils import DataLoader, RunModel

In [8]:
def train_dataset(filename, label):
    data_loader = DataLoader()
    data_loader.load_data(f"data/{filename}.csv")
    X_train, X_test, y_train, y_test = data_loader.get_data_split(label=label)
    print(f'Training data: {X_train.shape}')
    print(f'Testing data: {X_test.shape}')
    
    rf = RunModel()
    rf.run_model(X_train, X_test, y_train)
    rf.model_performance(X_test , y_test)
    return rf, X_train, X_test, y_train, y_test

### Single Modalities

In [18]:
print("MRI Data - AD TO NC")
mri_rf, mri_X_train, mri_X_test, mri_y_train, mri_y_test = train_dataset('sMRI_AD_NC', 'AD')
mri_rf.save_obj('models/mri_rf', mri_X_train, mri_X_test, mri_y_train, mri_y_test)
print(f"Accuracy: {mri_rf.accuracy}. F1-Score: {mri_rf.f1}. AUC: {mri_rf.auc}")

MRI Data - AD TO NC
Training data: (3567, 35)
Testing data: (892, 35)
Saved.
Accuracy: 0.8452914798206278. F1-Score: 0.5576923076923077. AUC: 0.883378317588844


In [19]:
print("Neuro Data - AD TO NC")
neuro_rf, neuro_X_train, neuro_X_test, neuro_y_train, neuro_y_test = train_dataset('Neuro_AD_NC', 'AD')
neuro_rf.save_obj('models/neuro_rf', neuro_X_train, neuro_X_test, neuro_y_train, neuro_y_test)
print(f"Accuracy: {neuro_rf.accuracy}. F1-Score: {neuro_rf.f1}. AUC: {neuro_rf.auc}")

Neuro Data - AD TO NC
Training data: (3567, 3)
Testing data: (892, 3)
Saved.
Accuracy: 0.7399103139013453. F1-Score: 0.18309859154929578. AUC: 0.5266194331983807


### Multimodal Dataset (Early Fusion)

In [9]:
print("Multimodal Data - AD TO NC")
multimodal_rf, X_train, X_test, y_train, y_test = train_dataset('ADNI_AD_NC', 'AD')
# multimodal_rf.save_model('models/ADNI_AD_NC', X_train, X_test, y_train, y_test)
print(f"Accuracy: {multimodal_rf.accuracy}. F1-Score: {multimodal_rf.f1}. AUC: {multimodal_rf.auc}")

Multimodal Data - AD TO NC
Training data: (1892, 40)
Testing data: (474, 40)
Accuracy: 0.9641350210970464. F1-Score: 0.9594272076372315. AUC: 0.996947113226183


In [14]:
X_train.head()

Unnamed: 0,MCI,Age,MMSE,ADAS11,ADAS13,TotalICVolume,3rdVentricle,4thVentricle,RightAccumbensArea,LeftAccumbensArea,...,LeftPutamen,RightThalamusProper,LeftThalamusProper,RightVentralDC,LeftVentralDC,LeftBasalForebrain,RightBasalForebrain,Right Cortex,Left Cortex,Cerebellar Vermis
1195,0,74.307529,25.0,12.67,22.67,1214750.869,2214.65812,2238.553505,241.27995,203.745164,...,3685.070155,7298.454155,7468.04795,4572.794283,4643.317388,478.013433,436.989497,260663.6536,257481.8668,8818.877328
1119,0,90.634908,27.0,4.0,9.0,1214750.869,3874.994922,2410.256772,272.174796,368.337819,...,4137.34062,6801.228633,6755.325735,4506.89441,4832.57192,355.266133,344.322396,230960.2763,223433.8224,11251.27641
2145,0,79.1,22.0,33.0,45.0,1214750.869,1509.465577,2094.603084,277.02815,258.109154,...,3586.501023,6974.352592,7039.21772,4633.802565,4654.072917,274.325436,413.51519,241353.6809,224360.3688,10525.71834
849,0,80.3,28.0,4.67,11.67,1214750.869,3621.672065,4207.432032,32.264704,0.900791,...,577.898516,5120.015433,5839.174391,3909.352026,4201.290273,178.684223,385.292974,199395.0511,204500.1625,9469.526808
1484,0,71.602053,30.0,1.0,3.0,1214750.869,1507.53526,2710.121118,417.944046,463.161352,...,4712.615707,6596.086677,6904.147807,4377.813159,4508.311277,460.049624,358.432237,260492.6216,263398.0037,10297.19611


In [20]:
multimodal_rf.save_obj('models/fusion_rf', X_train, X_test, y_train, y_test)

Saved.


#### Hyperparameter Tuning - Grid Search

In [4]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
# Number of features to consider at each split
max_features = ['auto', 'sqrt']
# Max depth in tree
max_depth = [None, 2, 4]
# Min num samples to split a node
min_samples_split = [2, 5]
# Min num samples at each leaf node
min_samples_leaf = [1, 2]
# Choice to boostrap for sample selection
bootstrap = [True, False]

In [5]:
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap
            }
print(param_grid)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_features': ['auto', 'sqrt'], 'max_depth': [None, 2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [13]:
rf_gr_model = RandomForestClassifier()
rf_gr = GridSearchCV(estimator=rf_gr_model, param_grid=param_grid, cv=10, verbose=2, n_jobs=4)
rf_gr.fit(X_train, y_train)

Fitting 10 folds for each of 480 candidates, totalling 4800 fits
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators

GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [None, 2, 4],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                          100]},
             verbose=2)

In [14]:
rf_gr_pred = rf_gr.predict(X_test)
accuracy = accuracy_score(y_test, rf_gr_pred)
f1 = f1_score(y_test, rf_gr_pred)
rf_gr_probs = rf_gr.predict_proba(X_test)
rf_gr_probs = rf_gr_probs[:, 1]
rf_gr_auc = roc_auc_score(y_test, rf_gr_probs)
print(f"Accuracy: {accuracy}. F1-Score: {f1}. AUC: {rf_gr_auc}")

Accuracy: 0.9662447257383966. F1-Score: 0.9619047619047618. AUC: 0.9972164855885787


In [16]:
rf_gr.best_params_

{'bootstrap': True,
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 70}

In [17]:
import pickle
pickle.dump([rf_gr, X_train, X_test, y_train, y_test], open('models/gridsearch_rf', 'wb'))
print("Saved.")

Saved.


### Tree Plot

In [22]:
rf, X_train, X_test, y_train, y_test = pickle.load(open('models/fusion_rf', 'rb'))
feature_list = X_train.columns.values.tolist()

In [24]:
from sklearn.tree import export_graphviz
import pydot
import seaborn as sn

tree_small = rf.model.estimators_[5]
export_graphviz(tree_small, out_file = 'plots/small_tree.dot', feature_names = feature_list, rounded = True, precision = 1)

In [25]:
(graph, ) = pydot.graph_from_dot_file('plots/small_tree.dot')
graph.write_png('plots/small_tree.png')