In [2]:
import sys
sys.path.append("/home/fehrdelt/data_ssd/MedicalImaging_GIN/gradient_boosting")

import os
import lightgbm as lgb
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.manifold import TSNE
import umap

import matplotlib.pyplot as plt




In [5]:
DATA_DIRECTORY = "/home/fehrdelt/data_ssd/data/clinical_data/Full/"

In [6]:
y = pd.read_csv(DATA_DIRECTORY+"combined_clinical_data_volumes_outcome_TTS_LDDMM.csv", usecols=[31])
y.head()

nan_indexes = y.loc[pd.isna(y["outcome_neurochir_pic"]), :].index # indexes where there is a nan value.
print(nan_indexes)

y = y.dropna()

y = y['outcome_neurochir_pic'].to_numpy()
y = [int(i) for i in y]

Index([76, 102, 104, 113, 118, 125], dtype='int64')


In [7]:
configs_list = ["TTS_ANTS", "TTS_ANTS_hist_match", "TTS_LDDMM", "matlab_ANTS", "matlab_ANTS_hist_match", "matlab_LDDMM", "custom_nn_ANTS", "custom_nn_ANTS_hist_match", "custom_nn_LDDMM"]

print("scikit learn gradient boosting classifier 5 fold stratified cross validation")


for config in configs_list:

    X = pd.read_csv(DATA_DIRECTORY+f"combined_clinical_data_volumes_outcome_{config}.csv", usecols=range(2,31))
    X = X.drop(nan_indexes)

    imp = SimpleImputer(missing_values=np.nan, strategy="median")

    imp.fit(X)
    X = imp.transform(X)


    #model = DecisionTreeClassifier()
    model = HistGradientBoostingClassifier(categorical_features=[False]*14 + [False, False, True, True, False, False, False, False, True, False, True, True, True, True, True])

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

    #scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    scores = cross_validate(model, X, y, scoring=['roc_auc', 'recall_macro'], cv=cv, n_jobs=-1)

    #print(scores)
    print(" ------------------------------ ")
    print(config+' Mean ROC AUC: %.3f' % np.mean(scores['test_roc_auc']))
    print(config+' Mean recall macro: %.3f' % np.mean(scores['test_recall_macro']))
    

scikit learn gradient boosting classifier 5 fold stratified cross validation
 ------------------------------ 
TTS_ANTS Mean ROC AUC: 0.859
TTS_ANTS Mean recall macro: 0.602
 ------------------------------ 
TTS_ANTS_hist_match Mean ROC AUC: 0.849
TTS_ANTS_hist_match Mean recall macro: 0.596
 ------------------------------ 
TTS_LDDMM Mean ROC AUC: 0.857
TTS_LDDMM Mean recall macro: 0.585
 ------------------------------ 
matlab_ANTS Mean ROC AUC: 0.862
matlab_ANTS Mean recall macro: 0.616
 ------------------------------ 
matlab_ANTS_hist_match Mean ROC AUC: 0.867
matlab_ANTS_hist_match Mean recall macro: 0.596
 ------------------------------ 
matlab_LDDMM Mean ROC AUC: 0.861
matlab_LDDMM Mean recall macro: 0.609
 ------------------------------ 
custom_nn_ANTS Mean ROC AUC: 0.853
custom_nn_ANTS Mean recall macro: 0.594
 ------------------------------ 
custom_nn_ANTS_hist_match Mean ROC AUC: 0.860
custom_nn_ANTS_hist_match Mean recall macro: 0.584
 ------------------------------ 
custom_nn_