In [2]:
import sys
sys.path.append("/home/fehrdelt/data_ssd/MedicalImaging_GIN/gradient_boosting")

import os
#import lightgbm as lgb
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

#from sklearn.manifold import TSNE
#import umap

#import matplotlib.pyplot as plt




In [3]:
#DATA_DIRECTORY = "/home/fehrdelt/data_ssd/data/clinical_data/Full/"
DATA_DIRECTORY = "C:\\Users\\Rivage\\Documents\\1\\programmation\\PFE\\clinical_data\\Full\\"

In [4]:
y = pd.read_csv(DATA_DIRECTORY+"combined_clinical_data_volumes_outcome_TTS_LDDMM.csv", usecols=[31])
y.head()

nan_indexes = y.loc[pd.isna(y["outcome_neurochir_pic"]), :].index # indexes where there is a nan value.
print(nan_indexes)

y = y.dropna()

y = y['outcome_neurochir_pic'].to_numpy()
y = [int(i) for i in y]

Index([76, 102, 104, 113, 118, 125], dtype='int64')


**Rajouter over et undersampling pour Ã§a**

In [5]:
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix

ftwo_scorer = make_scorer(fbeta_score, beta=2)

def confusion_matrix_scorer(clf, X, y):

     y_pred = clf.predict(X)
     cm = confusion_matrix(y, y_pred)

     return {'tn': cm[0, 0], 'fp': cm[0, 1],
             'fn': cm[1, 0], 'tp': cm[1, 1]}

def false_neg_scorer(clf, X, y):

     y_pred = clf.predict(X)
     cm = confusion_matrix(y, y_pred)
     
     return cm[1, 0]

def false_pos_scorer(clf, X, y):

     y_pred = clf.predict(X)
     cm = confusion_matrix(y, y_pred)
     
     return cm[0, 1]

In [6]:
FOLDS = 5

In [7]:
configs_list = ["TTS_ANTS", "TTS_ANTS_hist_match", "TTS_LDDMM", "matlab_ANTS", "matlab_ANTS_hist_match", "matlab_LDDMM", "custom_nn_ANTS", "custom_nn_ANTS_hist_match", "custom_nn_LDDMM"]

print("scikit learn gradient boosting classifier 5 fold stratified cross validation")


for config in configs_list:
    
    X = pd.read_csv(DATA_DIRECTORY+f"combined_clinical_data_volumes_outcome_{config}.csv", usecols=range(2,31))
    X = X.drop(nan_indexes)

    imp = SimpleImputer(missing_values=np.nan, strategy="median")

    imp.fit(X)
    X = imp.transform(X)

    nb_total_samples = len(X)
    


    #model = DecisionTreeClassifier()
    model = HistGradientBoostingClassifier(categorical_features=[False]*14 + [False, False, True, True, False, False, False, False, True, False, True, True, True, True, True])

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

    #scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    scores = cross_validate(model, X, y, scoring={'F2':ftwo_scorer, 'ROC_AUC':'roc_auc', 'Recall':'recall_macro', 'F1':'f1', 'Brier':"neg_brier_score", 'False_neg_scorer':false_neg_scorer, 'False_pos_scorer':false_pos_scorer}, cv=cv, n_jobs=-1)

    #print(scores)
    print(f" ---------- {config} ---------- ")
        
    roc_auc_metric = np.mean(scores["test_ROC_AUC"])
    print(f'AUC (max): {np.round(roc_auc_metric, 2)}')

    f1_score = np.mean(scores["test_F1"])
    print(f'F1 Score (max): {np.round(f1_score, 2)}')

    f2_score = np.mean(scores["test_F2"])
    print(f'F2 Score (max): {np.round(f2_score, 2)}')

    brier_score = -np.mean(scores["test_Brier"])
    print(f'Brier Score (min): {np.round(brier_score, 2)}')

    # test_False_neg_scorer returns the number of test false negatives -> to get a % we need to divide by the number of test samples*100
    false_neg_score = np.mean(scores["test_False_neg_scorer"])*100/(nb_total_samples/FOLDS)
    print(f'False negative %: {int(np.round(false_neg_score, 0))}%')

    false_pos_score = np.mean(scores["test_False_pos_scorer"])*100/(nb_total_samples/FOLDS)
    print(f'False positive %: {int(np.round(false_pos_score, 0))}%')
    

scikit learn gradient boosting classifier 5 fold stratified cross validation
 ---------- TTS_ANTS ---------- 
AUC (max): 0.86
F1 Score (max): 0.29
F2 Score (max): 0.26
Brier Score (min): 0.08
False negative %: 6%
False positive %: 3%
 ---------- TTS_ANTS_hist_match ---------- 
AUC (max): 0.85
F1 Score (max): 0.27
F2 Score (max): 0.23
Brier Score (min): 0.07
False negative %: 7%
False positive %: 2%
 ---------- TTS_LDDMM ---------- 
AUC (max): 0.86
F1 Score (max): 0.25
F2 Score (max): 0.21
Brier Score (min): 0.07
False negative %: 7%
False positive %: 2%
 ---------- matlab_ANTS ---------- 
AUC (max): 0.86
F1 Score (max): 0.33
F2 Score (max): 0.28
Brier Score (min): 0.07
False negative %: 6%
False positive %: 3%
 ---------- matlab_ANTS_hist_match ---------- 
AUC (max): 0.87
F1 Score (max): 0.28
F2 Score (max): 0.24
Brier Score (min): 0.07
False negative %: 7%
False positive %: 3%
 ---------- matlab_LDDMM ---------- 
AUC (max): 0.86
F1 Score (max): 0.31
F2 Score (max): 0.26
Brier Score (m