In [2]:
import numpy as np
import nilearn
import pandas as pd
import os
import pycaret
from nilearn.connectome import sym_matrix_to_vec
import pickle
from pycaret.classification import *


# Prepare Data for PyCaret

In [3]:
# Root of the local data tree. The default is the original author's location;
# set the SCZ_DATA_ROOT environment variable to run the notebook on another machine.
data_root = os.environ.get('SCZ_DATA_ROOT', '/Users/VictoriaShevchenko/Documents/PhD/SCZ')

# COBRE keeps the SCZ group and the controls in two separate directories.
path_cobre_scz = f'{data_root}/COBRE/clean_data/SCZ'
path_cobre_controls = f'{data_root}/COBRE/clean_data/controls'

# LA5c and KTT (part of SRPBS_1600) each live under a single directory.
path_la5c = f'{data_root}/LA5c/clean_data'
path_ktt = f'{data_root}/SRPBS_1600/KTT/clean_data'

In [4]:
# Read the tab-separated participants.tsv that sits at the top of each data directory.
(
    participants_cobre_scz,
    participants_cobre_controls,
    participants_la5c,
    participants_ktt,
) = (
    pd.read_csv(f'{site_dir}/participants.tsv', sep='\t')
    for site_dir in (path_cobre_scz, path_cobre_controls, path_la5c, path_ktt)
)

In [5]:
# Harmonise the label column name: both COBRE tables call it "dx", KTT calls it "diag".
cobre_label_rename = {"dx": "diagnosis"}
participants_cobre_scz = participants_cobre_scz.rename(cobre_label_rename, axis="columns")
participants_cobre_controls = participants_cobre_controls.rename(cobre_label_rename, axis="columns")
participants_ktt = participants_ktt.rename({"diag": "diagnosis"}, axis="columns")

# LA5c: keep only the rows labelled CONTROL or SCHZ.
la5c_is_control_or_scz = participants_la5c['diagnosis'].isin(['CONTROL', 'SCHZ'])
participants_la5c = participants_la5c[la5c_is_control_or_scz]

In [6]:
# Keep only the identifier and the label. `.copy()` makes each result an independent
# frame, so the column assignments below do not raise pandas' SettingWithCopyWarning
# (selecting a column subset of a frame otherwise yields an object pandas treats as a
# potential view of its parent).
id_and_label = ['participant_id', 'diagnosis']
participants_cobre_scz = participants_cobre_scz[id_and_label].copy()
participants_cobre_controls = participants_cobre_controls[id_and_label].copy()
participants_la5c = participants_la5c[id_and_label].copy()
participants_ktt = participants_ktt[id_and_label].copy()

# Tag every row with the dataset it came from.
participants_cobre_scz['dataset'] = 'COBRE'
participants_cobre_controls['dataset'] = 'COBRE'
participants_la5c['dataset'] = 'LA5c'
participants_ktt['dataset'] = 'KTT'


In [7]:
# Binary target used throughout: 1 = schizophrenia, 0 = control.
control_label, scz_label = 0, 1

# COBRE ships the two groups as separate tables, so each table gets one constant label.
participants_cobre_scz["diagnosis"] = scz_label
participants_cobre_controls["diagnosis"] = control_label

# LA5c uses text labels and KTT uses numeric codes; translate both to the 0/1 target.
# Values absent from a mapping become NaN (for KTT: any code other than 0 or 4).
la5c_codes = {'CONTROL': control_label, 'SCHZ': scz_label}
ktt_codes = {0: control_label, 4: scz_label}
participants_la5c["diagnosis"] = participants_la5c["diagnosis"].map(la5c_codes)
participants_ktt["diagnosis"] = participants_ktt["diagnosis"].map(ktt_codes)


In [7]:
# One single-row DataFrame per COBRE subject: the off-diagonal half of the
# connectivity matrix flattened into a vector, plus the participant id.
cobre_matrices = []

# Schizophrenia group.
for subject in participants_cobre_scz['participant_id']:
    matrix_file = f'{path_cobre_scz}/sub-{subject}/func/z-conn-matrix-sub-{subject}-rest-schaefer1000.npy'
    try:
        connectivity = np.load(matrix_file)
    except FileNotFoundError as e:
        # Report the missing file and drop the subject from the participants table.
        print(f"Matrix not found for subject {subject}: {e}.")
        still_present = participants_cobre_scz['participant_id'] != subject
        participants_cobre_scz = participants_cobre_scz[still_present]
    else:
        subject_row = pd.DataFrame(sym_matrix_to_vec(connectivity, discard_diagonal=True)).T
        subject_row["participant_id"] = subject
        cobre_matrices.append(subject_row)


# Control group: same procedure, different directory and participants table.
for subject in participants_cobre_controls['participant_id']:
    matrix_file = f'{path_cobre_controls}/sub-{subject}/func/z-conn-matrix-sub-{subject}-rest-schaefer1000.npy'
    try:
        connectivity = np.load(matrix_file)
    except FileNotFoundError as e:
        # Report the missing file and drop the subject from the participants table.
        print(f"Matrix not found for subject {subject}: {e}.")
        still_present = participants_cobre_controls['participant_id'] != subject
        participants_cobre_controls = participants_cobre_controls[still_present]
    else:
        subject_row = pd.DataFrame(sym_matrix_to_vec(connectivity, discard_diagonal=True)).T
        subject_row["participant_id"] = subject
        cobre_matrices.append(subject_row)


Matrix not found for subject A00014636: [Errno 2] No such file or directory: '/Users/VictoriaShevchenko/Documents/PhD/SCZ/COBRE/clean_data/SCZ/sub-A00014636/func/z-conn-matrix-sub-A00014636-rest-schaefer1000.npy'.


In [8]:
# One single-row DataFrame per LA5c subject (vectorised connectivity + participant id).
# The participant id is used as the directory name as-is (no 'sub-' prefix is added here).
la5c_matrices = []
for subject in participants_la5c['participant_id']:
    try:
        matrix = np.load(f'{path_la5c}/{subject}/func/z-conn-matrix-{subject}-rest-schaefer1000.npy')
        # Transpose so the vector becomes ONE ROW of features, as in the COBRE cell.
        # Without `.T` every subject contributes a single column with one row per
        # connectivity value, and the later concat/merge no longer yields one row per subject.
        matrix = pd.DataFrame(sym_matrix_to_vec(matrix, discard_diagonal=True)).T
        matrix["participant_id"] = subject
        la5c_matrices.append(matrix)
    except FileNotFoundError as e:  # output "matrix not found", remove the subject from the dataframe and continue
        print(f"Matrix not found for subject {subject}: {e}.")
        participants_la5c = participants_la5c[participants_la5c['participant_id'] != subject].reset_index(drop=True)
        continue

Matrix not found for subject sub-10193: [Errno 2] No such file or directory: '/Users/VictoriaShevchenko/Documents/PhD/SCZ/LA5c/clean_data/sub-10193/func/z-conn-matrix-sub-10193-rest-schaefer1000.npy'.
Matrix not found for subject sub-10299: [Errno 2] No such file or directory: '/Users/VictoriaShevchenko/Documents/PhD/SCZ/LA5c/clean_data/sub-10299/func/z-conn-matrix-sub-10299-rest-schaefer1000.npy'.
Matrix not found for subject sub-10428: [Errno 2] No such file or directory: '/Users/VictoriaShevchenko/Documents/PhD/SCZ/LA5c/clean_data/sub-10428/func/z-conn-matrix-sub-10428-rest-schaefer1000.npy'.
Matrix not found for subject sub-10501: [Errno 2] No such file or directory: '/Users/VictoriaShevchenko/Documents/PhD/SCZ/LA5c/clean_data/sub-10501/func/z-conn-matrix-sub-10501-rest-schaefer1000.npy'.
Matrix not found for subject sub-10948: [Errno 2] No such file or directory: '/Users/VictoriaShevchenko/Documents/PhD/SCZ/LA5c/clean_data/sub-10948/func/z-conn-matrix-sub-10948-rest-schaefer1000.n

In [9]:
# One single-row DataFrame per KTT subject (vectorised connectivity + participant id).
# The participant id is used as the directory name as-is (no 'sub-' prefix is added here).
ktt_matrices = []
for subject in participants_ktt['participant_id']:
    try:
        matrix = np.load(f'{path_ktt}/{subject}/func/z-conn-matrix-{subject}-rest-schaefer1000.npy')
        # Transpose so the vector becomes ONE ROW of features, as in the COBRE cell.
        # Without `.T` every subject contributes a single column with one row per
        # connectivity value, and the later concat/merge no longer yields one row per subject.
        matrix = pd.DataFrame(sym_matrix_to_vec(matrix, discard_diagonal=True)).T
        matrix["participant_id"] = subject
        ktt_matrices.append(matrix)
    except FileNotFoundError as e:  # output "matrix not found", remove the subject from the dataframe and continue
        print(f"Matrix not found for subject {subject}: {e}.")
        participants_ktt = participants_ktt[participants_ktt['participant_id'] != subject]
        continue

In [10]:
# Collapse each list of per-subject frames into one DataFrame per dataset.
# NOTE: the names are rebound from lists to DataFrames, so this cell cannot be re-run
# without first re-running the loading cells.
cobre_matrices, la5c_matrices, ktt_matrices = (
    pd.concat(per_subject_frames, ignore_index=True)
    for per_subject_frames in (cobre_matrices, la5c_matrices, ktt_matrices)
)

In [11]:
# Stack the three per-dataset feature tables row-wise under a fresh 0..n-1 index.
matrices = pd.concat(
    [cobre_matrices, la5c_matrices, ktt_matrices],
    axis=0,
    ignore_index=True,
)

In [12]:
# Stack the four participants tables row-wise under a fresh 0..n-1 index.
participants = pd.concat(
    [
        participants_cobre_scz,
        participants_cobre_controls,
        participants_la5c,
        participants_ktt,
    ],
    axis=0,
    ignore_index=True,
)

In [13]:
# Attach diagnosis/dataset to each feature row by participant id
# (default inner join: rows without a match on either side are dropped).
data = matrices.merge(participants, on='participant_id')

In [14]:
# Remove the bookkeeping columns so only the features and the 'diagnosis' target remain.
data = data.drop(['participant_id', 'dataset'], axis=1)

In [15]:
# Display the assembled modelling table: the feature columns followed by 'diagnosis'.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,499491,499492,499493,499494,499495,499496,499497,499498,499499,diagnosis
0,0.316018,0.398603,0.559165,0.321593,0.730107,0.515045,-0.232108,0.263316,0.304452,0.157879,...,0.927735,0.642035,1.706551,0.454816,1.107561,0.797940,0.473853,0.971337,1.067367,1
1,0.509892,0.456055,0.687725,0.188446,0.901553,0.489064,0.619088,0.611821,0.960266,0.361279,...,1.214872,1.505645,1.422630,0.898804,0.725651,0.981397,1.523692,1.119500,1.677067,1
2,0.724815,0.816604,1.112925,0.125689,0.329958,0.190370,0.612875,0.279698,0.517683,-0.111345,...,0.893457,0.600211,1.078061,0.296381,0.644819,0.479703,0.991282,0.257461,1.108251,1
3,0.522150,0.571995,0.679807,0.239536,0.492589,0.366672,0.446235,0.632842,0.590239,0.229741,...,0.612295,0.661265,0.494329,0.583290,0.473774,0.208284,1.111928,0.523424,0.887664,1
4,0.119225,0.423197,0.576157,0.391305,0.712178,0.985341,0.089581,0.766389,0.935538,0.794467,...,0.461221,0.456221,0.610659,0.423758,0.344830,0.663136,0.704022,0.242100,0.889444,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,0.655666,0.494201,0.800881,0.285964,0.565003,0.581054,0.204118,0.148276,0.412323,0.031338,...,0.541302,0.593199,1.261298,-0.053800,0.764658,1.196214,1.509966,0.906632,1.039177,0
438,0.732985,-0.196750,0.183309,-0.061203,0.229907,0.662011,0.333043,0.597345,0.504695,0.193596,...,0.970029,0.873027,1.210271,0.522735,1.199122,0.841579,1.246116,0.749762,1.433632,0
439,0.934060,0.329803,0.119423,0.216826,0.158884,0.262061,-0.056514,0.002856,0.341852,-0.029275,...,0.604717,0.882536,0.395092,0.175504,0.456599,0.031720,0.747380,0.138312,0.845100,0
440,0.977340,0.728325,0.812409,0.654763,1.093813,0.671781,0.504081,0.717877,0.644427,0.473394,...,0.849338,0.864503,1.131685,0.396251,0.802067,0.580894,1.419934,1.074154,1.295092,0


# Model Selection

In [49]:
# Initialise the PyCaret classification experiment (functional API) on the full
# feature table, with a fixed session id and experiment logging switched on.
s = setup(
    data,
    target='diagnosis',
    session_id=123,
    log_experiment=True,
    experiment_name='diagnosis',
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 499501)"
4,Transformed data shape,"(442, 499501)"
5,Transformed train set shape,"(309, 499501)"
6,Transformed test set shape,"(133, 499501)"
7,Numeric features,499500
8,Preprocess,True
9,Imputation type,simple


In [53]:
# Same setup through PyCaret's object-oriented API; the value of the final
# expression is what the cell displays.
exp = ClassificationExperiment()
exp.setup(
    data,
    target='diagnosis',
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 499501)"
4,Transformed data shape,"(442, 499501)"
5,Transformed train set shape,"(309, 499501)"
6,Transformed test set shape,"(133, 499501)"
7,Numeric features,499500
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7f7741c070d0>

In [54]:
# Compare the available classifiers using 10-fold cross-validation.
n_cv_folds = 10
best = compare_models(fold=n_cv_folds)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

# Load saved models

In [20]:
# load pickled file
with open('results/top_10_models_scz.pkl', 'rb') as file:
    top_10_models = pickle.load(file)


# Model Selection for Combinations of Gradients

In [122]:
# Scratch check of the index range produced by np.arange: 0 through 8.
np.arange(9)

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [8]:
participants = pd.concat([participants_cobre_scz, participants_cobre_controls, participants_la5c, participants_ktt], ignore_index=True, axis = 0)


def _load_subject_gradients(subject_ids, root, prefix=''):
    """Load every subject's aligned-gradient array exactly once.

    Parameters
    ----------
    subject_ids : iterable
        Participant identifiers, in the order they should appear in the result.
    root : str
        Dataset directory containing one sub-directory per subject.
    prefix : str, default ''
        Text placed in front of the identifier in directory and file names
        ('sub-' for COBRE; LA5c and KTT identifiers are used as-is).

    Returns
    -------
    loaded : list of (subject_id, ndarray)
        Element ``[0]`` of each array that could be read, in input order.
    missing : list
        Identifiers whose file does not exist; a message is printed for each.
    """
    loaded, missing = [], []
    for subject in subject_ids:
        gradient_file = f'{root}/{prefix}{subject}/func/aligned-10gradients-{prefix}{subject}-rest-schaefer1000.npy'
        try:
            loaded.append((subject, np.load(gradient_file)[0]))
        except FileNotFoundError as e:
            print(f"Gradients not found for subject {subject}: {e}.")
            missing.append(subject)
    return loaded, missing


def _gradient_feature_table(loaded, n_gradients):
    """Build one row per subject from the first ``n_gradients`` gradient columns.

    Columns 0..n_gradients-1 of each subject's array are laid end to end
    (all of column 0, then all of column 1, ...), and the subject identifier is
    stored in a trailing 'participant_id' column.
    """
    # assumes each loaded array is 2-D (regions x gradients) -- TODO confirm
    rows = [subject_gradients[:, :n_gradients].T.reshape(-1) for _, subject_gradients in loaded]
    table = pd.DataFrame(np.vstack(rows))
    table["participant_id"] = [subject for subject, _ in loaded]
    return table


# Read each gradient file once, up front. The arrays do not depend on grad_comb, so
# loading them inside the loop (and once per selected column) only repeated the same
# disk reads. Subjects without a file are removed from their participants table.
cobre_scz_loaded, missing_ids = _load_subject_gradients(participants_cobre_scz['participant_id'], path_cobre_scz, prefix='sub-')
participants_cobre_scz = participants_cobre_scz[~participants_cobre_scz['participant_id'].isin(missing_ids)]

cobre_controls_loaded, missing_ids = _load_subject_gradients(participants_cobre_controls['participant_id'], path_cobre_controls, prefix='sub-')
participants_cobre_controls = participants_cobre_controls[~participants_cobre_controls['participant_id'].isin(missing_ids)]

la5c_loaded, missing_ids = _load_subject_gradients(participants_la5c['participant_id'], path_la5c)
participants_la5c = participants_la5c[~participants_la5c['participant_id'].isin(missing_ids)].reset_index(drop=True)

ktt_loaded, missing_ids = _load_subject_gradients(participants_ktt['participant_id'], path_ktt)
participants_ktt = participants_ktt[~participants_ktt['participant_id'].isin(missing_ids)].reset_index(drop=True)

for grad_comb in range(1,11):
    # Feature tables for the first `grad_comb` gradients, one per dataset.
    cobre_grads = _gradient_feature_table(cobre_scz_loaded + cobre_controls_loaded, grad_comb)
    la5c_grads = _gradient_feature_table(la5c_loaded, grad_comb)
    ktt_grads = _gradient_feature_table(ktt_loaded, grad_comb)

    grads = pd.concat([cobre_grads, la5c_grads, ktt_grads], ignore_index=True, axis = 0)
    # Inner join: feature rows are matched to their diagnosis by participant id.
    data_grad = pd.merge(grads, participants, on='participant_id')
    data_grad = data_grad.drop(columns=['participant_id', 'dataset'])

    s = setup(data_grad, target = 'diagnosis', session_id = 123)
    print(f"---------------------------TESTING THE COMBINATION OF {grad_comb} GRADIENTS---------------------------")
    top_10_models_grad = compare_models(fold = 10, n_select = 10)
    # NOTE(review): PyCaret's save_model appears to append '.pkl' to the given name, so the
    # file on disk is probably '...grad_comb_N.pkl.pkl' -- confirm before renaming, since
    # load_model would need the matching name.
    save_model(top_10_models_grad, f'results/top_10_models_grad_comb_{grad_comb}.pkl')

    # get_leaderboard takes no model argument: its first positional parameter is the
    # boolean `finalize_models`, so passing the model list there switched finalisation on.
    leader_board = get_leaderboard()
    leader_board.to_csv(f'results/model_leaderboard_grad_comb_{grad_comb}.csv')
    

Gradients not found for subject A00014636: [Errno 2] No such file or directory: '/Users/VictoriaShevchenko/Documents/PhD/SCZ/COBRE/clean_data/SCZ/sub-A00014636/func/aligned-10gradients-sub-A00014636-rest-schaefer1000.npy'.
Gradients not found for subject sub-10193: [Errno 2] No such file or directory: '/Users/VictoriaShevchenko/Documents/PhD/SCZ/LA5c/clean_data/sub-10193/func/aligned-10gradients-sub-10193-rest-schaefer1000.npy'.
Gradients not found for subject sub-10299: [Errno 2] No such file or directory: '/Users/VictoriaShevchenko/Documents/PhD/SCZ/LA5c/clean_data/sub-10299/func/aligned-10gradients-sub-10299-rest-schaefer1000.npy'.
Gradients not found for subject sub-10428: [Errno 2] No such file or directory: '/Users/VictoriaShevchenko/Documents/PhD/SCZ/LA5c/clean_data/sub-10428/func/aligned-10gradients-sub-10428-rest-schaefer1000.npy'.
Gradients not found for subject sub-10501: [Errno 2] No such file or directory: '/Users/VictoriaShevchenko/Documents/PhD/SCZ/LA5c/clean_data/sub-10

Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 1001)"
4,Transformed data shape,"(442, 1001)"
5,Transformed train set shape,"(309, 1001)"
6,Transformed test set shape,"(133, 1001)"
7,Numeric features,1000
8,Preprocess,True
9,Imputation type,simple


---------------------------TESTING THE COMBINATION OF 1 GRADIENTS---------------------------


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7153,0.7094,0.5712,0.6603,0.6045,0.3855,0.3928,0.587
svm,SVM - Linear Kernel,0.7056,0.0,0.5871,0.6334,0.6018,0.3705,0.3758,0.194
et,Extra Trees Classifier,0.6794,0.7426,0.3015,0.7106,0.4138,0.2417,0.2881,0.237
lightgbm,Light Gradient Boosting Machine,0.6794,0.7078,0.4273,0.6225,0.5041,0.2798,0.2919,0.205
gbc,Gradient Boosting Classifier,0.6762,0.7273,0.4121,0.6442,0.4945,0.2725,0.2925,0.255
lda,Linear Discriminant Analysis,0.673,0.7105,0.4947,0.589,0.5302,0.2851,0.2918,0.254
rf,Random Forest Classifier,0.6728,0.7158,0.3265,0.6955,0.4231,0.2363,0.2787,0.231
ridge,Ridge Classifier,0.6568,0.0,0.5538,0.5551,0.5511,0.2741,0.2763,0.18
dt,Decision Tree Classifier,0.638,0.6239,0.5636,0.5433,0.5484,0.2483,0.2512,0.196
nb,Naive Bayes,0.6275,0.6943,0.703,0.5083,0.5874,0.2621,0.2776,0.225


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 2001)"
4,Transformed data shape,"(442, 2001)"
5,Transformed train set shape,"(309, 2001)"
6,Transformed test set shape,"(133, 2001)"
7,Numeric features,2000
8,Preprocess,True
9,Imputation type,simple


---------------------------TESTING THE COMBINATION OF 2 GRADIENTS---------------------------


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7376,0.758,0.5705,0.7102,0.6252,0.4277,0.4395,0.213
lda,Linear Discriminant Analysis,0.7247,0.762,0.5447,0.6798,0.5946,0.3934,0.4048,0.223
ridge,Ridge Classifier,0.7214,0.0,0.5705,0.6663,0.6096,0.396,0.4025,0.228
et,Extra Trees Classifier,0.7053,0.7549,0.3348,0.7667,0.4601,0.302,0.3523,0.243
svm,SVM - Linear Kernel,0.7052,0.0,0.5455,0.6618,0.5882,0.3629,0.3739,0.225
lightgbm,Light Gradient Boosting Machine,0.6955,0.7188,0.4439,0.6835,0.5196,0.314,0.3399,0.218
rf,Random Forest Classifier,0.6924,0.743,0.3182,0.7603,0.4389,0.2729,0.3272,0.284
nb,Naive Bayes,0.6922,0.739,0.6447,0.5895,0.6104,0.3573,0.3642,0.206
gbc,Gradient Boosting Classifier,0.6694,0.7209,0.4015,0.6478,0.4802,0.2552,0.2808,0.28
ada,Ada Boost Classifier,0.6408,0.6697,0.4697,0.5472,0.4998,0.2232,0.2279,0.234


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 3001)"
4,Transformed data shape,"(442, 3001)"
5,Transformed train set shape,"(309, 3001)"
6,Transformed test set shape,"(133, 3001)"
7,Numeric features,3000
8,Preprocess,True
9,Imputation type,simple


---------------------------TESTING THE COMBINATION OF 3 GRADIENTS---------------------------


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7537,0.0,0.6212,0.7296,0.6545,0.4674,0.4831,0.291
lr,Logistic Regression,0.7276,0.7944,0.5864,0.6817,0.6173,0.4101,0.4219,0.402
lda,Linear Discriminant Analysis,0.7276,0.7672,0.5348,0.6862,0.5872,0.3932,0.4092,0.411
svm,SVM - Linear Kernel,0.7214,0.0,0.6121,0.6748,0.6189,0.4042,0.4228,0.299
nb,Naive Bayes,0.7116,0.7486,0.6364,0.6359,0.6254,0.3927,0.4029,0.334
rf,Random Forest Classifier,0.7085,0.7601,0.3515,0.797,0.4712,0.3125,0.3703,0.541
lightgbm,Light Gradient Boosting Machine,0.6986,0.7417,0.4848,0.6636,0.5355,0.3267,0.35,1.498
et,Extra Trees Classifier,0.6794,0.7402,0.3182,0.7212,0.4294,0.2484,0.2967,0.49
knn,K Neighbors Classifier,0.6501,0.6579,0.4765,0.5565,0.5038,0.2393,0.2463,0.323
gbc,Gradient Boosting Classifier,0.6471,0.7214,0.3682,0.5841,0.4343,0.2012,0.2203,5.709


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 4001)"
4,Transformed data shape,"(442, 4001)"
5,Transformed train set shape,"(309, 4001)"
6,Transformed test set shape,"(133, 4001)"
7,Numeric features,4000
8,Preprocess,True
9,Imputation type,simple


---------------------------TESTING THE COMBINATION OF 4 GRADIENTS---------------------------


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7537,0.807,0.6038,0.7375,0.6438,0.4616,0.4817,0.498
ridge,Ridge Classifier,0.7408,0.0,0.5955,0.6953,0.6315,0.4359,0.4458,0.355
lda,Linear Discriminant Analysis,0.7406,0.7947,0.578,0.7209,0.6184,0.4299,0.4536,0.507
svm,SVM - Linear Kernel,0.7278,0.0,0.5621,0.7088,0.5981,0.4032,0.4235,0.382
nb,Naive Bayes,0.7245,0.7534,0.653,0.6387,0.6411,0.4181,0.4229,0.379
rf,Random Forest Classifier,0.7018,0.7589,0.325,0.7345,0.4388,0.2884,0.3315,0.64
knn,K Neighbors Classifier,0.6892,0.7004,0.5197,0.5966,0.547,0.3192,0.3217,0.411
lightgbm,Light Gradient Boosting Machine,0.6794,0.7363,0.4439,0.6476,0.5073,0.2842,0.3057,2.068
ada,Ada Boost Classifier,0.6761,0.7037,0.5114,0.5857,0.536,0.2931,0.3004,1.797
gbc,Gradient Boosting Classifier,0.6696,0.7152,0.4098,0.5957,0.4751,0.2527,0.2651,7.926


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 5001)"
4,Transformed data shape,"(442, 5001)"
5,Transformed train set shape,"(309, 5001)"
6,Transformed test set shape,"(133, 5001)"
7,Numeric features,5000
8,Preprocess,True
9,Imputation type,simple


---------------------------TESTING THE COMBINATION OF 5 GRADIENTS---------------------------


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.7698,0.7989,0.6197,0.7521,0.6671,0.4955,0.5113,0.651
ridge,Ridge Classifier,0.7539,0.0,0.6212,0.7223,0.6473,0.4642,0.4841,0.446
lr,Logistic Regression,0.7504,0.8076,0.5955,0.7388,0.638,0.4538,0.4769,0.53
svm,SVM - Linear Kernel,0.731,0.0,0.5864,0.6942,0.6142,0.4138,0.4323,0.481
nb,Naive Bayes,0.7115,0.7549,0.6273,0.6192,0.6194,0.3876,0.391,0.47
knn,K Neighbors Classifier,0.7083,0.7205,0.5015,0.6387,0.5559,0.3487,0.355,0.468
et,Extra Trees Classifier,0.7053,0.7642,0.3015,0.829,0.4229,0.2892,0.361,0.649
rf,Random Forest Classifier,0.686,0.7241,0.2591,0.8225,0.3737,0.2392,0.3158,0.72
lightgbm,Light Gradient Boosting Machine,0.6729,0.7148,0.4364,0.646,0.4909,0.2686,0.2947,2.638
gbc,Gradient Boosting Classifier,0.6601,0.7404,0.403,0.6254,0.4724,0.2387,0.2623,9.979


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 6001)"
4,Transformed data shape,"(442, 6001)"
5,Transformed train set shape,"(309, 6001)"
6,Transformed test set shape,"(133, 6001)"
7,Numeric features,6000
8,Preprocess,True
9,Imputation type,simple


---------------------------TESTING THE COMBINATION OF 6 GRADIENTS---------------------------


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7635,0.0,0.6379,0.7414,0.6688,0.4887,0.5081,0.496
lr,Logistic Regression,0.7603,0.8215,0.5962,0.7674,0.6528,0.4758,0.4992,0.69
svm,SVM - Linear Kernel,0.757,0.0,0.6129,0.7333,0.6554,0.4719,0.4861,0.514
knn,K Neighbors Classifier,0.7213,0.7144,0.4523,0.733,0.5445,0.3643,0.394,0.549
lda,Linear Discriminant Analysis,0.7147,0.7941,0.5189,0.6956,0.576,0.3701,0.392,0.798
nb,Naive Bayes,0.7018,0.7582,0.5939,0.6175,0.5994,0.3632,0.368,0.531
rf,Random Forest Classifier,0.6955,0.7513,0.2841,0.7933,0.4109,0.2668,0.3328,0.82
et,Extra Trees Classifier,0.689,0.7318,0.2667,0.7475,0.3815,0.2458,0.3035,0.695
gbc,Gradient Boosting Classifier,0.6759,0.7268,0.3932,0.6317,0.4769,0.2628,0.2816,11.868
lightgbm,Light Gradient Boosting Machine,0.673,0.7644,0.3947,0.6403,0.4651,0.2558,0.2804,3.287


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 7001)"
4,Transformed data shape,"(442, 7001)"
5,Transformed train set shape,"(309, 7001)"
6,Transformed test set shape,"(133, 7001)"
7,Numeric features,7000
8,Preprocess,True
9,Imputation type,simple


---------------------------TESTING THE COMBINATION OF 7 GRADIENTS---------------------------


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7796,0.8246,0.6462,0.7833,0.6855,0.5207,0.5456,0.873
ridge,Ridge Classifier,0.7668,0.0,0.6462,0.734,0.6736,0.4956,0.5103,0.577
svm,SVM - Linear Kernel,0.7571,0.0,0.6303,0.7229,0.6638,0.4767,0.4875,0.593
lda,Linear Discriminant Analysis,0.7212,0.7881,0.4932,0.7245,0.5649,0.3742,0.402,0.864
knn,K Neighbors Classifier,0.7052,0.7052,0.3924,0.6985,0.484,0.3124,0.343,0.622
nb,Naive Bayes,0.7019,0.7566,0.5773,0.6209,0.5925,0.3592,0.364,0.618
rf,Random Forest Classifier,0.6957,0.7618,0.2765,0.8361,0.3912,0.2623,0.3406,0.927
gbc,Gradient Boosting Classifier,0.6955,0.7228,0.4197,0.6699,0.5086,0.3078,0.3286,13.799
et,Extra Trees Classifier,0.6891,0.755,0.2758,0.7799,0.3951,0.2511,0.3157,0.807
lightgbm,Light Gradient Boosting Machine,0.6759,0.7511,0.4189,0.6274,0.4813,0.2673,0.2884,3.88


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 8001)"
4,Transformed data shape,"(442, 8001)"
5,Transformed train set shape,"(309, 8001)"
6,Transformed test set shape,"(133, 8001)"
7,Numeric features,8000
8,Preprocess,True
9,Imputation type,simple


---------------------------TESTING THE COMBINATION OF 8 GRADIENTS---------------------------


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7699,0.8304,0.6129,0.7807,0.6614,0.4947,0.5214,0.983
ridge,Ridge Classifier,0.7571,0.0,0.6129,0.7319,0.6491,0.469,0.4876,0.657
svm,SVM - Linear Kernel,0.7312,0.0,0.5879,0.6952,0.6149,0.4149,0.4336,0.678
knn,K Neighbors Classifier,0.7281,0.7208,0.3773,0.8442,0.4906,0.3542,0.413,0.711
lda,Linear Discriminant Analysis,0.7275,0.79,0.4848,0.723,0.5669,0.3834,0.4067,1.008
nb,Naive Bayes,0.7213,0.7605,0.5939,0.6447,0.6131,0.3972,0.4018,0.693
lightgbm,Light Gradient Boosting Machine,0.6986,0.7502,0.4439,0.6726,0.5159,0.3176,0.3405,4.412
et,Extra Trees Classifier,0.6858,0.7883,0.2508,0.8328,0.3642,0.2353,0.3163,0.923
rf,Random Forest Classifier,0.6795,0.7521,0.2174,0.8429,0.3228,0.2111,0.2949,1.037
gbc,Gradient Boosting Classifier,0.6567,0.7317,0.3591,0.5715,0.4376,0.2142,0.2246,15.693


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 9001)"
4,Transformed data shape,"(442, 9001)"
5,Transformed train set shape,"(309, 9001)"
6,Transformed test set shape,"(133, 9001)"
7,Numeric features,9000
8,Preprocess,True
9,Imputation type,simple


---------------------------TESTING THE COMBINATION OF 9 GRADIENTS---------------------------


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7635,0.8245,0.6045,0.7696,0.6511,0.4802,0.5074,1.134
ridge,Ridge Classifier,0.7442,0.0,0.6045,0.7123,0.6308,0.442,0.4643,0.732
svm,SVM - Linear Kernel,0.731,0.0,0.5447,0.7216,0.5931,0.4042,0.4298,0.755
lda,Linear Discriminant Analysis,0.7308,0.7956,0.4848,0.7522,0.5667,0.3891,0.4215,1.178
knn,K Neighbors Classifier,0.7216,0.7369,0.4356,0.7452,0.5293,0.3589,0.3923,0.814
nb,Naive Bayes,0.7181,0.7613,0.5856,0.6542,0.6082,0.3908,0.4,0.78
lightgbm,Light Gradient Boosting Machine,0.7019,0.7665,0.4614,0.6668,0.5258,0.3274,0.3475,5.035
et,Extra Trees Classifier,0.6859,0.7537,0.2348,0.8631,0.3574,0.233,0.3226,1.055
rf,Random Forest Classifier,0.6667,0.7459,0.2265,0.7638,0.337,0.1926,0.261,1.11
gbc,Gradient Boosting Classifier,0.6468,0.7098,0.3341,0.571,0.4168,0.1896,0.2052,17.661


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 10001)"
4,Transformed data shape,"(442, 10001)"
5,Transformed train set shape,"(309, 10001)"
6,Transformed test set shape,"(133, 10001)"
7,Numeric features,10000
8,Preprocess,True
9,Imputation type,simple


---------------------------TESTING THE COMBINATION OF 10 GRADIENTS---------------------------


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.757,0.8209,0.5955,0.7487,0.6424,0.4657,0.4883,1.243
ridge,Ridge Classifier,0.7474,0.0,0.6045,0.7237,0.6359,0.4491,0.4712,0.836
svm,SVM - Linear Kernel,0.7441,0.0,0.5955,0.7127,0.6281,0.44,0.4569,0.845
lda,Linear Discriminant Analysis,0.734,0.8031,0.4848,0.7632,0.5671,0.3947,0.4286,1.28
nb,Naive Bayes,0.7181,0.755,0.5939,0.6509,0.6123,0.393,0.4011,0.873
knn,K Neighbors Classifier,0.7152,0.7293,0.4356,0.7197,0.5203,0.345,0.3749,0.884
rf,Random Forest Classifier,0.6665,0.7309,0.2174,0.769,0.3253,0.1879,0.2589,1.239
et,Extra Trees Classifier,0.6665,0.7152,0.1591,0.8,0.2545,0.166,0.256,1.177
lightgbm,Light Gradient Boosting Machine,0.6632,0.7331,0.3856,0.6115,0.4566,0.236,0.2559,5.617
gbc,Gradient Boosting Classifier,0.6533,0.7041,0.3508,0.586,0.4345,0.2075,0.2238,19.682


Transformation Pipeline and Model Successfully Saved


In [111]:
# One single-row DataFrame per COBRE subject: gradient columns 0 and 2 laid end to end,
# plus the participant id.
# NOTE(review): index 2 is the third gradient -- confirm that skipping index 1 is intended.
cobre_grads = []

for subject in participants_cobre_scz['participant_id']:
    try:
        # Read the file once and slice both columns from the same array
        # (the file used to be loaded twice per subject).
        aligned = np.load(f'{path_cobre_scz}/sub-{subject}/func/aligned-10gradients-sub-{subject}-rest-schaefer1000.npy')[0]
        grad1 = aligned[:, 0]
        grad2 = aligned[:, 2]
        grad = np.concatenate((grad1, grad2))
        grad = pd.DataFrame(grad).T
        grad["participant_id"] = subject
        cobre_grads.append(grad)
    except FileNotFoundError as e:
        # Report the missing file and drop the subject from the participants table.
        print(f"Gradients not found for subject {subject}: {e}.")
        participants_cobre_scz = participants_cobre_scz[participants_cobre_scz['participant_id'] != subject]
        continue

for subject in participants_cobre_controls['participant_id']:
    try:
        aligned = np.load(f'{path_cobre_controls}/sub-{subject}/func/aligned-10gradients-sub-{subject}-rest-schaefer1000.npy')[0]
        grad1 = aligned[:, 0]
        grad2 = aligned[:, 2]
        grad = np.concatenate((grad1, grad2))
        grad = pd.DataFrame(grad).T
        grad["participant_id"] = subject
        cobre_grads.append(grad)
    except FileNotFoundError as e:
        # The message now names the gradients file (it previously said "Matrix not found").
        print(f"Gradients not found for subject {subject}: {e}.")
        participants_cobre_controls = participants_cobre_controls[participants_cobre_controls['participant_id'] != subject]
        continue

In [112]:
# One single-row DataFrame per LA5c subject: gradient columns 0 and 2 laid end to end,
# plus the participant id.
la5c_grads = []
for subject in participants_la5c['participant_id']:
    try:
        # Read the file once and slice both columns from the same array
        # (the file used to be loaded twice per subject).
        aligned = np.load(f'{path_la5c}/{subject}/func/aligned-10gradients-{subject}-rest-schaefer1000.npy')[0]
        grad1 = aligned[:, 0]
        grad2 = aligned[:, 2]
        grad = np.concatenate((grad1, grad2))
        grad = pd.DataFrame(grad).T
        grad["participant_id"] = subject
        la5c_grads.append(grad)
    except FileNotFoundError as e:  # report the missing file, remove the subject from the dataframe and continue
        print(f"Gradients not found for subject {subject}: {e}.")
        participants_la5c = participants_la5c[participants_la5c['participant_id'] != subject].reset_index(drop=True)
        continue

In [113]:
# One single-row DataFrame per KTT subject: gradient columns 0 and 2 laid end to end,
# plus the participant id.
ktt_grads = []
for subject in participants_ktt['participant_id']:
    try:
        # Read the file once and slice both columns from the same array
        # (the file used to be loaded twice per subject).
        aligned = np.load(f'{path_ktt}/{subject}/func/aligned-10gradients-{subject}-rest-schaefer1000.npy')[0]
        grad1 = aligned[:, 0]
        grad2 = aligned[:, 2]
        grad = np.concatenate((grad1, grad2))
        grad = pd.DataFrame(grad).T
        grad["participant_id"] = subject
        ktt_grads.append(grad)
    except FileNotFoundError as e:  # report the missing file, remove the subject from the dataframe and continue
        print(f"Gradients not found for subject {subject}: {e}.")
        participants_ktt = participants_ktt[participants_ktt['participant_id'] != subject].reset_index(drop=True)
        continue

In [114]:
# Collapse each list of per-subject frames into one DataFrame per dataset.
# NOTE: the names are rebound from lists to DataFrames, so this cell cannot be re-run
# without first re-running the loading cells.
cobre_grads, la5c_grads, ktt_grads = (
    pd.concat(per_subject_frames, ignore_index=True)
    for per_subject_frames in (cobre_grads, la5c_grads, ktt_grads)
)

In [115]:
# Stack the three per-dataset gradient tables row-wise under a fresh 0..n-1 index.
grads = pd.concat(
    [cobre_grads, la5c_grads, ktt_grads],
    axis=0,
    ignore_index=True,
)

In [116]:
# Rebuild the combined participants table from the four per-site tables.
participants = pd.concat(
    [
        participants_cobre_scz,
        participants_cobre_controls,
        participants_la5c,
        participants_ktt,
    ],
    axis=0,
    ignore_index=True,
)

In [117]:
# Join the gradient features to their diagnosis by participant id (inner join), then
# remove the bookkeeping columns so only the features and the target remain.
data_grad = (
    grads
    .merge(participants, on='participant_id')
    .drop(columns=['participant_id', 'dataset'])
)

In [118]:
# Display the gradient modelling table: the feature columns followed by 'diagnosis'.
data_grad

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,diagnosis
0,-1.085287,-1.899534,1.551088,-0.568496,0.912087,-0.764685,-1.623822,-0.351781,-0.708447,-0.864799,...,-0.164501,-1.992526,-0.438555,-0.944984,-2.108790,-1.809188,-1.937727,-0.542517,-1.800789,1
1,0.880812,-0.995200,0.313740,-1.843073,0.640820,-2.321203,-1.626162,-0.656469,-2.283416,-2.754447,...,-0.456323,-1.049693,0.350130,-1.293241,-2.229483,-1.039149,-1.713071,-0.333480,-0.048532,1
2,-1.178170,-1.078775,-1.066179,-0.589453,-1.502057,-1.316850,-0.166793,-0.369926,-0.084310,-1.896288,...,1.844938,-0.206867,0.576732,-1.341111,-1.174386,-1.253823,-0.504910,1.077233,-0.448108,1
3,0.751678,-0.476395,-0.862251,-2.870346,0.075747,-2.927635,-2.781976,-2.075118,-1.578347,-0.571861,...,-0.207595,-1.551838,-0.510801,-1.052190,-2.101152,-0.114787,-0.436519,0.936376,0.999620,1
4,0.600807,1.091632,1.088720,0.614737,1.551320,1.430642,1.532772,1.429613,1.729258,-0.938351,...,-0.141138,-1.258181,2.539389,-1.278720,-0.729533,4.163509,-1.226767,-0.326552,-0.036043,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,-1.650203,-2.276065,0.150079,-1.192997,1.285092,-0.319191,-2.358506,-0.305442,-1.693796,-1.330515,...,1.460811,0.445554,-0.736792,0.898829,0.367090,0.566130,1.978469,0.576747,0.765413,0
438,-0.302474,-0.248823,0.403272,0.852552,-0.879313,-1.702773,1.299584,0.387610,-1.591730,0.353537,...,0.056117,-0.625227,-0.781440,-0.310465,-0.789679,-0.825177,-0.294901,0.214933,-0.384255,0
439,-1.254661,-1.768036,-0.211371,-2.657617,1.287597,-0.870516,0.536357,-3.067705,-2.257884,-2.172788,...,0.372770,-1.386503,-0.327097,-0.872699,-1.846247,-0.564955,-1.965100,0.134874,0.972302,0
440,-2.153528,-2.654739,-2.063455,-2.855273,-0.472108,-1.115471,-2.419532,-2.368633,-0.401775,0.315276,...,-0.529554,0.201305,2.389103,0.369862,0.462510,0.774002,1.120081,1.624812,0.863619,0


In [119]:
# Initialise the PyCaret classification experiment on the gradient features,
# with `diagnosis` as the target and a fixed session id.
s = setup(
    data=data_grad,
    target="diagnosis",
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 2001)"
4,Transformed data shape,"(442, 2001)"
5,Transformed train set shape,"(309, 2001)"
6,Transformed test set shape,"(133, 2001)"
7,Numeric features,2000
8,Preprocess,True
9,Imputation type,simple


In [120]:
# Compare the candidate classifiers with 10 folds and keep 10 of them.
top_10_models_grad = compare_models(
    n_select=10,
    fold=10,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7149,0.7503,0.5364,0.6749,0.5894,0.3762,0.388,0.234
lightgbm,Light Gradient Boosting Machine,0.7052,0.7293,0.4765,0.6584,0.532,0.3347,0.3517,1.114
rf,Random Forest Classifier,0.6989,0.7532,0.3265,0.7892,0.4355,0.2845,0.3437,0.436
svm,SVM - Linear Kernel,0.6957,0.0,0.503,0.6662,0.5553,0.3333,0.3513,0.185
ridge,Ridge Classifier,0.6955,0.0,0.5447,0.6302,0.5806,0.3432,0.3488,0.192
lda,Linear Discriminant Analysis,0.6955,0.7544,0.5189,0.6288,0.5541,0.3304,0.343,0.701
et,Extra Trees Classifier,0.6859,0.7476,0.3341,0.6866,0.4312,0.26,0.2961,0.463
gbc,Gradient Boosting Classifier,0.6759,0.7272,0.4258,0.6022,0.4841,0.2672,0.279,4.431
nb,Naive Bayes,0.6601,0.7258,0.6705,0.5522,0.5995,0.3101,0.3214,0.202
knn,K Neighbors Classifier,0.6244,0.6478,0.478,0.5178,0.4858,0.1942,0.2015,0.198


In [72]:
# Show the estimators selected by compare_models.
# NOTE(review): this cell's execution count (72) is lower than that of the
# compare_models cell above (120), and the estimator order in the saved output
# does not match that leaderboard — the output looks stale; re-run to confirm.
top_10_models_grad

[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
               early_stopping=False, epsilon=0.1, eta0=0.001, fit_intercept=True,
               l1_ratio=0.15, learning_rate='optimal', loss='hinge',
               max_iter=1000, n_iter_no_change=5, n_jobs=-1, penalty='l2',
               power_t=0.5, random_state=123, shuffle=True, tol=0.001,
               validation_fraction=0.1, verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='sqrt',
                      max_leaf_nodes=None, max_samples=None,
                      min_i