In [1]:
import numpy as np
import nilearn
import pandas as pd
import os
import pycaret
from nilearn.connectome import sym_matrix_to_vec
import pickle
import json
from modeling_utils import *
from pycaret.classification import *

# Prepare Data for PyCaret

In [2]:
# Root directory that holds every dataset. Override it per machine with the
# SCZ_DATA_ROOT environment variable; when the variable is unset the original
# location is used, so the resulting paths are unchanged.
# (Alternative: keep this mapping in a JSON file such as data_paths.json and
# read it with json.load.)
DATA_ROOT = os.environ.get(
    "SCZ_DATA_ROOT", "/Users/VictoriaShevchenko/Documents/PhD/SCZ"
).rstrip("/")

# Dataset label -> directory containing that dataset's cleaned data.
# Paths are joined with "/" explicitly so the strings match the originals
# byte-for-byte regardless of platform.
data_paths = {
    "COBRE-SCZ": f"{DATA_ROOT}/COBRE/clean_data/SCZ",
    "COBRE-controls": f"{DATA_ROOT}/COBRE/clean_data/controls",
    "LA5c": f"{DATA_ROOT}/LA5c/clean_data",
    "SRPBS-1600-KTT": f"{DATA_ROOT}/SRPBS_1600/KTT/clean_data",
}

In [3]:
# Collapse the different diagnosis labels onto one binary target
# (0 and 1). Keys are listed in the same order as before, because the
# consumer (prepare_data_csv) is not visible here and might iterate them.
# NOTE(review): presumably 0 = healthy control and 1 = schizophrenia, and the
# integer keys are a numeric coding used by one of the datasets — confirm.
diagnosis_mapping = dict([
    ('CONTROL', 0),
    ('SCHZ', 1),
    ('Schizophrenia_Strict', 1),
    ('No_Known_Disorder', 0),
    (4, 1),
    (0, 0),
])

In [8]:
# Build the participants table from the dataset directories, recoding each
# diagnosis label through `diagnosis_mapping` (prepare_data_csv comes from
# modeling_utils; its internals are not shown in this notebook).
participants = prepare_data_csv(data_paths, diag_mapping=diagnosis_mapping)

# Modelling table: everything except the subject identifier and file path.
# NOTE(review): the `dataset` column remains and is therefore passed to
# PyCaret as a feature; a classifier could use it as a proxy for
# acquisition site rather than diagnosis — confirm this is intended.
data = participants.drop(['participant_id', 'path'], axis="columns")

In [9]:
# Preview the modelling table (the rendered output shows the columns
# diagnosis, age, sex and dataset).
data

Unnamed: 0,diagnosis,age,sex,dataset
1,1,30,0,COBRE
2,1,32,1,COBRE
3,1,34,0,COBRE
4,1,20,1,COBRE
5,1,56,1,COBRE
...,...,...,...,...
554,0,31,0,SRPBS-1600-KTT
555,0,28,0,SRPBS-1600-KTT
556,0,32,0,SRPBS-1600-KTT
557,0,39,0,SRPBS-1600-KTT


# Model Selection

In [10]:
# Initialise the PyCaret classification experiment on the modelling table.
# `diagnosis` is the label to predict; the fixed session_id seeds PyCaret so
# the train/test split and cross-validation folds are repeatable.
exp = setup(
    data,
    target='diagnosis',
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,diagnosis
2,Target type,Binary
3,Original data shape,"(442, 4)"
4,Transformed data shape,"(442, 6)"
5,Transformed train set shape,"(309, 6)"
6,Transformed test set shape,"(133, 6)"
7,Numeric features,2
8,Categorical features,1
9,Preprocess,True


In [9]:
# Cross-validate the candidate classifiers and rank them in a leaderboard.
# NOTE(review): with n_select=14 PyCaret is asked for the 14 top-ranked
# models, so `best_model` presumably holds a list of estimators rather than
# a single one — confirm against the PyCaret docs before using it downstream.
best_model = compare_models(n_select=14)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.6376,0.6465,0.4803,0.5074,0.4904,0.2111,0.2128,0.043
dummy,Dummy Classifier,0.6349,0.5,0.0,0.0,0.0,0.0,0.0,0.04
ada,Ada Boost Classifier,0.6163,0.5993,0.2098,0.4867,0.2774,0.068,0.0878,0.076
knn,K Neighbors Classifier,0.6107,0.6131,0.4455,0.4714,0.4507,0.1517,0.1573,0.057
lda,Linear Discriminant Analysis,0.607,0.6435,0.2182,0.4812,0.2947,0.0596,0.0802,0.042
lr,Logistic Regression,0.6069,0.6431,0.2182,0.4783,0.294,0.059,0.0791,0.512
lightgbm,Light Gradient Boosting Machine,0.6045,0.612,0.4023,0.457,0.4205,0.1252,0.1286,0.061
ridge,Ridge Classifier,0.6039,0.0,0.2098,0.4712,0.2856,0.0501,0.0701,0.031
gbc,Gradient Boosting Classifier,0.5982,0.5925,0.3015,0.4029,0.3277,0.0679,0.0684,0.073
et,Extra Trees Classifier,0.5676,0.5718,0.3015,0.3852,0.3314,0.0228,0.0245,0.094


Processing:   0%|          | 0/78 [00:00<?, ?it/s]

In [10]:
# Fetch the score grid produced by the most recent PyCaret call; the rendered
# output here is the compare_models leaderboard. The result depends on which
# cell ran last, so keep this directly after the call it refers to.
pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.6376,0.6465,0.4803,0.5074,0.4904,0.2111,0.2128,0.043
dummy,Dummy Classifier,0.6349,0.5,0.0,0.0,0.0,0.0,0.0,0.04
ada,Ada Boost Classifier,0.6163,0.5993,0.2098,0.4867,0.2774,0.068,0.0878,0.076
knn,K Neighbors Classifier,0.6107,0.6131,0.4455,0.4714,0.4507,0.1517,0.1573,0.057
lda,Linear Discriminant Analysis,0.607,0.6435,0.2182,0.4812,0.2947,0.0596,0.0802,0.042
lr,Logistic Regression,0.6069,0.6431,0.2182,0.4783,0.294,0.059,0.0791,0.512
lightgbm,Light Gradient Boosting Machine,0.6045,0.612,0.4023,0.457,0.4205,0.1252,0.1286,0.061
ridge,Ridge Classifier,0.6039,0.0,0.2098,0.4712,0.2856,0.0501,0.0701,0.031
gbc,Gradient Boosting Classifier,0.5982,0.5925,0.3015,0.4029,0.3277,0.0679,0.0684,0.073
et,Extra Trees Classifier,0.5676,0.5718,0.3015,0.3852,0.3314,0.0228,0.0245,0.094


In [11]:
# Train a logistic regression ('lr') with cross-validation; the rendered
# output lists per-fold metrics for folds 0-9.
lr = create_model('lr')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6129,0.6711,0.1667,0.5,0.25,0.07,0.0892
1,0.4516,0.3794,0.0,0.0,0.0,-0.2948,-0.3485
2,0.5161,0.4934,0.25,0.3333,0.2857,-0.069,-0.0706
3,0.5806,0.6776,0.1667,0.4,0.2353,0.0098,0.0116
4,0.7097,0.7127,0.3333,0.8,0.4706,0.3145,0.3717
5,0.6774,0.7259,0.25,0.75,0.375,0.225,0.2868
6,0.6129,0.5746,0.1667,0.5,0.25,0.07,0.0892
7,0.4516,0.3553,0.0,0.0,0.0,-0.2948,-0.3485
8,0.6129,0.625,0.1667,0.5,0.25,0.07,0.0892
9,0.5667,0.6531,0.0,0.0,0.0,-0.1272,-0.2034


In [12]:
# Grab the score grid of the preceding PyCaret call as a table.
# NOTE(review): this rebinds `lr`, which until now referred to the object
# returned by create_model('lr'); that estimator is no longer reachable under
# this name. A separate name (e.g. lr_scores) would avoid the clobbering, but
# the next cell reads the table through `lr`, so both must change together.
lr = pull()

In [22]:
# Mean cross-validated accuracy from the score grid stored in `lr`.
# A single .loc[row, column] lookup replaces the chained
# .loc[row][column] form, which first materialises the whole row.
lr.loc["Mean", "Accuracy"]

0.5792