In [1]:
"""
Date: 10.03.2023
Author: Reto Hendry


"""


'\nDate: 10.03.2023\nAuthor: Reto Hendry\n\n\n'

In [2]:

#%% import libraries
import os
import numpy as np
import pandas as pd
import nibabel as nib
import time
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

#%% filepath
# path to data
#  - on Windows: C:/Users/tahendry/Desktop/Masterthesis_Reto/
#  - on Linux:   ../data/

# Choose the data directory by platform. os.uname() does not exist on
# Windows, so looking it up raises AttributeError there; only that error is
# treated as "we are on Windows". Anything else (e.g. KeyboardInterrupt)
# propagates instead of being silently swallowed by a bare except.
try:
    print(os.uname())
    data_path = "../data/"
except AttributeError:
    print("Windows")
    data_path = "C:/Users/tahendry/Desktop/Masterthesis_Reto/"

# name of the Excel workbook that holds the labels
label_file = "Conn_IDs_Matching.xlsx"

# load only the first three columns of the workbook
label_df = pd.read_excel(os.path.join(data_path, label_file), usecols=[0, 1, 2])

# recode the "Cond" column in two passes: first 1 -> 0, then 2 -> 1
label_df = label_df.replace({"Cond": {1: 0}})
label_df = label_df.replace({"Cond": {2: 1}})

# preview of the first rows (only rendered when it is the last expression of a cell)
label_df.head()

# list every entry of the MVPA data directory
path_content = os.listdir(os.path.join(data_path, "Denoised_Data_6mm", "MVPA_data"))

# keep only the file names of the first component, then split them into
# pre (Condition002) and post (Condition003) lists, each sorted by name
comp1_files = [name for name in path_content if "Component001" in name]
comp1_pre = sorted(name for name in comp1_files if "Condition002" in name)
comp1_post = sorted(name for name in comp1_files if "Condition003" in name)

print(comp1_pre[:5])

posix.uname_result(sysname='Linux', nodename='dalcowks', release='5.13.0-40-generic', version='#45~20.04.1-Ubuntu SMP Mon Apr 4 09:38:31 UTC 2022', machine='x86_64')
['BETA_Subject001_Condition002_Measure002_Component001.nii', 'BETA_Subject002_Condition002_Measure002_Component001.nii', 'BETA_Subject003_Condition002_Measure002_Component001.nii', 'BETA_Subject004_Condition002_Measure002_Component001.nii', 'BETA_Subject005_Condition002_Measure002_Component001.nii']


In [3]:
#%% prepare data
# create a dataset with the difference of pre and post data
mvpa_dir = os.path.join(data_path, "Denoised_Data_6mm", "MVPA_data")

# zip() stops at the shorter list, so a missing pre or post file would be
# dropped silently and every later sample could end up paired with the wrong
# label. Fail loudly instead.
if len(comp1_pre) != len(comp1_post):
    raise ValueError(
        f"number of pre files ({len(comp1_pre)}) does not match "
        f"number of post files ({len(comp1_post)})"
    )

comp1_diff = []
for pre, post in zip(comp1_pre, comp1_post):
    # load both volumes as floating point arrays
    pre_vol_data = nib.load(os.path.join(mvpa_dir, pre)).get_fdata()
    post_vol_data = nib.load(os.path.join(mvpa_dir, post)).get_fdata()
    # voxel-wise change from pre to post
    diff_vol_data = post_vol_data - pre_vol_data
    comp1_diff.append(diff_vol_data)

# check the shape of the data
print(comp1_diff[0].shape)

# check the type of the data
print(type(comp1_diff[0]))

# stack the per-file difference volumes into a single array
# note: the first dimension is the number of samples
print(f"shape of one list element before stacking: {comp1_diff[0].shape=}")
inpt_comp1_diff = np.stack(comp1_diff, axis=0)

# note: no normalization (zero mean, unit variance) is applied to the data

# transform the data from one 3d volume per sample to a 2d array with one
# flattened row per sample; -1 lets numpy derive the row length, so no
# intermediate channel axis is needed
num_samples = inpt_comp1_diff.shape[0]
x_dim, y_dim, z_dim = inpt_comp1_diff.shape[1:4]
inpt_comp1_diff = inpt_comp1_diff.reshape(num_samples, -1)


print(f"{inpt_comp1_diff.shape=}",
      f"{inpt_comp1_diff.mean()=}",
      f"{inpt_comp1_diff.std()=}", sep="\n")

# hold out 20 % of the samples as test set; the split is seeded and
# stratified on the "Cond" labels
cond_labels = label_df["Cond"]
x_train, x_test, y_train, y_test = train_test_split(
    inpt_comp1_diff,
    cond_labels,
    test_size=0.2,
    random_state=42,
    stratify=cond_labels,
)

# report the shape of each of the four splits, one per line
split_report = [
    f"{x_train.shape=}",
    f"{x_test.shape=}",
    f"{y_train.shape=}",
    f"{y_test.shape=}",
]
print(*split_report, sep="\n")



(91, 109, 91)
<class 'numpy.ndarray'>
shape of one list element before stacking: comp1_diff[0].shape=(91, 109, 91)
inpt_comp1_diff.shape=(68, 902629)
inpt_comp1_diff.mean()=0.0005298948701831914
inpt_comp1_diff.std()=0.22370761343748158
x_train.shape=(54, 902629)
x_test.shape=(14, 902629)
y_train.shape=(54,)
y_test.shape=(14,)


In [4]:
#%% define the autoML class
# Cross-validation splitter handed to TPOT: 5 stratified folds,
# shuffled with a fixed seed.
cv_stratified = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def tpot_class(nbr_generations, nbr_population_size, max_time=None):
    """Build an unfitted TPOTClassifier with this notebook's fixed settings.

    Parameters
    ----------
    nbr_generations
        Forwarded to TPOT as ``generations``.
    nbr_population_size
        Forwarded to TPOT as ``population_size``.
    max_time
        Forwarded to TPOT as ``max_time_mins``; defaults to ``None``.

    Returns
    -------
    TPOTClassifier
        The configured classifier; ``fit`` is not called here.
    """
    settings = dict(
        generations=nbr_generations,
        population_size=nbr_population_size,
        max_time_mins=max_time,
        early_stop=5,
        scoring="accuracy",
        cv=cv_stratified,  # module-level StratifiedKFold splitter
        n_jobs=-1,
        max_eval_time_mins=10,
        random_state=1,  # seed
        # NOTE(review): a falsy value presumably disables pipeline caching - confirm
        memory=False,
        periodic_checkpoint_folder="./tpot_folder",
        # NOTE(review): 3 appears to be TPOT's most verbose level - confirm
        verbosity=3,
        log_file="tpot_log",
    )
    return TPOTClassifier(**settings)


In [5]:
# Run one TPOT search per entry of parameter_list and report the best
# pipeline of each run. A new classifier is created in every iteration;
# after the loop, `tpot` refers to the classifier of the last run.
parameter_list = [8, 10]  # each value is used as generations AND population size

for i in parameter_list:
    tpot = tpot_class(nbr_generations=i, nbr_population_size=i)
    # perf_counter() is the clock intended for measuring elapsed time;
    # unlike time.time() it cannot jump when the system clock is adjusted
    start = time.perf_counter()
    tpot.fit(x_train, y_train)
    # stop the clock before scoring so "time needed" covers the fit only
    fit_minutes = (time.perf_counter() - start) / 60
    print(f"generation and pop_size: {i}",
          f"trained pipelines: {len(tpot.evaluated_individuals_)}",
          f"best model: {tpot.fitted_pipeline_}",
          f"best model score: {tpot.score(x_test, y_test)}",
          f"time needed: {fit_minutes:.2f} mins",
          sep="\n", end="\n\n")


32 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/6 [00:00<?, ?pipeline/s]



generation and pop_size: 2
trained pipelines: 6
best model: Pipeline(steps=[('stackingestimator',
                 StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=0.001,
                                                                        max_depth=8,
                                                                        max_features=0.7500000000000001,
                                                                        min_samples_leaf=16,
                                                                        min_samples_split=14,
                                                                        random_state=1,
                                                                        subsample=0.2))),
                ('rbfsampler', RBFSampler(gamma=0.4, random_state=1)),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(criterion='entropy', max_depth=10,
                                        min_samples_leaf=12,
    

Optimization Progress:   0%|          | 0/20 [00:00<?, ?pipeline/s]



generation and pop_size: 4
trained pipelines: 18
best model: Pipeline(steps=[('rbfsampler', RBFSampler(gamma=0.4, random_state=1)),
                ('nystroem',
                 Nystroem(gamma=0.15000000000000002, kernel='poly',
                          n_components=9, random_state=1)),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(criterion='entropy', max_depth=10,
                                        min_samples_leaf=12,
                                        min_samples_split=7, random_state=1))])
best model score: 0.5714285714285714
time needed: 21.36 mins

32 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/42 [00:00<?, ?pipeline/s]



generation and pop_size: 6
trained pipelines: 39
best model: Pipeline(steps=[('mlpclassifier',
                 MLPClassifier(alpha=0.001, learning_rate_init=0.01,
                               random_state=1))])
best model score: 0.5714285714285714
time needed: 44.28 mins

32 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/72 [00:00<?, ?pipeline/s]



generation and pop_size: 8
trained pipelines: 52
best model: Pipeline(steps=[('nystroem',
                 Nystroem(gamma=0.15000000000000002, kernel='poly',
                          n_components=9, random_state=1)),
                ('gradientboostingclassifier',
                 GradientBoostingClassifier(learning_rate=1.0, max_depth=9,
                                            max_features=0.7000000000000001,
                                            min_samples_leaf=7,
                                            min_samples_split=6, random_state=1,
                                            subsample=0.6000000000000001))])
best model score: 0.7142857142857143
time needed: 52.03 mins

32 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/110 [00:00<?, ?pipeline/s]

In [None]:
#%% fit the model
# NOTE(review): `tpot` is whatever the name was last bound to - after the
# parameter loop that is the classifier of the final iteration, on which
# fit() has already been called. Presumably this call repeats the whole
# search; confirm that the extra run is intended.
tpot.fit(x_train, y_train)
print("done with fitting")


In [None]:
#%% evaluate the model
# show the fitted pipeline; "\n" (not "/n") puts it on its own line
print("pipeline: \n", tpot.fitted_pipeline_)
# score of the fitted pipeline on the held-out test split
print(tpot.score(x_test, y_test))
# write the fitted pipeline to a Python file
tpot.export('tpot_pipeline.py')
print("best pipeline exported")

