In [1]:
%pip install catboost mne colorlog

Collecting catboost
  Downloading catboost-1.2.5-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting mne
  Downloading mne-1.7.1-py3-none-any.whl.metadata (13 kB)
Collecting colorlog
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-5.22.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pooch>=1.5 (from mne)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting tenacity>=6.2.0 (from plotly->catboost)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp311-cp311-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0mm
[?25hDownloading mne-1.7.1-py3-none-any.whl (7.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4

In [2]:
import pandas as pd
import numpy as np
import os
import re
import utils

In [None]:
# test catboost

import numpy
from catboost import CatBoostRegressor

dataset = numpy.array([[1,4,5,6],[4,5,6,7],[30,40,50,60],[20,15,85,60]])
train_labels = [1.2,3.4,9.5,24.5]
model = CatBoostRegressor(learning_rate=1, depth=6, loss_function='RMSE')
fit_model = model.fit(dataset, train_labels)

print(fit_model.get_params())

In [3]:
def get_labels(directory, metadata_csv):
    metadata = pd.read_csv(metadata_csv, index_col=0, header=None)

    labels = {0: 'sham', 1: 'ctbs', 2: 'itbs'}
    data = []

    for filename in os.listdir(directory):
        # note that the s can be upper or lower case and that the letter b can be behind the session number
        match = re.match(r'TMS-EEG-H_(\d+)_(S|s)(\w+)(b?)_(rsEEG|spTEP)_(pre|post)-epo.fif', filename)
        if match:
            patient_id, _, session, _, eeg_type, pre_post = match.groups()
            session = int(session.rstrip('b'))

            # Get the procedure for the session from the metadata
            procedure = labels[metadata.loc[f'H{patient_id}'][session]]

            data.append([filename, procedure, patient_id, eeg_type, pre_post])

    df = pd.DataFrame(data, columns=['filename', 'procedure', 'patient_id', 'eeg_type', 'pre_post'])
    return df

get_labels("dataset-cleaned", "Randomisatielijst.csv")

Unnamed: 0,filename,procedure,patient_id,eeg_type,pre_post
0,TMS-EEG-H_06_S3_spTEP_post-epo.fif,sham,06,spTEP,post
1,TMS-EEG-H_14_S2_rsEEG_post-epo.fif,itbs,14,rsEEG,post
2,TMS-EEG-H_15_S1_spTEP_pre-epo.fif,ctbs,15,spTEP,pre
3,TMS-EEG-H_08_S2_rsEEG_pre-epo.fif,itbs,08,rsEEG,pre
4,TMS-EEG-H_16_S2_spTEP_pre-epo.fif,sham,16,spTEP,pre
...,...,...,...,...,...
171,TMS-EEG-H_15_S3_spTEP_post-epo.fif,itbs,15,spTEP,post
172,TMS-EEG-H_07_S2_rsEEG_post-epo.fif,ctbs,07,rsEEG,post
173,TMS-EEG-H_13_S3_spTEP_post-epo.fif,sham,13,spTEP,post
174,TMS-EEG-H_08_s1_spTEP_post-epo.fif,sham,08,spTEP,post


In [50]:
train_filenames, test_filenames = utils.get_train_test_split("./features", random_state=0)
print(len(train_filenames), len(test_filenames))

142 36


In [51]:
def get_data(directory, filenames):
    """Returns a df containing all rows from the given feature files."""
    data = []
    for filename in filenames:
        df = pd.read_csv(os.path.join(directory, filename), header=[0,1])
        data.append(df)
    df = pd.concat(data)
    return df

def get_labels(label_name, label_df, filenames):
    """Returns a df column containg the labels for the given filenames. This is achieved by repeating the respective label for each row in the feature file."""
    labels = []
    print(filenames)
    for filename in filenames:
        df = pd.read_csv(os.path.join("features", filename), header=[0,1])
        rows = df.shape[0]
        filename = filename.split(".")[0]
        label = label_df[label_df['filename'] == filename][label_name].values[0]
        print(f'rows: {rows}, label: {label}')
        # add label * rows times
        labels.extend([label] * rows)
    df = pd.DataFrame(labels, columns=[label_name])
    return df

requirements = ["rsEEG"]
labels_df = pd.read_csv("labels.csv")

train_files = []
for filename in train_filenames:
    if all(x in filename for x in requirements):
        train_files.append(filename)
test_files = []
for filename in test_filenames:
    if all(x in filename for x in requirements):
        test_files.append(filename)

train_data_df = get_data("features", train_files)
train_labels_df = get_labels("timing", labels_df, train_files)

test_data_df = get_data("features", test_files)
test_labels_df = get_labels("timing", labels_df, test_files)

train_labels_df

['TMS-EEG-H_07_S1_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_07_S1_rsEEG_post-epo.fif.csv', 'TMS-EEG-H_07_S2_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_07_S2_rsEEG_post-epo.fif.csv', 'TMS-EEG-H_07_S3_rsEEG_post-epo.fif.csv', 'TMS-EEG-H_07_S3_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_08_s1_rsEEG_post-epo.fif.csv', 'TMS-EEG-H_08_S2_rsEEG_post-epo.fif.csv', 'TMS-EEG-H_08_s1_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_08_S3_rsEEG_post-epo.fif.csv', 'TMS-EEG-H_08_S3_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_08_S2_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_11_S2_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_11_S3_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_11_S1_rsEEG_post-epo.fif.csv', 'TMS-EEG-H_11_S2_rsEEG_post-epo.fif.csv', 'TMS-EEG-H_11_S1_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_04_S1b_rsEEG_post-epo.fif.csv', 'TMS-EEG-H_04_S1b_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_04_S2_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_04_S2_rsEEG_post-epo.fif.csv', 'TMS-EEG-H_04_S3_rsEEG_pre-epo.fif.csv', 'TMS-EEG-H_04_S3_rsEEG_post-epo.fif.csv', 'TMS-EEG-H_15_S2_rsEEG_pre-epo.fif.csv', 'T

Unnamed: 0,timing
0,0
1,0
2,0
3,0
4,0
...,...
15809,0
15810,0
15811,0
15812,0


In [52]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore

def remove_outliers_zscore(df, threshold=3):
    z_scores = np.abs(zscore(df))
    return df[(z_scores < threshold).all(axis=1)]

# Standardization
scaler = StandardScaler()
train_data_df = pd.DataFrame(scaler.fit_transform(train_data_df), columns=train_data_df.columns)
test_data_df = pd.DataFrame(scaler.fit_transform(test_data_df), columns=test_data_df.columns)

# outlier removal
train_data_df_clean = remove_outliers_zscore(train_data_df)
train_labels_df = train_labels_df[train_data_df.index]
train_data_df = train_data_df_clean

In [57]:
import numpy as np
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier, Pool

# initialize data
train_data = train_data_df
train_labels = train_labels_df

test_data = catboost_pool = Pool(test_data_df,
                                 test_labels_df)

model = CatBoostClassifier(iterations=75,
                           depth=7,
                           learning_rate=0.3,
                           loss_function='Logloss',
                           verbose=True)
# train the model
model.fit(train_data, train_labels)

# make the prediction using the resulting model
preds_class = model.predict(test_data)
preds_proba = model.predict_proba(test_data)
print("class = ", preds_class)
print("proba = ", preds_proba)

accuracy = accuracy_score(test_labels_df, preds_class)
print("Accuracy on test set: ", accuracy)

0:	learn: 0.6368123	total: 361ms	remaining: 26.7s
1:	learn: 0.5910076	total: 642ms	remaining: 23.4s
2:	learn: 0.5506468	total: 935ms	remaining: 22.4s
3:	learn: 0.5257770	total: 1.2s	remaining: 21.3s
4:	learn: 0.4949464	total: 1.47s	remaining: 20.6s
5:	learn: 0.4679550	total: 1.81s	remaining: 20.8s
6:	learn: 0.4431094	total: 2.2s	remaining: 21.4s
7:	learn: 0.4248396	total: 2.47s	remaining: 20.7s
8:	learn: 0.4051298	total: 2.8s	remaining: 20.5s
9:	learn: 0.3909678	total: 3.18s	remaining: 20.7s
10:	learn: 0.3725797	total: 3.51s	remaining: 20.4s
11:	learn: 0.3577107	total: 3.82s	remaining: 20s
12:	learn: 0.3470072	total: 4.1s	remaining: 19.5s
13:	learn: 0.3351306	total: 4.37s	remaining: 19s
14:	learn: 0.3234696	total: 4.64s	remaining: 18.6s
15:	learn: 0.3099700	total: 4.91s	remaining: 18.1s
16:	learn: 0.3001859	total: 5.18s	remaining: 17.7s
17:	learn: 0.2912856	total: 5.46s	remaining: 17.3s
18:	learn: 0.2811322	total: 5.75s	remaining: 16.9s
19:	learn: 0.2718361	total: 6.01s	remaining: 16.5

In [38]:
# Lasso

import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

train_data = train_data_df
train_labels = train_labels_df

test_data = test_data_df
test_labels = test_labels_df

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_data)
X_test_scaled = scaler.transform(test_data)

lasso = Lasso(alpha=0.7)
lasso.fit(X_train_scaled, train_labels)

y_pred_train = lasso.predict(X_train_scaled)
y_pred_test = lasso.predict(X_test_scaled)

mse_train = mean_squared_error(train_labels, y_pred_train)
mse_test = mean_squared_error(test_labels, y_pred_test)

r2_train = r2_score(train_labels, y_pred_train)
r2_test = r2_score(test_labels, y_pred_test)

print(f'Training MSE: {mse_train}')
print(f'Testing MSE: {mse_test}')
print(f'Training R^2: {r2_train}')
print(f'Testing R^2: {r2_test}')

# Assuming binary classification with threshold 0.5
threshold = 0.5
preds_class_train = (y_pred_train >= threshold).astype(int)
preds_class_test = (y_pred_test >= threshold).astype(int)

accuracy_train = accuracy_score(train_labels, preds_class_train)
accuracy_test = accuracy_score(test_labels, preds_class_test)

print(f'Training Accuracy: {accuracy_train}')
print(f'Testing Accuracy: {accuracy_test}')

Training MSE: 0.24980470052382214
Testing MSE: 0.25239942324731024
Training R^2: 0.0
Testing R^2: -0.03535254955919154
Training Accuracy: 0.5139749588971797
Testing Accuracy: 0.4211402413649605


In [58]:
# Random search
param_dist = {
    'iterations': randint(50, 200),
    'depth': randint(4, 10),
    'learning_rate': uniform(0.01, 0.3),
    'l2_leaf_reg': uniform(1, 10),
    'border_count': randint(32, 255)
}

model = CatBoostClassifier(loss_function='Logloss', verbose=0)

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=2,
    scoring='accuracy',
    cv=5,
    random_state=42,
    verbose=1,
    n_jobs=-1
)

random_search.fit(train_data, train_labels)

print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation accuracy: {random_search.best_score_}")

Fitting 5 folds for each of 2 candidates, totalling 10 fits








Best parameters found: {'border_count': 134, 'depth': 7, 'iterations': 142, 'l2_leaf_reg': 2.8343478986616377, 'learning_rate': 0.24390730008183079}
Best cross-validation accuracy: 0.5521065938129099


NameError: name 'X_test' is not defined

In [61]:
best_model = random_search.best_estimator_
y_pred = best_model.predict(test_data)
accuracy = accuracy_score(test_labels, y_pred)
print(f"Test set accuracy: {accuracy}")

Test set accuracy: 0.47503121098626716
