In [None]:
# Install CatBoost via the %pip magic, which targets this notebook kernel's environment.
%pip install catboost

In [None]:
import pandas as pd
import numpy as np
import os
import re

In [None]:
# test catboost

import numpy
from catboost import CatBoostRegressor

# Tiny hand-written regression problem: four samples with four features each.
dataset = numpy.array([
    [1, 4, 5, 6],
    [4, 5, 6, 7],
    [30, 40, 50, 60],
    [20, 15, 85, 60],
])
train_labels = [1.2, 3.4, 9.5, 24.5]

# Fit a regressor on the toy data and show the parameters it reports.
model = CatBoostRegressor(
    learning_rate=1,
    depth=6,
    loss_function='RMSE',
)
fit_model = model.fit(dataset, train_labels)

print(fit_model.get_params())

In [None]:
def get_labels(directory, metadata_csv):
    """Build a table describing every epoch file found in ``directory``.

    File names must look like
    ``TMS-EEG-H_<patient>_<S|s><session>[b]_<rsEEG|spTEP>_<pre|post>-epo.fif``;
    every other directory entry is ignored.

    NOTE(review): a later cell of this notebook defines another ``get_labels``
    with a different signature, which replaces this one once that cell runs.

    Args:
        directory: Folder that is listed for epoch files.
        metadata_csv: Header-less CSV. Its first column becomes the row index
            (looked up as ``H<patient>``); the remaining columns are addressed
            by the integer session number and hold a procedure code
            (0 = sham, 1 = ctbs, 2 = itbs).

    Returns:
        pandas.DataFrame with the columns ``filename``, ``procedure``,
        ``patient_id``, ``eeg_type`` and ``pre_post``; one row per matching
        file, ordered by file name.

    Raises:
        KeyError: If the patient/session is not present in the metadata or the
            stored code is not 0, 1 or 2.
    """
    metadata = pd.read_csv(metadata_csv, index_col=0, header=None)
    labels = {0: 'sham', 1: 'ctbs', 2: 'itbs'}

    # The "S" may be upper or lower case and the session number may carry a
    # trailing "b". The session is digits only (``\w+`` would also swallow
    # letters and underscores) and the dot before "fif" is literal.
    pattern = re.compile(
        r'TMS-EEG-H_(\d+)_[Ss](\d+)b?_(rsEEG|spTEP)_(pre|post)-epo\.fif'
    )

    data = []
    # os.listdir() has no defined order; sort for a reproducible table.
    for filename in sorted(os.listdir(directory)):
        # fullmatch: names with extra trailing text (e.g. ".fif.bak") are skipped.
        match = pattern.fullmatch(filename)
        if match is None:
            continue
        patient_id, session, eeg_type, pre_post = match.groups()

        # Look up the procedure code for this patient's session in one
        # label-based step instead of chained indexing.
        procedure = labels[metadata.loc[f'H{patient_id}', int(session)]]

        data.append([filename, procedure, patient_id, eeg_type, pre_post])

    return pd.DataFrame(
        data,
        columns=['filename', 'procedure', 'patient_id', 'eeg_type', 'pre_post'],
    )

# Display the label table for the files in "dataset-cleaned"; the result is shown, not stored.
get_labels(directory="dataset-cleaned", metadata_csv="Randomisatielijst.csv")

In [None]:
from sklearn.model_selection import train_test_split

def get_train_test_split(directory, test_size=0.2):
    """Split the files of every participant into a training and a test part.

    Files are grouped by the first ``H_<digits>`` token found in their name
    and each group is split on its own, so a participant with two or more
    files contributes to both returned lists.

    Args:
        directory: Folder whose entries are grouped and split.
        test_size: Passed through to ``train_test_split`` for every group.

    Returns:
        Tuple ``(train_files, test_files)`` of file-name lists. Entries without
        an ``H_<digits>`` token are ignored. A participant with a single file
        is placed entirely in ``train_files``.
    """
    # Participant id (e.g. "H_12") -> file names belonging to that participant.
    participants_files = {}

    # os.listdir() gives no ordering guarantee; sorting makes the fixed
    # random_state below yield the same split on every machine and run.
    for filename in sorted(os.listdir(directory)):
        match = re.search(r'H_\d+', filename)
        if match:
            participants_files.setdefault(match.group(0), []).append(filename)

    train_files = []
    test_files = []

    for files in participants_files.values():
        if len(files) < 2:
            # One file cannot be divided (train_test_split would raise because
            # the training part would be empty), so keep it for training.
            train_files.extend(files)
            continue
        train, test = train_test_split(files, test_size=test_size, random_state=42)
        train_files.extend(train)
        test_files.extend(test)

    return train_files, test_files

# Split the files found in ./dataset-cleaned using the default test_size.
train_files, test_files = get_train_test_split(directory="./dataset-cleaned")

In [None]:
def get_data(directory, filenames):
    """Load the listed feature files and stack their rows into one frame.

    Every file is read from ``directory`` as a CSV with a two-row header. The
    frames are concatenated in the order of ``filenames`` and each keeps its
    own per-file row index.
    """
    frames = [
        pd.read_csv(os.path.join(directory, name), header=[0, 1])
        for name in filenames
    ]
    return pd.concat(frames)

def get_labels(label_name, label_df, filenames, directory="features", verbose=True):
    """Return a one-column frame holding one label per feature row.

    For each feature file the label is looked up in ``label_df`` and repeated
    once for every data row of that file, so the result has as many rows as
    the stacked contents of the files read in the same order.

    Args:
        label_name: Column of ``label_df`` to read the label from; also the
            name of the single column in the result.
        label_df: Frame with a ``filename`` column and a ``label_name`` column.
            Rows are matched on the text before the first "." of each entry in
            ``filenames``.
        filenames: Feature file names to process, in output order.
        directory: Folder the feature files are read from (default
            ``"features"``, the location that was previously hard-coded).
        verbose: When true (default), print the file list and, per file, the
            row count and label.

    Returns:
        pandas.DataFrame with one column named ``label_name``.

    Raises:
        IndexError: If ``label_df`` has no row for one of the files.
    """
    if verbose:
        print(filenames)

    labels = []
    for filename in filenames:
        # Two-row header, so the row count excludes both header lines.
        features = pd.read_csv(os.path.join(directory, filename), header=[0, 1])
        rows = features.shape[0]

        # Lookup key: everything before the first dot of the file name.
        stem = filename.split(".")[0]
        matches = label_df.loc[label_df['filename'] == stem, label_name].values
        if len(matches) == 0:
            # Same exception type as the bare ``.values[0]`` lookup, with a usable message.
            raise IndexError(
                f"no row in label_df with filename == {stem!r} (from {filename!r})"
            )
        label = matches[0]

        if verbose:
            print(f'rows: {rows}, label: {label}')
        # One copy of the label per feature row.
        labels.extend([label] * rows)

    return pd.DataFrame(labels, columns=[label_name])

# Substrings a feature file name must contain to be selected.
requirements = ["rsEEG"]

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of data_df and label_df identical on every run and machine.
files = [
    filename
    for filename in sorted(os.listdir("features"))
    if all(x in filename for x in requirements)
]

# Feature rows of all selected files, stacked in the order of `files`.
data_df = get_data("features", files)

# Keep the raw per-file label table under its own name so that `label_df`
# only ever refers to the per-row labels aligned with `data_df`.
label_table = pd.read_csv("labels.csv")
label_df = get_labels("timing", label_table, files)

label_df

In [None]:
import numpy as np

from catboost import CatBoostClassifier, Pool

# Features and per-row labels prepared by the loading cell.
train_data, train_labels = data_df, label_df

# NOTE(review): `test_data` is the very same Pool as `catboost_pool`, built from
# the training features and labels, so the predictions below are made on the
# rows the model was fitted on.
catboost_pool = Pool(train_data, train_labels)
test_data = catboost_pool

model = CatBoostClassifier(
    iterations=5,
    depth=2,
    learning_rate=1,
    loss_function='Logloss',
    verbose=True,
)

# Fit on the raw frame/labels (not on the Pool), then predict on the Pool.
model.fit(train_data, train_labels)

preds_class = model.predict(test_data)
preds_proba = model.predict_proba(test_data)

print("class = ", preds_class)
print("proba = ", preds_proba)
