# Fit and Evaluate XGBoost Model for different data splits

In this experiment we fit an XGBoost model using different data splits. In the previous experiments the data was split by patient; here we split it by record id (section 3) and by recording center (section 4) instead.

In [21]:
import pandas as pd
import seaborn as sns
import shap
from matplotlib import pyplot as plt
import monipy.utils.database as database
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### 1. Helper Functions

In [22]:
def load_and_prep_data(window=0, data_filter=4):
    """Load the per-center feature tables and reduce them to one starting window.

    Parameters
    ----------
    window : int, default 0
        Value of the ``window`` column whose rows are kept.
    data_filter : int, default 4
        Number interpolated into the ``filter_<n>`` directory of every CSV path.

    Returns
    -------
    pandas.DataFrame
        Rows of the concatenated center tables for ``window``, without the
        columns reported by ``database.get_all_patients()`` and without the
        seizure-id, window and start-timestamp helper columns. The index
        labels of the source files are kept as they are (no re-indexing
        after the concatenation).
    """
    # NOTE(review): the Sandor table is read from "useable_yes_unknown" while
    # every other center comes from "useable_yes" -- confirm this is intended.
    csv_paths = [
        f"data/useable_yes_unknown/filter_{data_filter}/sandor_full.csv",
        f"data/useable_yes/filter_{data_filter}/ukt_full.csv",
        f"data/useable_yes/filter_{data_filter}/freiburg_full.csv",
        f"data/useable_yes/filter_{data_filter}/uka_corvolution_full.csv",
        f"data/useable_yes/filter_{data_filter}/uka_klinik_full.csv",
    ]
    center_frames = [pd.read_csv(csv_path, index_col=[0]) for csv_path in csv_paths]

    # stack all centers and give the window column a readable name
    merged = pd.concat(center_frames).rename(
        columns={"window_Unnamed: 2093_level_1": "window"}
    )

    # keep only the rows of the requested starting window
    selected = merged[merged["window"] == window]

    # column names of the patients table, looked up through the project database
    patient_cols = list(database.get_all_patients().columns)

    # bookkeeping columns that must not end up in the returned table
    helper_cols = [
        "seizure_id_Unnamed: 2091_level_1",
        "window",
        "timestamp_start_Unnamed: 2094_level_1",
    ]

    return selected.drop(columns=patient_cols).drop(columns=helper_cols)

In [23]:
def train_test_val_split(df):
    """Split ``df`` into validation, test and training rows by ``patient_id``.

    The validation and test patient ids are read from the first column of
    ``data/patients/val_patients_1.csv`` and ``data/patients/test_patients_1.csv``;
    every row whose ``patient_id`` is in neither file goes to the training part.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_val, df_test, df_train)`` -- note the order -- each with a fresh
        0..n-1 index.
    """
    val_ids = (
        pd.read_csv("data/patients/val_patients_1.csv", index_col=[0])
        .reset_index(drop=True)
        .iloc[:, 0]
    )
    test_ids = (
        pd.read_csv("data/patients/test_patients_1.csv", index_col=[0])
        .reset_index(drop=True)
        .iloc[:, 0]
    )

    patient_of_row = df["patient_id"]
    in_val = patient_of_row.isin(val_ids)
    in_test = patient_of_row.isin(test_ids)
    # everything that is held out for validation or testing
    held_out = patient_of_row.isin(set(val_ids) | set(test_ids))

    df_val = df[in_val].reset_index(drop=True)
    df_test = df[in_test].reset_index(drop=True)
    df_train = df[~held_out].reset_index(drop=True)
    return df_val, df_test, df_train

In [24]:
def create_splits(df_train, df_test, features, time_slice=0):
    """Extract the model inputs and the label from a train and a test table.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Tables containing a ``seizure`` column and one ``<feature>_<time_slice>``
        column per requested feature.
    features : iterable of str
        Feature base names; columns are selected in this iteration order.
    time_slice : int, default 0
        Suffix appended to every feature name to form the column name.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``; the ``y_*`` parts are
        single-column DataFrames holding ``seizure``.
    """
    # Build the column list once so train and test are guaranteed to use the
    # same columns in the same order (and no eval() on variable names is needed).
    feature_columns = [f"{feature}_{time_slice}" for feature in features]

    X_train = df_train[feature_columns]
    y_train = df_train[["seizure"]]
    X_test = df_test[feature_columns]
    y_test = df_test[["seizure"]]
    return X_train, y_train, X_test, y_test

In [25]:
# fit xgboost model
def fit_and_eval(X_train, y_train, X_test, y_test):
    """Train an XGBoost classifier with fixed hyperparameters and score it.

    Returns
    -------
    tuple
        ``(model, metrics)`` where ``metrics`` is the 5-tuple
        ``(accuracy, confusion_matrix, classification_report,
        recall for label 1, recall for label 0)`` computed on the test data.
    """
    # fixed starting hyperparameters for the binary classifier
    model = XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        max_depth=3,
        n_estimators=400,
        subsample=0.8,          # share of training rows sampled per tree
        colsample_bytree=0.8,   # share of features sampled per tree
        scale_pos_weight=0.5,
    )
    model.fit(X_train, y_train)

    # score the predictions for the held-out rows
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    recall_pos = recall_score(y_test, y_pred)
    recall_neg = recall_score(y_test, y_pred, pos_label=0)

    return model, (accuracy, conf_matrix, report, recall_pos, recall_neg)

In [26]:
# fit with custom params
def fit(X_train, y_train, X_test, y_test, params):
    """Train an XGBoost classifier with caller-supplied hyperparameters.

    Parameters
    ----------
    X_train, y_train
        Data the classifier is fitted on.
    X_test, y_test
        Data the metrics are computed on.
    params : dict
        Keyword arguments passed to ``XGBClassifier``.

    Returns
    -------
    tuple
        ``(model, metrics)`` where ``metrics`` is the 5-tuple
        ``(accuracy, confusion_matrix, classification_report,
        recall for label 1, recall for label 0)``.
    """
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)

    predicted = model.predict(X_test)
    metrics = (
        accuracy_score(y_test, predicted),
        confusion_matrix(y_test, predicted),
        classification_report(y_test, predicted),
        recall_score(y_test, predicted),
        recall_score(y_test, predicted, pos_label=0),
    )
    return model, metrics

In [27]:
# Feature base names; the model columns are "<name>_<time_slice>".
# Kept as an ordered list (not a set): iterating a set of strings depends on
# per-process hash randomization, so the column order handed to the model --
# and with it the column subsampling -- could change between kernel restarts.
features = [
    'avg',
    'csi',
    'csi_filtered',
    'csi_filtered_slope',
    'csi_slope',
    'csim',
    'csim_filtered',
    'csim_filtered_slope',
    'csim_slope',
    'cvi',
    'hf',
    'hr_diff',
    'hr_diff_filtered',
    'hr_diff_filtered_slope',
    'hr_diff_slope',
    'kurt',
    'lf',
    'lf_hf_ratio',
    'mf_coef_center',
    'mf_coef_left',
    'mf_coef_right',
    'mf_hurst_max',
    'nnx',
    'pnnx',
    'quantile_25',
    'quantile_50',
    'quantile_75',
    'rmssd',
    'rmssd_dt',
    'sd',
    'sd1',
    'sd2',
    'skew',
    'total_power',
    'triangular_index',
    'ulf',
    'variance',
    'vlf',
]

In [28]:
# Patient-level column names.
# NOTE(review): presumably mirrors the columns of the patients table that
# load_and_prep_data fetches from the database -- confirm; this list is not
# referenced by the cells visible in this notebook.
patient_columns = [
    'id', 'local_patient_id', 'recording_center',
    'name', 'firstname', 'birthday',
    'sex', 'weight', 'height',
    'comment', 'do_ignore', 'use_for_training',
    'epilepsy_onset',
]

### 2. Load Data

In [29]:
# starting window 0, filter_4 files (the function defaults, spelled out)
df = load_and_prep_data(window=0, data_filter=4)

[creating new connection]


### 3.1 Splitting by Record_id

In [30]:
# record ids of the test split, stored in the CSV column named "0"
test_records = pd.read_csv("data/test_records_1.csv", index_col=[0])
test_records_list = [record_id for record_id in test_records["0"]]

In [31]:
# preview: which rows belong to a test record
df["record_id"].isin(test_records_list)

1        True
3        True
5       False
7       False
9       False
        ...  
1465     True
1467     True
1469     True
1471     True
1473     True
Name: record_id, Length: 3320, dtype: bool

In [32]:
# rows of test records go to the test table, all remaining rows to training
is_test_record = df["record_id"].isin(test_records_list)
df_test = df[is_test_record]
df_train = df[~is_test_record]

In [33]:
# (rows, columns) of the test table, then of the training table
print(df_test.shape, df_train.shape, sep="\n")

(592, 2093)
(2728, 2093)


In [34]:
X_train, y_train, X_test, y_test = create_splits(
    df_train=df_train, df_test=df_test, features=features, time_slice=0
)

### 3.2 GridSearch

In [35]:
def get_best_params(X_train, y_train, cv=3, scoring=None):
    """Grid-search XGBoost hyperparameters and return the best combination.

    Parameters
    ----------
    X_train, y_train
        Training data the cross-validated search is run on.
    cv : int, default 3
        Number of cross-validation folds passed to ``GridSearchCV``.
    scoring : str or callable, optional
        Metric passed to ``GridSearchCV``. ``None`` (default) keeps the
        previous behaviour, i.e. the estimator's own default score.
        NOTE(review): the labels in this notebook are imbalanced; consider
        e.g. ``"recall"`` or ``"f1"`` -- confirm which metric matters.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, so the result can be passed on
        (e.g. as ``params`` to ``fit``) instead of only being printed.
    """
    # hyperparameter grid: 3 * 3 * 3 * 4 = 108 combinations per fold
    param_grid = {
        'objective': ['binary:logistic'],
        'n_estimators': [200, 300, 400],
        'learning_rate': [0.5, 0.1, 0.01],
        'max_depth': [3, 5, 7],
        'scale_pos_weight': [0.1, 0.2, 0.9, 0.5],
    }

    grid_search = GridSearchCV(
        estimator=XGBClassifier(),
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
    )
    grid_search.fit(X_train, y_train)

    # report the winner and its mean cross-validated score
    print("Best Hyperparameters: ", grid_search.best_params_)
    print("Best Score: ", grid_search.best_score_)

    return grid_search.best_params_

In [36]:
get_best_params(X_train=X_train, y_train=y_train)

Best Hyperparameters:  {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'objective': 'binary:logistic', 'scale_pos_weight': 0.1}
Best Score:  0.9574861881792575


### 3.3 Fit and Evaluate XGBoost model

In [37]:
# fit on the record-level training split and score on its test split
model, record_split_metrics = fit_and_eval(X_train, y_train, X_test, y_test)
accuracy, cm, cr, recall_1, recall_0 = record_split_metrics

In [38]:
# performance on test set: accuracy, confusion matrix, per-class report
display(accuracy, cm)
print(cr)

0.8868243243243243

array([[480,  12],
       [ 55,  45]])

              precision    recall  f1-score   support

           0       0.90      0.98      0.93       492
           1       0.79      0.45      0.57       100

    accuracy                           0.89       592
   macro avg       0.84      0.71      0.75       592
weighted avg       0.88      0.89      0.87       592



In [39]:
# performance on test set (same values as shown by the previous cell)
display(accuracy, cm)
print(cr)

0.8868243243243243

array([[480,  12],
       [ 55,  45]])

              precision    recall  f1-score   support

           0       0.90      0.98      0.93       492
           1       0.79      0.45      0.57       100

    accuracy                           0.89       592
   macro avg       0.84      0.71      0.75       592
weighted avg       0.88      0.89      0.87       592



### 4 Splitting by recording center

In [42]:
def _prepare_center_frame(df_raw, patient_cols, helper_cols):
    """Rename the window column, keep window 0 and drop the non-feature columns."""
    df_center = df_raw.rename(columns={"window_Unnamed: 2093_level_1": "window"})
    # filter for one starting window
    df_center = df_center[df_center["window"] == 0]
    # drop patient specific columns and other columns
    return df_center.drop(columns=patient_cols).drop(columns=helper_cols)


# one raw table per recording center (all read from the filter_4 directory)
raw_center_frames = {
    "sandor": pd.read_csv("data/useable_yes/filter_4/sandor_full.csv", index_col=[0]),
    "ukt_movisens": pd.read_csv("data/useable_yes/filter_4/ukt_full.csv", index_col=[0]),
    "freiburg": pd.read_csv("data/useable_yes/filter_4/freiburg_full.csv", index_col=[0]),
    "uka_corvolution": pd.read_csv("data/useable_yes/filter_4/uka_corvolution_full.csv", index_col=[0]),
    "uka_klinik": pd.read_csv("data/useable_yes/filter_4/uka_klinik_full.csv", index_col=[0]),
}

# get patient specific columns
df_patients_columns = database.get_all_patients().columns

# bookkeeping columns removed in addition to the patient columns
columns = ["seizure_id_Unnamed: 2091_level_1",  "window",
        "timestamp_start_Unnamed: 2094_level_1"]

center_frames = {
    center: _prepare_center_frame(df_raw, list(df_patients_columns), columns)
    for center, df_raw in raw_center_frames.items()
}

df_sandor = center_frames["sandor"]
df_ukt_movisens = center_frames["ukt_movisens"]
df_freiburg = center_frames["freiburg"]
df_uka_corvolution = center_frames["uka_corvolution"]
df_uka_klinik = center_frames["uka_klinik"]

[creating new connection]


In [44]:
# the same time-slice-0 feature columns are used for every center
feature_columns = [f"{feature}_0" for feature in features]

X_sandor, y_sandor = df_sandor[feature_columns], df_sandor["seizure"]
X_ukt_movisens, y_ukt_movisens = df_ukt_movisens[feature_columns], df_ukt_movisens["seizure"]
X_freiburg, y_freiburg = df_freiburg[feature_columns], df_freiburg["seizure"]
X_uka_corvolution, y_uka_corvolution = df_uka_corvolution[feature_columns], df_uka_corvolution["seizure"]
X_uka_klinik, y_uka_klinik = df_uka_klinik[feature_columns], df_uka_klinik["seizure"]

In [46]:
# (rows, features) per center: sandor, ukt, freiburg, uka corvolution, uka klinik
for X_center in (X_sandor, X_ukt_movisens, X_freiburg, X_uka_corvolution, X_uka_klinik):
    print(X_center.shape)

(150, 38)
(450, 38)
(1506, 38)
(352, 38)
(737, 38)


In [47]:
# train on freiburg, evaluate on sandor + ukt
X_eval = pd.concat([X_sandor, X_ukt_movisens])
y_eval = pd.concat([y_sandor, y_ukt_movisens])
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_freiburg, y_freiburg, X_eval, y_eval)

print(accuracy, cm, cr, sep="\n")

0.9133333333333333
[[490  10]
 [ 42  58]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       500
           1       0.85      0.58      0.69       100

    accuracy                           0.91       600
   macro avg       0.89      0.78      0.82       600
weighted avg       0.91      0.91      0.91       600



In [48]:
# train on ukt, evaluate on sandor + freiburg
X_eval = pd.concat([X_sandor, X_freiburg])
y_eval = pd.concat([y_sandor, y_freiburg])
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_ukt_movisens, y_ukt_movisens, X_eval, y_eval)

print(accuracy, cm, cr, sep="\n")

0.9190821256038647
[[1328   52]
 [  82  194]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1380
           1       0.79      0.70      0.74       276

    accuracy                           0.92      1656
   macro avg       0.87      0.83      0.85      1656
weighted avg       0.92      0.92      0.92      1656



In [49]:
# train on sandor, evaluate on ukt + freiburg
X_eval = pd.concat([X_ukt_movisens, X_freiburg])
y_eval = pd.concat([y_ukt_movisens, y_freiburg])
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_sandor, y_sandor, X_eval, y_eval)

print(accuracy, cm, cr, sep="\n")

0.9161554192229039
[[1588   42]
 [ 122  204]]
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1630
           1       0.83      0.63      0.71       326

    accuracy                           0.92      1956
   macro avg       0.88      0.80      0.83      1956
weighted avg       0.91      0.92      0.91      1956



In [50]:
# train on sandor, evaluate on the four other centers
X_eval = pd.concat([X_ukt_movisens, X_freiburg, X_uka_klinik, X_uka_corvolution])
y_eval = pd.concat([y_ukt_movisens, y_freiburg, y_uka_klinik, y_uka_corvolution])
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_sandor, y_sandor, X_eval, y_eval)

print(accuracy, cm, cr, sep="\n")

0.9175697865353037
[[2547   73]
 [ 178  247]]
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2620
           1       0.77      0.58      0.66       425

    accuracy                           0.92      3045
   macro avg       0.85      0.78      0.81      3045
weighted avg       0.91      0.92      0.91      3045



In [52]:
# train on ukt, evaluate on the four other centers
X_eval = pd.concat([X_sandor, X_freiburg, X_uka_klinik, X_uka_corvolution])
y_eval = pd.concat([y_sandor, y_freiburg, y_uka_klinik, y_uka_corvolution])
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_ukt_movisens, y_ukt_movisens, X_eval, y_eval)

print(accuracy, cm, cr, sep="\n")

0.9231329690346084
[[2287   83]
 [ 128  247]]
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      2370
           1       0.75      0.66      0.70       375

    accuracy                           0.92      2745
   macro avg       0.85      0.81      0.83      2745
weighted avg       0.92      0.92      0.92      2745



In [53]:
# train on freiburg, evaluate on the four other centers
X_eval = pd.concat([X_ukt_movisens, X_sandor, X_uka_klinik, X_uka_corvolution])
y_eval = pd.concat([y_ukt_movisens, y_sandor, y_uka_klinik, y_uka_corvolution])
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_freiburg, y_freiburg, X_eval, y_eval)

print(accuracy, cm, cr, sep="\n")

0.9277679100059206
[[1457   33]
 [  89  110]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1490
           1       0.77      0.55      0.64       199

    accuracy                           0.93      1689
   macro avg       0.86      0.77      0.80      1689
weighted avg       0.92      0.93      0.92      1689



In [51]:
# train on both uka tables (corvolution + klinik), evaluate on ukt + freiburg + sandor
X_fit = pd.concat([X_uka_corvolution, X_uka_klinik])
y_fit = pd.concat([y_uka_corvolution, y_uka_klinik])
X_eval = pd.concat([X_ukt_movisens, X_freiburg, X_sandor])
y_eval = pd.concat([y_ukt_movisens, y_freiburg, y_sandor])
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_fit, y_fit, X_eval, y_eval)

print(accuracy, cm, cr, sep="\n")

0.9083570750237417
[[1706   49]
 [ 144  207]]
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      1755
           1       0.81      0.59      0.68       351

    accuracy                           0.91      2106
   macro avg       0.87      0.78      0.81      2106
weighted avg       0.90      0.91      0.90      2106

