# Fit and Evaluate an XGBoost Model for Different Data Filters

In this experiment we fit an XGBoost model on several datasets, each produced by applying a different data filter, and compare the resulting model performance.

In [1]:
import pandas as pd
import seaborn as sns
import shap
from matplotlib import pyplot as plt
import monipy.utils.database as database
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


### 1. Helper Functions

In [16]:
def load_and_prep_data(window=0, data_filter=4):
    """Load the feature tables of one data filter and keep a single window.

    Reads the five ``*_full.csv`` files under
    ``data/useable_yes/filter_{data_filter}/``, stacks them, keeps only the
    rows whose ``window`` value equals ``window`` and drops the columns that
    are not needed downstream.

    Parameters
    ----------
    window : int, default 0
        Value of the ``window`` column to keep.
    data_filter : int, default 4
        Selects the ``filter_{data_filter}`` directory to read from.

    Returns
    -------
    pandas.DataFrame
        The filtered rows without the columns reported by
        ``database.get_all_patients()`` and without the bookkeeping columns
        listed below.
    """
    # Read every source file of the requested filter and stack them.
    file_stems = ["sandor", "ukt", "freiburg", "uka_corvolution", "uka_klinik"]
    tables = [
        pd.read_csv(f"data/useable_yes/filter_{data_filter}/{stem}_full.csv", index_col=[0])
        for stem in file_stems
    ]
    stacked = pd.concat(tables)

    # The window column carries an auto-generated header name in the CSVs;
    # give it a readable one before filtering on it.
    stacked = stacked.rename(columns={"window_Unnamed: 2093_level_1": "window"})
    selected = stacked[stacked["window"] == window]

    # Columns that also exist in the patients table are removed.
    patient_columns = list(database.get_all_patients().columns)
    selected = selected.drop(columns=patient_columns)

    # Remaining bookkeeping columns that are removed as well.
    bookkeeping_columns = [
        "seizure_id_Unnamed: 2091_level_1",
        "record_id",
        "window",
        "timestamp_start_Unnamed: 2094_level_1",
    ]
    return selected.drop(columns=bookkeeping_columns)

In [17]:
def train_test_val_split(df):
    """Partition ``df`` into validation, test and training frames by patient.

    The validation and test patient ids are read from
    ``data/patients/val_patients_1.csv`` and
    ``data/patients/test_patients_1.csv`` (first data column of each file).
    Every row whose ``patient_id`` is in neither list goes to the training
    frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_val, df_test, df_train)`` -- note the order: validation first,
        training last. Each frame has a fresh 0..n-1 index.
    """
    def _read_patient_ids(path):
        # First data column of the file holds the ids.
        return pd.read_csv(path, index_col=[0]).reset_index(drop=True).iloc[:, 0]

    val_ids = _read_patient_ids("data/patients/val_patients_1.csv")
    test_ids = _read_patient_ids("data/patients/test_patients_1.csv")

    patient_ids = df["patient_id"]
    in_val = patient_ids.isin(val_ids)
    in_test = patient_ids.isin(test_ids)
    held_out = patient_ids.isin(set(val_ids) | set(test_ids))

    return (
        df[in_val].reset_index(drop=True),
        df[in_test].reset_index(drop=True),
        df[~held_out].reset_index(drop=True),
    )

In [18]:
def create_splits(df_train, df_test, df_val, features, time_slice=0):
    """Extract model inputs and targets from the train/test/val frames.

    Parameters
    ----------
    df_train, df_test, df_val : pandas.DataFrame
        Frames that each contain a ``seizure`` column and one column named
        ``f"{feature}_{time_slice}"`` per requested feature.
    features : iterable of str
        Base feature names. A ``set``/``frozenset`` is sorted alphabetically
        so that the resulting column order is the same on every run; any
        other iterable (list, tuple, ...) keeps the order it was given in.
    time_slice : int, default 0
        Suffix appended to every feature name to build the column name.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_val, y_val)``. Each ``X_*`` is
        a DataFrame with the selected feature columns, each ``y_*`` a
        single-column DataFrame holding ``seizure``.
    """
    # Iterating a set of strings yields a different order in every interpreter
    # session (string hash randomisation), which would make the column order
    # of the model input non-reproducible. Fix the order for set inputs.
    if isinstance(features, (set, frozenset)):
        ordered_features = sorted(features)
    else:
        ordered_features = list(features)
    feature_columns = [f"{feature}_{time_slice}" for feature in ordered_features]

    # Explicit name -> frame mapping instead of eval() on local variable names.
    frames = {"val": df_val, "test": df_test, "train": df_train}

    data_split = {}
    for split, frame in frames.items():
        data_split[f"X_{split}"] = frame[feature_columns]
        data_split[f"y_{split}"] = frame[["seizure"]]

    return (
        data_split["X_train"], data_split["y_train"],
        data_split["X_test"], data_split["y_test"],
        data_split["X_val"], data_split["y_val"],
    )

In [19]:
# fit xgboost model
def fit_and_eval(X_train, y_train, X_test, y_test):
    """Fit an XGBoost classifier with fixed hyperparameters and score it.

    The classifier is trained on ``(X_train, y_train)`` and its predictions
    for ``X_test`` are compared against ``y_test``.

    Returns
    -------
    tuple
        ``(model, metrics)`` where ``metrics`` is the 5-tuple
        ``(accuracy, confusion_matrix, classification_report,
        recall, recall_for_label_0)``. ``recall`` uses scikit-learn's default
        positive label; ``recall_for_label_0`` is computed with
        ``pos_label=0``.
    """
    classifier = XGBClassifier(
        objective='binary:logistic',  # binary classification objective
        learning_rate=0.1,
        max_depth=7,                  # maximum depth of each tree
        n_estimators=300,
        subsample=0.8,                # subsample ratio of the training instances
        colsample_bytree=0.8,         # subsample ratio of features per tree
        scale_pos_weight=0.9,
    )
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)

    # Metrics are computed in the same order in which they are returned.
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    report = classification_report(y_test, predictions)
    recall_default = recall_score(y_test, predictions)
    recall_label_0 = recall_score(y_test, predictions, pos_label=0)

    return classifier, (accuracy, conf_matrix, report, recall_default, recall_label_0)

In [20]:
# Base names of the model input features. create_splits() appends
# "_{time_slice}" to each name to obtain the actual column name.
features = {
    "avg",
    "csi",
    "csi_filtered",
    "csi_filtered_slope",
    "csi_slope",
    "csim",
    "csim_filtered",
    "csim_filtered_slope",
    "csim_slope",
    "cvi",
    "hf",
    "hr_diff",
    "hr_diff_filtered",
    "hr_diff_filtered_slope",
    "hr_diff_slope",
    "kurt",
    "lf",
    "lf_hf_ratio",
    "mf_coef_center",
    "mf_coef_left",
    "mf_coef_right",
    "mf_hurst_max",
    "nnx",
    "pnnx",
    "quantile_25",
    "quantile_50",
    "quantile_75",
    "rmssd",
    "rmssd_dt",
    "sd",
    "sd1",
    "sd2",
    "skew",
    "total_power",
    "triangular_index",
    "ulf",
    "variance",
    "vlf",
}

### 2. Load Data

In [21]:
# Load the data with the function defaults (window=0, data_filter=4) and
# print the shape as a quick sanity check.
df = load_and_prep_data()
print(df.shape)
# train_test_val_split returns the frames in the order val, test, train.
df_val, df_test, df_train = train_test_val_split(df)

[creating new connection]
(3195, 2092)


### 3. Test model performance for different data filters

In [23]:
# Collect one metrics record per data filter; turned into a DataFrame afterwards.
performance = []
# Iterate over data filters 1 to 4 (upper bound of range is exclusive).
for i in range(1,5,1):
    # NOTE: df, df_val, df_test, df_train (and model, X_*, y_*) are rebound on
    # every iteration; after the loop they hold the values of the last filter.
    df = load_and_prep_data(data_filter=i)
    df_val, df_test, df_train = train_test_val_split(df)
    # Features are taken at time slice 0.
    X_train, y_train, X_test, y_test, X_val, y_val = create_splits(df_train, df_test, df_val, features, 0)
    # The model is evaluated on the test split; X_val / y_val are not used here.
    model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_train, y_train, X_test, y_test)
    
    performance.append({
        "filter": i,
        "accuracy": accuracy,
        "recall_1": recall_1,
        "recall_0": recall_0,
        "cm": cm,  # confusion matrix
        "cr": cr,  # classification report (text)
    })

[creating new connection]


Columns (2105) have mixed types. Specify dtype option on import or set low_memory=False.
Columns (2105) have mixed types. Specify dtype option on import or set low_memory=False.


[creating new connection]


Columns (2105) have mixed types. Specify dtype option on import or set low_memory=False.
Columns (2105) have mixed types. Specify dtype option on import or set low_memory=False.


[creating new connection]
[creating new connection]


In [24]:
# One row per data filter, one column per key of the collected records.
df_performance = pd.DataFrame(performance)

In [25]:
# Display the per-filter metrics table.
df_performance 

Unnamed: 0,filter,accuracy,recall_1,recall_0,cm,cr
0,1,0.958124,0.727273,0.992308,"[[516, 4], [21, 56]]",precision recall f1-score ...
1,2,0.919105,0.568421,0.987654,"[[480, 6], [41, 54]]",precision recall f1-score ...
2,3,0.935433,0.635294,0.981818,"[[540, 10], [31, 54]]",precision recall f1-score ...
3,4,0.945274,0.817391,0.996516,"[[286, 1], [21, 94]]",precision recall f1-score ...


In [26]:
# Full classification report of the first row (row label 0).
print(df_performance.loc[0, "cr"]) # filter 1

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       520
           1       0.93      0.73      0.82        77

    accuracy                           0.96       597
   macro avg       0.95      0.86      0.90       597
weighted avg       0.96      0.96      0.96       597



In [27]:
# Full classification report of the last row (row label 3).
print(df_performance.loc[3, "cr"]) # filter 4

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       287
           1       0.99      0.82      0.90       115

    accuracy                           0.95       402
   macro avg       0.96      0.91      0.93       402
weighted avg       0.95      0.95      0.94       402

