 # Fit XGBoost model and evaluate on healthy dataset

In this experiment we want to fit an XGBoost Model on data which only consists of recordings of epilepsy patients and then evaluate the model performance on a data set which only consists of recordings of healthy people in their daily life.

In [2]:
import pandas as pd
import seaborn as sns
import shap
from matplotlib import pyplot as plt
import monipy.utils.database as database
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### 1. Helping Functions

In [3]:
def load_and_prep_data(window=0, data_filter=4):
    # load data
    df = pd.concat([pd.read_csv(f"data/useable_yes_unknown/filter_{data_filter}/sandor_full.csv", index_col=[0]),
              pd.read_csv(f"data/useable_yes/filter_{data_filter}/ukt_full.csv", index_col=[0]),
              pd.read_csv(f"data/useable_yes/filter_{data_filter}/freiburg_full.csv", index_col=[0]),
              pd.read_csv(f"data/useable_yes/filter_{data_filter}/uka_corvolution_full.csv", index_col=[0]),
              pd.read_csv(f"data/useable_yes/filter_{data_filter}/uka_klinik_full.csv", index_col=[0]),
             ])
    
    # rename column
    df = df.rename(columns={"window_Unnamed: 2093_level_1": "window"})
    
    # filter for one starting window
    df = df[df["window"] == window]
    
    # get patient specific columns
    df_patients_columns = database.get_all_patients().columns
    
    # drop patient specific columns and other columns
    df = df.drop(columns = list(df_patients_columns))

    columns = ["seizure_id_Unnamed: 2091_level_1", 
           "record_id", "window", 
           "timestamp_start_Unnamed: 2094_level_1"]
    df = df.drop(columns=columns)

    return df

In [4]:
def train_test_val_split(df):
    # split the data val, test, train
    val_patients = pd.read_csv("data/patients/val_patients_1.csv", index_col=[0]).reset_index(drop=True)
    test_patients = pd.read_csv("data/patients/test_patients_1.csv", index_col=[0]).reset_index(drop=True)
    
    df_val = df[df["patient_id"].isin(val_patients.iloc[:, 0])].reset_index(drop=True)
    df_test = df[df["patient_id"].isin(test_patients.iloc[:, 0])].reset_index(drop=True)
    df_train = df[~(df["patient_id"].isin(set(val_patients.iloc[:, 0]) | set(test_patients.iloc[:, 0])))].reset_index(drop=True)
    return df_val, df_test, df_train

In [5]:
def create_splits(df_train, df_test, df_val, features, time_slice=0):
    data_split = {}
    for split in ["val", "test", "train"]:
        data_split[f"X_{split}"] = eval(f"df_{split}")[[f"{feature}_{time_slice}" for feature in features]]
        data_split[f"y_{split}"] = eval(f"df_{split}")[["seizure"]]
    return data_split["X_train"], data_split["y_train"], data_split["X_test"], data_split["y_test"], data_split["X_val"], data_split["y_val"]

In [6]:
# fit xgboost model
def fit_and_eval(X_train, y_train, X_test, y_test):
    # Define initial hyperparameters
    params = {
        'objective': 'binary:logistic',  # Binary classification objective
        'learning_rate': 0.1,  # Learning rate
        'max_depth': 7, # Maximum depth of each tree
        'n_estimators': 300,
        'subsample': 0.8,  # Subsample ratio of the training instances
        'colsample_bytree': 0.8,  # Subsample ratio of features when constructing each tree
        'scale_pos_weight': 0.9,
    }

    # Create the XGBoost classifier
    model = XGBClassifier(**params)

    # Train the XGBoost model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # return evaluation
    return model, (accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred),recall_score(y_test, y_pred),recall_score(y_test, y_pred, pos_label=0))

In [7]:
features = {'avg',
 'csi',
 'csi_filtered',
 'csi_filtered_slope',
 'csi_slope',
 'csim',
 'csim_filtered',
 'csim_filtered_slope',
 'csim_slope',
 'cvi',
 'hf',
 'hr_diff',
 'hr_diff_filtered',
 'hr_diff_filtered_slope',
 'hr_diff_slope',
 'kurt',
 'lf',
 'lf_hf_ratio',
 'mf_coef_center',
 'mf_coef_left',
 'mf_coef_right',
 'mf_hurst_max',
 'nnx',
 'pnnx',
 'quantile_25',
 'quantile_50',
 'quantile_75',
 'rmssd',
 'rmssd_dt',
 'sd',
 'sd1',
 'sd2',
 'skew',
 'total_power',
 'triangular_index',
 'ulf',
 'variance',
 'vlf'}

### 2. Load Data

In [8]:
df = load_and_prep_data()
print(df.shape)
df_val, df_test, df_train = train_test_val_split(df)

[creating new connection]
(2106, 2092)


### 3. Fit and Evaluate XGBoost model

In [9]:
x = []
for i in range(0,5,1):
    x = x + [f"{feature}_{i}" for feature in features]

In [10]:
X_train = df_train[x]
y_train = df_train.seizure

X_test = df_test[x]
y_test = df_test.seizure

X_val = df_val[x]
y_val = df_val.seizure

In [11]:
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_train, y_train, X_test, y_test)

In [12]:
# performance on test set
display(accuracy)
display(cm)
print(cr)

0.9673202614379085

array([[266,   3],
       [  7,  30]])

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       269
           1       0.91      0.81      0.86        37

    accuracy                           0.97       306
   macro avg       0.94      0.90      0.92       306
weighted avg       0.97      0.97      0.97       306



In [13]:
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_train, y_train, pd.concat([X_test, X_val]), pd.concat([y_test, y_val]))

In [14]:
# performance on test set
display(accuracy)
display(cm)
print(cr)

0.937592867756315

array([[528,   9],
       [ 33, 103]])

              precision    recall  f1-score   support

           0       0.94      0.98      0.96       537
           1       0.92      0.76      0.83       136

    accuracy                           0.94       673
   macro avg       0.93      0.87      0.90       673
weighted avg       0.94      0.94      0.94       673



In [15]:
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval( pd.concat([X_train, X_test, X_val]),  pd.concat([y_train, y_test, y_val]), pd.concat([X_test, X_val]), pd.concat([y_test, y_val]))

In [17]:


# read data
df_alltag_corvolution = pd.read_csv(f"data/useable_yes/alltag/alltag_corvolution_regular_data.csv", index_col=[0], header=[0,1])
df_alltag_movisens = pd.read_csv(f"data/useable_yes/alltag/alltag_movisens_regular_data.csv", index_col=[0], header=[0,1])


df = pd.concat([df_alltag_corvolution, df_alltag_movisens])
df.columns = df.columns.map("_".join)

# rename column
df = df.rename(columns={"record_id_Unnamed: 2092_level_1": "record_id"})

# add label column
df["seizure"] = 0

# rename column
df = df.rename(columns={"window_Unnamed: 2093_level_1": "window"})


columns = ["seizure_id_Unnamed: 2091_level_1", 
        "record_id", "window", 
        "timestamp_start_Unnamed: 2094_level_1"]
df = df.drop(columns=columns)

In [18]:
X = df[x]
y = df["seizure"]

In [19]:
# Predict on the test set
y_pred = model.predict(X)

# return evaluation
print(accuracy_score(y, y_pred))
print(confusion_matrix(y, y_pred))
print(classification_report(y, y_pred))

0.948
[[1896  104]
 [   0    0]]
              precision    recall  f1-score   support

           0       1.00      0.95      0.97      2000
           1       0.00      0.00      0.00         0

    accuracy                           0.95      2000
   macro avg       0.50      0.47      0.49      2000
weighted avg       1.00      0.95      0.97      2000



Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
