# Fit and Evaluate XGBoost Model (GridSearch Parameters)

In this experiment we fit an XGBoost model using only the first time slice.

In the first step of this experiment we use GridSearch to find the hyperparameters that yield the best results.

In [7]:
import pandas as pd
import seaborn as sns
import shap
from matplotlib import pyplot as plt
import monipy.utils.database as database
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


### 1. Helper Functions

In [8]:
def load_and_prep_data(window=0, data_filter=4):
    """Load the per-site feature CSVs and keep the rows of one starting window.

    The five site files are concatenated, the rows are restricted to the
    requested ``window``, and both the columns reported by
    ``database.get_all_patients()`` and a few bookkeeping columns are dropped.

    Parameters
    ----------
    window : int, default 0
        Value of the ``window`` column to keep.
    data_filter : int, default 4
        Number inserted into the ``filter_<n>`` directory of every CSV path.

    Returns
    -------
    pandas.DataFrame
        The filtered frame without patient-level and bookkeeping columns.
    """
    # NOTE(review): the sandor file is read from "useable_yes_unknown" while
    # every other site is read from "useable_yes" - confirm this is intended.
    csv_paths = [
        f"data/useable_yes_unknown/filter_{data_filter}/sandor_full.csv",
        f"data/useable_yes/filter_{data_filter}/ukt_full.csv",
        f"data/useable_yes/filter_{data_filter}/freiburg_full.csv",
        f"data/useable_yes/filter_{data_filter}/uka_corvolution_full.csv",
        f"data/useable_yes/filter_{data_filter}/uka_klinik_full.csv",
    ]
    site_frames = [pd.read_csv(path, index_col=[0]) for path in csv_paths]
    merged = pd.concat(site_frames)

    # Give the window column a usable name (the stored name is presumably an
    # artefact of a flattened multi-level header - TODO confirm).
    merged = merged.rename(columns={"window_Unnamed: 2093_level_1": "window"})

    # Keep a single starting window.
    one_window = merged[merged["window"] == window]

    # Columns to discard: everything the patient table provides ...
    patient_columns = list(database.get_all_patients().columns)
    # ... plus identifiers / timestamps that are no longer needed.
    bookkeeping_columns = [
        "seizure_id_Unnamed: 2091_level_1",
        "record_id",
        "window",
        "timestamp_start_Unnamed: 2094_level_1",
    ]

    without_patient_info = one_window.drop(columns=patient_columns)
    return without_patient_info.drop(columns=bookkeeping_columns)

In [45]:
def train_test_val_split(df):
    """Split ``df`` into validation, test and training frames by patient id.

    The validation and test patient ids are read from
    ``data/patients/val_patients_1.csv`` and
    ``data/patients/test_patients_1.csv``. Every row whose ``patient_id`` is
    in neither list goes to the training frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``patient_id`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_val, df_test, df_train)`` - note the order - each with a fresh
        0..n-1 index.
    """
    def _read_patient_ids(path):
        # The first CSV column is an index and is discarded; the first
        # remaining column holds the ids.
        listing = pd.read_csv(path, index_col=[0]).reset_index(drop=True)
        return listing.iloc[:, 0]

    val_ids = _read_patient_ids("data/patients/val_patients_1.csv")
    test_ids = _read_patient_ids("data/patients/test_patients_1.csv")
    held_out_ids = set(val_ids) | set(test_ids)

    patient = df["patient_id"]
    in_val = patient.isin(val_ids)
    in_test = patient.isin(test_ids)
    in_train = ~(patient.isin(held_out_ids))

    df_val = df[in_val].reset_index(drop=True)
    df_test = df[in_test].reset_index(drop=True)
    df_train = df[in_train].reset_index(drop=True)
    return df_val, df_test, df_train

In [10]:
def create_splits(df_train, df_test, df_val, features, time_slice=0):
    """Build feature matrices and label frames for the three data splits.

    Parameters
    ----------
    df_train, df_test, df_val : pandas.DataFrame
        Split frames. Each must contain a ``seizure`` column and one
        ``<feature>_<time_slice>`` column per requested feature.
    features : iterable of str
        Base feature names (without the time-slice suffix). A ``set`` or
        ``frozenset`` is sorted so the resulting column order is reproducible;
        ordered inputs such as lists keep the caller's order.
    time_slice : int, default 0
        Suffix appended to every feature name to select its column.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_val, y_val)``. Every ``y`` is a
        one-column DataFrame holding ``seizure``.
    """
    # Sets of strings have no stable iteration order across interpreter
    # sessions, which would make the column order differ between kernel
    # restarts. Sorting pins it down.
    if isinstance(features, (set, frozenset)):
        ordered_features = sorted(features)
    else:
        # Materialise once so one-shot iterables work for all three splits.
        ordered_features = list(features)
    feature_columns = [f"{feature}_{time_slice}" for feature in ordered_features]

    # Explicit name -> frame mapping instead of eval() on local variable names.
    frames = {"train": df_train, "test": df_test, "val": df_val}
    X = {name: frame[feature_columns] for name, frame in frames.items()}
    y = {name: frame[["seizure"]] for name, frame in frames.items()}

    return X["train"], y["train"], X["test"], y["test"], X["val"], y["val"]

In [38]:
# fit xgboost model
def fit_and_eval(X_train, y_train, X_test, y_test, params=None):
    """Train an XGBoost classifier and score it on a held-out set.

    Parameters
    ----------
    X_train, y_train
        Features and labels the model is fitted on.
    X_test, y_test
        Features and labels the fitted model is evaluated on.
    params : dict, optional
        Hyperparameters that override or extend the defaults defined below,
        for example the best parameters found by a grid search. ``None``
        (the default) keeps the defaults unchanged.

    Returns
    -------
    tuple
        ``(model, (accuracy, confusion_matrix, classification_report,
        recall_of_class_1, recall_of_class_0))``.
    """
    # Default hyperparameters
    model_params = {
        'objective': 'binary:logistic',  # Binary classification objective
        'learning_rate': 0.1,  # Learning rate
        'max_depth': 7,  # Maximum depth of each tree
        'n_estimators': 300,
        'subsample': 0.8,  # Subsample ratio of the training instances
        'colsample_bytree': 0.8,  # Subsample ratio of features when constructing each tree
        'scale_pos_weight': 0.9,  # Weight of the positive class relative to the negative class
    }
    # Caller-supplied values take precedence over the defaults.
    if params:
        model_params.update(params)

    # Create and train the XGBoost classifier
    model = XGBClassifier(**model_params)
    model.fit(X_train, y_train)

    # Predict on the evaluation set
    y_pred = model.predict(X_test)

    metrics = (
        accuracy_score(y_test, y_pred),
        confusion_matrix(y_test, y_pred),
        classification_report(y_test, y_pred),
        recall_score(y_test, y_pred),               # recall of the positive class (label 1)
        recall_score(y_test, y_pred, pos_label=0),  # recall of the negative class (label 0)
    )
    return model, metrics

In [27]:
# Base names of the model features. create_splits() appends "_<time_slice>"
# to each name to obtain the actual column.
# NOTE: this is a set, so its iteration order is not guaranteed to be the
# same in every kernel session.
features = {
    "avg",
    "csi", "csi_filtered", "csi_filtered_slope", "csi_slope",
    "csim", "csim_filtered", "csim_filtered_slope", "csim_slope",
    "cvi",
    "hf",
    "hr_diff", "hr_diff_filtered", "hr_diff_filtered_slope", "hr_diff_slope",
    "kurt",
    "lf", "lf_hf_ratio",
    "mf_coef_center", "mf_coef_left", "mf_coef_right", "mf_hurst_max",
    "nnx", "pnnx",
    "quantile_25", "quantile_50", "quantile_75",
    "rmssd", "rmssd_dt",
    "sd", "sd1", "sd2",
    "skew",
    "total_power", "triangular_index",
    "ulf", "variance", "vlf",
}

### 2. Load Data

In [46]:
# Load starting window 0 with data filter 4 (the function defaults), report
# the frame size, then split the rows by patient list.
df = load_and_prep_data(window=0, data_filter=4)
print(df.shape)
df_val, df_test, df_train = train_test_val_split(df)

[creating new connection]
(3320, 2092)


### 3. GridSearch

In [20]:
def get_best_params(X_train, y_train, param_grid=None, cv=3, scoring=None):
    """Grid-search XGBoost hyperparameters with cross-validation.

    Parameters
    ----------
    X_train, y_train
        Features and labels the cross-validated search is run on.
    param_grid : dict, optional
        Mapping of parameter name to the list of values to try. ``None``
        (the default) uses the grid defined below.
    cv : int, default 3
        Number of cross-validation folds passed to ``GridSearchCV``.
    scoring : str or callable, optional
        Scoring passed to ``GridSearchCV``. ``None`` (the default) keeps the
        estimator's own score.

    Returns
    -------
    dict
        The best parameter combination found. It is also printed, together
        with its mean cross-validated score.
    """
    if param_grid is None:
        # Default hyperparameter grid
        param_grid = {
            'objective': ['binary:logistic'],
            'n_estimators': [200, 300, 400],
            'learning_rate': [0.5, 0.1, 0.01],
            'max_depth': [3, 5, 7],
            'scale_pos_weight': [0.1, 0.2, 0.9, 0.5],
        }

    # Create the GridSearchCV object and run the search
    grid_search = GridSearchCV(
        estimator=XGBClassifier(),
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
    )
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and the corresponding score
    print("Best Hyperparameters: ", grid_search.best_params_)
    print("Best Score: ", grid_search.best_score_)

    # Return the result so callers can reuse it instead of copying it by hand.
    return grid_search.best_params_

In [21]:
# Build the time-slice-0 feature matrices and run the grid search.
# NOTE(review): the search is run on the validation split rather than on the
# training split - confirm this is intended.
X_train, y_train, X_test, y_test, X_val, y_val = create_splits(
    df_train, df_test, df_val, features, time_slice=0
)
get_best_params(X_val, y_val)

Best Hyperparameters:  {'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 200, 'objective': 'binary:logistic', 'scale_pos_weight': 0.2}
Best Score:  0.9514792482104446


### 4. Fit and Evaluate XGBoost model

In [47]:
# Feature matrices and labels for time slice 0.
X_train, y_train, X_test, y_test, X_val, y_val = create_splits(
    df_train, df_test, df_val, features, time_slice=0
)

In [48]:
# Fit on the training split; evaluate on the validation and test rows combined.
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(
    X_train,
    y_train,
    pd.concat([X_val, X_test]),
    pd.concat([y_val, y_test]),
)

In [49]:
# performance on the combined validation + test rows that were passed to
# fit_and_eval in the previous cell
display(accuracy)
display(cm)
print(cr)

0.9293478260869565

array([[713,   9],
       [ 56, 142]])

              precision    recall  f1-score   support

           0       0.93      0.99      0.96       722
           1       0.94      0.72      0.81       198

    accuracy                           0.93       920
   macro avg       0.93      0.85      0.89       920
weighted avg       0.93      0.93      0.93       920



In [230]:
# shows the current accuracy / cm / cr again
# NOTE(review): the saved output of this cell covers 411 rows, unlike the
# 920-row evaluation above - presumably a leftover from an earlier run on the
# test split alone; confirm, then re-run or remove this cell
display(accuracy)
display(cm)
print(cr)

0.9343065693430657

array([[293,   3],
       [ 24,  91]])

              precision    recall  f1-score   support

           0       0.92      0.99      0.96       296
           1       0.97      0.79      0.87       115

    accuracy                           0.93       411
   macro avg       0.95      0.89      0.91       411
weighted avg       0.94      0.93      0.93       411



In [231]:
# Refit on the training and test rows combined; evaluate on the validation split.
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(
    pd.concat([X_train, X_test]),
    pd.concat([y_train, y_test]),
    X_val,
    y_val,
)

In [232]:
# performance on the validation set (the model was fitted on the train + test
# rows in the previous cell)
display(accuracy)
display(cm)
print(cr)

0.9410609037328095

array([[419,   7],
       [ 23,  60]])

              precision    recall  f1-score   support

           0       0.95      0.98      0.97       426
           1       0.90      0.72      0.80        83

    accuracy                           0.94       509
   macro avg       0.92      0.85      0.88       509
weighted avg       0.94      0.94      0.94       509



In [233]:
# performance on the training split; these rows were part of the data the
# model was fitted on, so the scores show how well the model reproduces its
# own training labels rather than how well it generalises
y_pred_train = model.predict(X_train)
print(accuracy_score(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))

1.0
[[2148    0]
 [   0  252]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2148
           1       1.00      1.00      1.00       252

    accuracy                           1.00      2400
   macro avg       1.00      1.00      1.00      2400
weighted avg       1.00      1.00      1.00      2400

