# Fit and Evaluate XGBoost Model to achieve best possibel performance

In this experiment we want to bring all the knowledge that we gained in the previous experiments together to fit a model and achieve the best possible performance.

In [30]:
import pandas as pd 
from matplotlib import pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
import shap

### 1. Helping Functions

In [31]:
def train_test_val_split(df):
    # split the data val, test, train
    val_patients = pd.read_csv("data/patients/val_patients_1.csv", index_col=[0]).reset_index(drop=True)
    test_patients = pd.read_csv("data/patients/test_patients_1.csv", index_col=[0]).reset_index(drop=True)
    
    df_val = df[df["patient_id"].isin(val_patients.iloc[:, 0])].reset_index(drop=True)
    df_test = df[df["patient_id"].isin(test_patients.iloc[:, 0])].reset_index(drop=True)
    df_train = df[~(df["patient_id"].isin(set(val_patients.iloc[:, 0]) | set(test_patients.iloc[:, 0])))].reset_index(drop=True)
    return df_val, df_test, df_train

In [32]:
def create_splits(df_train, df_test, df_val, features, patient_features=[], time_slice=0):
    data_split = {}
    for split in ["val", "test", "train"]:
        
        data_split[f"X_{split}"] = eval(f"df_{split}")[[f"{feature}_{time_slice}" for feature in features] + patient_features]
        data_split[f"y_{split}"] = eval(f"df_{split}")[["seizure"]]
    return data_split["X_train"], data_split["y_train"], data_split["X_test"], data_split["y_test"], data_split["X_val"], data_split["y_val"]

In [33]:
def get_best_params(X_train, y_train):
    model = XGBClassifier()

    # Define the hyperparameter grid for grid search
    param_grid = {
        'objective': ['binary:logistic'],
        'n_estimators': [200, 300, 400],
        'learning_rate': [0.5, 0.1, 0.01],
        'max_depth': [3, 5, 7],
        'scale_pos_weight':[0.1, 0.2, 0.9, 0.5],
    }

    # Create the GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)

    # Fit the data to perform grid search
    grid_search.fit(X_train, y_train)  # X_train and y_train are your training data

    # Print the best hyperparameters and the corresponding score
    print("Best Hyperparameters: ", grid_search.best_params_)
    print("Best Score: ", grid_search.best_score_)

In [53]:
# fit xgboost model
def fit_and_eval(X_train, y_train, X_test, y_test):
    # Define initial hyperparameters
    params = {
        'objective': 'binary:logistic',  # Binary classification objective
        'learning_rate': 0.1,  # Learning rate
        'max_depth': 7, # Maximum depth of each tree
        'n_estimators': 300,
        #'subsample': 0.8,  # Subsample ratio of the training instances
        'colsample_bytree': 0.8,  # Subsample ratio of features when constructing each tree
        'scale_pos_weight': 0.9,
    }

    # Create the XGBoost classifier
    model = XGBClassifier(**params)

    # Train the XGBoost model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # return evaluation
    return model, (accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred),recall_score(y_test, y_pred),recall_score(y_test, y_pred, pos_label=0))

In [157]:
def predict_with_threshold(model, X_test, threshold):
    y_pred_proba = model.predict_proba(X_test)
    prediction = pd.DataFrame(y_pred_proba)
    prediction["label"] = prediction.apply(lambda x: 1 if x[1] >= threshold else 0, axis=1)

    return prediction.label

In [35]:
features = {'avg',
 'csi',
 'csi_filtered',
 'csi_filtered_slope',
 'csi_slope',
 'csim',
 'csim_filtered',
 'csim_filtered_slope',
 'csim_slope',
 'cvi',
 'hf',
 'hr_diff',
 'hr_diff_filtered',
 'hr_diff_filtered_slope',
 'hr_diff_slope',
 'kurt',
 'lf',
 'lf_hf_ratio',
 'mf_coef_center',
 'mf_coef_left',
 'mf_coef_right',
 'mf_hurst_max',
 'nnx',
 'pnnx',
 'quantile_25',
 'quantile_50',
 'quantile_75',
 'rmssd',
 'rmssd_dt',
 'sd',
 'sd1',
 'sd2',
 'skew',
 'total_power',
 'triangular_index',
 'ulf',
 'variance',
 'vlf'
 }

## 2. Modelling using patient data

### 2.1 Using Age and Sex

In [73]:
# load data
df = pd.read_csv("prep_patient_data.csv", index_col=[0])

# filter out birthday of "1900-01-01"
df = df[~(df["birthday"] == "1900-01-01")]

# rename column
df = df.rename(columns={"window_Unnamed: 2093_level_1": "window"})
    
# filter for a starting window of zero
df = df[df["window"] == -18]

# get dummy variables for sex
df = pd.concat([pd.get_dummies(df.sex), df], axis=1).drop(columns="sex")


Columns (2097,2099,2100,2105) have mixed types. Specify dtype option on import or set low_memory=False.


In [74]:
# train test val split (only considere first timeslice)
df_val, df_test, df_train = train_test_val_split(df)

In [75]:
x = []
for i in range(14,29,1):
    x = x + [f"{feature}_{i}" for feature in features]

x += ["age", "f", "m", "u"]

In [76]:
X_train = df_train[x]
y_train = df_train.seizure

X_test = df_test[x]
y_test = df_test.seizure

X_val = df_val[x]
y_val = df_val.seizure

In [77]:
# get best model parameters
#get_best_params(X_train, y_train)

In [78]:
y_test.value_counts()

seizure
0    287
1    115
Name: count, dtype: int64

In [186]:
# fit model
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_train, y_train, pd.concat([X_test, X_val]), pd.concat([y_test, y_val]))

In [187]:
cm

array([[637,   4],
       [ 49, 145]])

In [188]:
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.99      0.96       641
           1       0.97      0.75      0.85       194

    accuracy                           0.94       835
   macro avg       0.95      0.87      0.90       835
weighted avg       0.94      0.94      0.93       835



##### Try a different prediction threshold

In [189]:
y_pred_2 = predict_with_threshold(model, pd.concat([X_test, X_val]), 0.007)

In [190]:
confusion_matrix(pd.concat([y_test, y_val]), y_pred_2)

array([[599,  42],
       [ 12, 182]])

In [191]:
print(classification_report(pd.concat([y_test, y_val]), y_pred_2))

              precision    recall  f1-score   support

           0       0.98      0.93      0.96       641
           1       0.81      0.94      0.87       194

    accuracy                           0.94       835
   macro avg       0.90      0.94      0.91       835
weighted avg       0.94      0.94      0.94       835



### 2.5 Agerange instead of age as feature

In [97]:
# load data
df = pd.read_csv("prep_patient_data.csv", index_col=[0])

# filter out birthday of "1900-01-01"
df = df[~(df["birthday"] == "1900-01-01")]

# rename column
df = df.rename(columns={"window_Unnamed: 2093_level_1": "window"})
    
# filter for a starting window of zero
df = df[df["window"] == -18]
    
age_ranges = list(range(0, 101, 5))

df['agerange'] = pd.cut(df.age, age_ranges,labels=False, include_lowest = True)
# get dummy variables for sex
df = pd.concat([pd.get_dummies(df.sex), df], axis=1).drop(columns="sex")

Columns (2097,2099,2100,2105) have mixed types. Specify dtype option on import or set low_memory=False.


In [98]:
x = []
for i in range(14,29,1):
    x = x + [f"{feature}_{i}" for feature in features]

x += ["age", "f", "m", "u"]

In [99]:
X_train = df_train[x]
y_train = df_train.seizure

X_test = df_test[x]
y_test = df_test.seizure

X_val = df_val[x]
y_val = df_val.seizure

In [192]:
# fit model
model, (accuracy, cm, cr, recall_1, recall_0) = fit_and_eval(X_train, y_train, pd.concat([X_test, X_val]), pd.concat([y_test, y_val]))

In [193]:
cm

array([[637,   4],
       [ 49, 145]])

In [194]:
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.99      0.96       641
           1       0.97      0.75      0.85       194

    accuracy                           0.94       835
   macro avg       0.95      0.87      0.90       835
weighted avg       0.94      0.94      0.93       835



##### Try a different prediction threshold

In [204]:
y_pred_2 = predict_with_threshold(model, pd.concat([X_test, X_val]), 0.007)

In [205]:
confusion_matrix(pd.concat([y_test, y_val]), y_pred_2)

array([[599,  42],
       [ 12, 182]])

In [206]:
print(classification_report(pd.concat([y_test, y_val]), y_pred_2))

              precision    recall  f1-score   support

           0       0.98      0.93      0.96       641
           1       0.81      0.94      0.87       194

    accuracy                           0.94       835
   macro avg       0.90      0.94      0.91       835
weighted avg       0.94      0.94      0.94       835

