# XGBoost Grid Search + Training

From our paper on "Explainable Prediction of Acute Myocardial Infarction using Machine Learning and Shapley Values"

In [1]:
# Import libraries
import pandas as pd
import numpy as np
# !pip install xgboost
import xgboost
import time
import pickle
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Loading Data

In [2]:
# Load data
# Import train and test data into dataframes from csv files produced using the data processing code
df_train = pd.read_csv("train.csv", header=None)
df_train = df_train.sample(frac=1)
df_test = pd.read_csv("test.csv", header=None)

In [3]:
# Get data from dataframes
train_y = np.array(df_train[11].values).astype(np.int8)
train_x = np.array(df_train[list(range(11))].values)
test_y = np.array(df_test[11].values).astype(np.int8)
test_x = np.array(df_test[list(range(11))].values)

In [4]:
# Check the MI distribution in the training set
unique, counts = np.unique(train_y, return_counts=True)
dict(zip(unique, counts))

{0: 275553, 1: 137748}

In [5]:
# Check the MI distribution in the testing set
unique, counts = np.unique(test_y, return_counts=True)
dict(zip(unique, counts))

{0: 68865, 1: 34461}

## Grid search for model optimization

In [8]:
# Grid Search to optimize the model 
parameters = {'booster':('gbtree', 'gblinear', 'dart'), 'learning_rate':[0.01, 0.1, 1], 'n_estimators':[10,50,100]}

XGB = XGBClassifier(random_state=0)
clf = GridSearchCV(XGB, parameters)
clf.fit(train_x, train_y)
sorted(clf.cv_results_.keys())
print(clf.cv_results_)

{'mean_fit_time': array([  3.81230502,  17.64656587,  34.78991694,   3.93534751,
        18.19835405,  34.41472707,   3.97846346,  17.3065372 ,
        36.62668157,   1.11416049,   3.58786511,   6.5538332 ,
         1.08523765,   3.42513351,   6.05989079,   1.07127919,
         3.26071095,   6.10720263,   4.55516753,  52.07321897,
       107.62512918,   4.532336  ,  44.13014774, 107.15109601,
         4.87478499,  34.82766013, 105.33576269]), 'std_fit_time': array([1.83990431e-01, 2.93281273e-01, 5.53717809e-01, 5.13285522e-02,
       4.01229314e-01, 8.66749380e-01, 6.17965833e-02, 2.11800998e-01,
       9.30618414e-01, 2.59773304e-02, 7.48883704e-02, 9.73116038e-02,
       4.40765446e-02, 2.88515801e-01, 5.80355354e-02, 2.00449781e-02,
       6.31460499e-02, 1.02126040e-01, 1.64993306e-02, 3.28945629e+01,
       3.29363494e+00, 1.25508262e-01, 1.68903778e+01, 1.87152630e+00,
       2.85173732e-01, 9.98194392e-01, 1.85065396e+00]), 'mean_score_time': array([0.06700416, 0.10015106, 0.13

In [8]:
# Obtain the parameters for the best model 
clf.best_estimator_

## Model Training 

In [9]:
# Train the XGBoost model with the optimal parameters 
model = XGBClassifier(learning_rate = 1)
model.fit(train_x, train_y)

In [10]:
# Save model to file
pickle.dump(model, open("xgboost_ecgview.model", "wb"))