# XGBoost

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import recall_score
from sklearn.model_selection import RandomizedSearchCV
from timeit import default_timer
import datetime
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, f1_score
from imblearn.over_sampling import SMOTE
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval
from sklearn.model_selection import PredefinedSplit
#from sklearn.model_selection import cross_val_score




print(f"Notebook last run (end-to-end): {datetime.datetime.now()}")

Notebook last run (end-to-end): 2024-03-10 10:21:28.702975


In [2]:
SEED_ORIG = 2024
SEED = SEED_ORIG
NUM_SIMS = 10

# Hyperparameter tuning with SMOTE and Hyperopt

In [3]:
df = pd.read_csv("../data/HTRU_2.csv",sep=",",header=None)
labels = df[8]
df = df.drop([8],axis=1)
df["Labels"] = labels
df["Labels"].value_counts()

0    16259
1     1639
Name: Labels, dtype: int64

In [4]:
X_train_, X_test = train_test_split(df, test_size=0.1, random_state=SEED_ORIG) # Splitting the test set
y_train_ = X_train_["Labels"]
X_train_ = X_train_.drop("Labels",axis=1)

In [5]:
sm = SMOTE(random_state=SEED_ORIG)
X_resampled,y_resampled= sm.fit_resample(X_train_, y_train_)
len(X_resampled), len(y_resampled)

(29282, 29282)

In [6]:
X_resampled["Labels"] = y_resampled

In [7]:
X_train, X_valid = train_test_split(X_resampled, test_size=0.1, random_state=SEED_ORIG) 
cols = X_train.columns[:-1]


In [8]:
len(X_train), len(X_valid), len(X_test)


(26353, 2929, 1790)

In [9]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,Labels
23647,88.252228,44.740535,1.334553,2.606667,8.658360,38.102912,4.774594,22.500578,1
15502,146.093750,46.782148,-0.182414,0.083016,1.823579,18.376583,10.816643,120.463003,0
18633,35.182888,37.299263,3.901432,16.401026,60.909358,69.969228,1.218334,0.519661,1
15513,114.156250,41.541289,0.577129,1.516991,1.039298,12.752767,14.106027,217.465501,0
25596,19.055439,35.070892,5.206158,27.650399,122.016522,74.897721,-0.105256,-1.007368,1
...,...,...,...,...,...,...,...,...,...
14875,126.125000,42.971284,-0.059258,0.265109,2.623746,19.325817,8.561876,80.169221,0
2688,113.398438,40.149397,0.322763,0.532786,2.985786,16.622323,8.176916,83.887394,0
19040,69.889086,32.188954,3.004694,14.108755,16.239157,47.742230,3.309714,10.573754,1
19962,65.243681,38.455560,2.207688,7.442995,37.377171,68.128617,1.624815,1.260027,1


In [10]:
X_train["Labels"].value_counts(), X_valid["Labels"].value_counts(), X_test["Labels"].value_counts()

(1    13189
 0    13164
 Name: Labels, dtype: int64,
 0    1477
 1    1452
 Name: Labels, dtype: int64,
 0    1618
 1     172
 Name: Labels, dtype: int64)

In [11]:
y_train = X_train["Labels"]
y_valid = X_valid["Labels"]
y_test = X_test["Labels"]
X_train = X_train.drop("Labels",axis=1)
X_valid = X_valid.drop("Labels",axis=1)
X_test = X_test.drop("Labels",axis=1)


In [12]:
len(X_train), len(X_valid)

(26353, 2929)

In [13]:
X_train_ = pd.concat([X_train,X_valid],axis=0,ignore_index=True)

In [14]:
y_train_ = pd.concat([y_train,y_valid],axis=0,ignore_index=True)

In [15]:
len(y_train_)

29282

In [16]:
# Creating the validation fold
valid_fold = np.concatenate([
    -np.ones(len(X_train)), # Training set
    np.zeros(len(X_valid)) # Validation set
])

In [17]:
valid_fold[:10], valid_fold[-10:]

(array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [18]:
ps = PredefinedSplit(test_fold=valid_fold)

In [19]:
#!pip install hyperopt

In [20]:
# Space
space = {
    'learning_rate': hp.choice('learning_rate', [0.0001,0.001, 0.01, 0.1, 1]),
    'max_depth' : hp.choice('max_depth', range(3,21,3)),
    'gamma' : hp.choice('gamma', [i/10.0 for i in range(0,10)]),
    'colsample_bytree' : hp.choice('colsample_bytree', [i/10.0 for i in range(3,10)]),
    'reg_alpha' : hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'reg_lambda' : hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100])
}

In [21]:
# Objective function
def objective(params):

    xgboost = XGBClassifier(seed=SEED, **params)
    """
    score = cross_val_score(estimator=xgboost,
                            X=X_train_,
                            y=y_train_,
                            cv=ps,
                            scoring='f1',
                            n_jobs=-1).mean()
    """
    xgboost.fit(X_train,y_train)
    y_pred = xgboost.predict(X_valid)
    score = f1_score(y_valid,y_pred)
    #score = recall_score(y_valid, y_pred)

    # Loss is negative score
    loss = - score

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

In [22]:
best_models = []
times_train = []
for i in range(NUM_SIMS):
    print(80*"-")
    print(80*"-")
    print(f"Sim id: {i}")
    print(80*"-")
    start_time = default_timer()
    SEED = SEED_ORIG + i
    best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 10, trials = Trials())
    best_models.append(best)
    end_time = default_timer()
    times_train.append(end_time-start_time)
    print(f"Training time: {end_time-start_time:.2f}s")

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 0
--------------------------------------------------------------------------------
100%|██████████| 10/10 [02:10<00:00, 13.06s/trial, best loss: -0.9749742886527255]
Training time: 130.59s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 1
--------------------------------------------------------------------------------
100%|██████████| 10/10 [02:31<00:00, 15.18s/trial, best loss: -0.9763293310463123]
Training time: 151.78s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 2
--------------------------------------------------------------------------------
100%|██████████| 10/10 [02:02

In [23]:
# best hyperparameters
best

{'colsample_bytree': 1,
 'gamma': 7,
 'learning_rate': 3,
 'max_depth': 2,
 'reg_alpha': 3,
 'reg_lambda': 0}

In [24]:
results_valid = []
results_test = []
times_eval = []
for i in range(NUM_SIMS):
    print(80*"-")
    print(80*"-")
    print(f"Sim id: {i}")
    print(80*"-")
    
    SEED = SEED_ORIG + i
    
    # Train model using the best parameters
    xgboost_bo = XGBClassifier(seed=SEED,
                           colsample_bytree=space_eval(space, best_models[i])['colsample_bytree'],
                           gamma=space_eval(space, best_models[i])['gamma'],
                           learning_rate=space_eval(space, best_models[i])['learning_rate'],
                           max_depth=space_eval(space, best_models[i])['max_depth'],
                           reg_alpha=space_eval(space, best_models[i])['reg_alpha'],
                           reg_lambda=space_eval(space, best_models[i])['reg_lambda']
                           ).fit(X_train,y_train)
    
    start_time = default_timer()
    y_pred = xgboost_bo.predict(X_valid)
    recall = recall_score(y_valid, y_pred)
    f1score = f1_score(y_valid,y_pred)
    mcc = matthews_corrcoef(y_valid, y_pred)
    acc = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='macro')
    results_temp = {}
    results_temp["recall"] = recall
    results_temp["f1score"] = f1score
    results_temp["mcc"] = mcc
    results_temp["accuracy"] = acc
    results_temp["precision"] = precision
    results_valid.append(results_temp)
    
    y_pred = xgboost_bo.predict(X_test)
    recall = recall_score(y_test, y_pred)
    f1score = f1_score(y_test,y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    results_temp = {}
    results_temp["recall"] = recall
    results_temp["f1score"] = f1score
    results_temp["mcc"] = mcc
    results_temp["accuracy"] = acc
    results_temp["precision"] = precision
    results_test.append(results_temp)
    
    end_time = default_timer()
    times_eval.append(end_time-start_time)
    print(f"Evaluation time: {end_time-start_time:.2f}s")

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 0
--------------------------------------------------------------------------------
Evaluation time: 0.04s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 1
--------------------------------------------------------------------------------
Evaluation time: 0.03s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 2
--------------------------------------------------------------------------------
Evaluation time: 0.05s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 3


In [25]:
results_test

[{'recall': 0.9069767441860465,
  'f1score': 0.8342245989304812,
  'mcc': 0.8183090279230709,
  'accuracy': 0.9653631284916201,
  'precision': 0.8811008304860712},
 {'recall': 0.9127906976744186,
  'f1score': 0.8509485094850948,
  'mcc': 0.836294234697709,
  'accuracy': 0.9692737430167597,
  'precision': 0.8937690594319692},
 {'recall': 0.9011627906976745,
  'f1score': 0.842391304347826,
  'mcc': 0.8266041224936218,
  'accuracy': 0.9675977653631285,
  'precision': 0.890075666402069},
 {'recall': 0.9302325581395349,
  'f1score': 0.862533692722372,
  'mcc': 0.8495359324648414,
  'accuracy': 0.9715083798882681,
  'precision': 0.8982388371777177},
 {'recall': 0.9186046511627907,
  'f1score': 0.8272251308900525,
  'mcc': 0.8118531433928994,
  'accuracy': 0.9631284916201117,
  'precision': 0.8717600964436407},
 {'recall': 0.9127906976744186,
  'f1score': 0.8395721925133689,
  'mcc': 0.8243000182050482,
  'accuracy': 0.9664804469273743,
  'precision': 0.8838909394717809},
 {'recall': 0.912790

In [26]:
times_train

[130.59130655899935,
 151.78231732700078,
 122.5096749150016,
 156.9746402460005,
 96.94165907000024,
 120.19686458400065,
 115.87029629200151,
 135.239939563,
 103.46271597899977,
 121.88677141500011]

In [27]:
results_valid

[{'recall': 0.9793388429752066,
  'f1score': 0.9749742886527255,
  'mcc': 0.950191217787879,
  'accuracy': 0.9750768180266303,
  'precision': 0.975078330442567},
 {'recall': 0.9800275482093664,
  'f1score': 0.9763293310463123,
  'mcc': 0.9529114126143554,
  'accuracy': 0.9764424718333903,
  'precision': 0.976438600532088},
 {'recall': 0.9848484848484849,
  'f1score': 0.9781121751025993,
  'mcc': 0.9563901685574581,
  'accuracy': 0.9781495390918402,
  'precision': 0.9781839358718032},
 {'recall': 0.9848484848484849,
  'f1score': 0.9797875984926345,
  'mcc': 0.9597641424467812,
  'accuracy': 0.9798566063502901,
  'precision': 0.9798652899120366},
 {'recall': 0.9731404958677686,
  'f1score': 0.9718019257221459,
  'mcc': 0.9440089415167958,
  'accuracy': 0.9720040969614203,
  'precision': 0.9719952272778138},
 {'recall': 0.9841597796143251,
  'f1score': 0.9744289123764065,
  'mcc': 0.9489789450747611,
  'accuracy': 0.9743939911232502,
  'precision': 0.9745023054454023},
 {'recall': 0.97727

In [28]:
# validation f1-scores
values = [results_valid[i]["f1score"] for i in range(NUM_SIMS)]

In [29]:
# the best f1-score model
ind = np.argmax(np.array(values))
ind

3

In [30]:
# results corresponding to the best validation f1-score
results_test[ind]

{'recall': 0.9302325581395349,
 'f1score': 0.862533692722372,
 'mcc': 0.8495359324648414,
 'accuracy': 0.9715083798882681,
 'precision': 0.8982388371777177}

In [31]:
# average training times
np.mean(times_train)

125.54561859500045

In [32]:
# average inference times
np.mean(times_eval)

0.03658247219991608

In [33]:
# average validation f1 scores
valid_f1score_values = [results_valid[i]["f1score"] for i in range(NUM_SIMS)]
np.mean(valid_f1score_values)

0.9753840314118459

In [34]:
# average test f1 scores
test_f1score_values = [results_test[i]["f1score"] for i in range(NUM_SIMS)]
np.mean(test_f1score_values)

0.8515012265705206

In [35]:
# average test mcc scores
test_mcc_values = [results_test[i]["mcc"] for i in range(NUM_SIMS)]
np.mean(test_mcc_values)

0.8370191465316941

In [36]:
# average test accuracy scores
test_acc_values = [results_test[i]["accuracy"] for i in range(NUM_SIMS)]
np.mean(test_acc_values)

0.9693296089385475

In [37]:
# average test precision scores
test_precision_values = [results_test[i]["precision"] for i in range(NUM_SIMS)]
np.mean(test_precision_values)

0.8942835365253273

In [38]:
# average test recall scores
test_recall_values = [results_test[i]["recall"] for i in range(NUM_SIMS)]
np.mean(test_recall_values)

0.913372093023256