# XGBoost

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import recall_score
from sklearn.model_selection import RandomizedSearchCV
from timeit import default_timer
import datetime
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, f1_score
from imblearn.over_sampling import SMOTE
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval
from sklearn.model_selection import PredefinedSplit
#from sklearn.model_selection import cross_val_score




# Stamp the moment this first cell executes so readers can judge how fresh the saved outputs are.
run_timestamp = datetime.datetime.now()
print(f"Notebook last run (end-to-end): {run_timestamp}")

Notebook last run (end-to-end): 2024-03-11 12:12:19.846238


In [2]:
# Seeds and run size. SEED is the "current" seed: it starts at SEED_ORIG and is
# re-assigned once per simulation by the loops further down.
SEED_ORIG = 2024
SEED = SEED_ORIG
NUM_SIMS = 10

# Separate NumPy generator that is handed to hyperopt's fmin through `rstate`.
SEED_NP = 2023
rstate = np.random.default_rng(seed=SEED_NP)

# Hyperparameter tuning by Hyperopt

In [3]:
# Read the header-less CSV; column 8 is taken as the label column.
df = pd.read_csv("../data/HTRU_2.csv", sep=",", header=None)
labels = df[8]
# Swap the numeric column 8 for a named "Labels" column placed last.
df = df.drop(columns=[8]).assign(Labels=labels)
df["Labels"].value_counts()

0    16259
1     1639
Name: Labels, dtype: int64

In [4]:
# Hold out 10% of the full frame as the test set. The "Labels" column is still
# part of both frames here; it is separated from the features in a later cell.
X_train_, X_test = train_test_split(df, test_size=0.1, random_state=SEED_ORIG) # Splitting the test set

In [5]:
# Carve the validation set out of the remaining 90%: 0.1/0.9 of that remainder
# equals 10% of the full data, so train/valid/test end up roughly 80/10/10.
X_train, X_valid = train_test_split(X_train_, test_size=0.1/0.9, random_state=SEED_ORIG) 
# All column names except the last one (the last column is "Labels" at this point).
cols = X_train.columns[:-1]


In [6]:
# Row counts of the train / validation / test splits
X_train.shape[0], X_valid.shape[0], X_test.shape[0]


(14318, 1790, 1790)

In [7]:
# Preview of the training frame; at this point it still contains the "Labels" column.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,Labels
10814,134.507812,48.756089,-0.031636,-0.131966,0.764214,11.915867,18.587707,369.896839,0
8686,113.007812,55.334847,0.127174,0.012195,2.903010,16.116651,7.873506,79.343598,0
6211,141.195312,41.906529,0.042273,0.392180,2.548495,16.650554,8.976223,95.091538,0
1162,120.062500,48.030386,-0.028265,0.028709,30.127926,62.095604,1.827059,1.878257,0
2750,125.609375,51.790780,-0.016635,-0.091361,18.428094,52.283802,2.765830,6.372460,0
...,...,...,...,...,...,...,...,...,...
8685,126.125000,42.971284,-0.059258,0.265109,2.623746,19.325817,8.561876,80.169221,0
1354,113.398438,40.149397,0.322763,0.532786,2.985786,16.622323,8.176916,83.887394,0
15039,147.390625,43.347296,-0.142813,0.241722,2.954849,18.619306,8.543222,83.139408,0
16869,121.476562,46.342469,0.148239,0.185344,12.081940,38.737798,3.231075,9.644188,0


In [8]:
# Class balance of each split, in the order train, validation, test
tuple(split["Labels"].value_counts() for split in (X_train, X_valid, X_test))

(0    13014
 1     1304
 Name: Labels, dtype: int64,
 0    1627
 1     163
 Name: Labels, dtype: int64,
 0    1618
 1     172
 Name: Labels, dtype: int64)

In [9]:
# Pull the targets out of every split first, while the "Labels" column is still present ...
y_train, y_valid, y_test = (
    split["Labels"] for split in (X_train, X_valid, X_test)
)
# ... then rebind each split name to its feature-only frame.
X_train, X_valid, X_test = (
    split.drop(columns="Labels") for split in (X_train, X_valid, X_test)
)


In [10]:
# Row counts of the feature-only train and validation frames
X_train.shape[0], X_valid.shape[0]

(14318, 1790)

In [11]:
# Stack training rows on top of validation rows (features only) with a fresh 0..n-1 index
X_train_ = pd.concat((X_train, X_valid), ignore_index=True)

In [12]:
# Same stacking for the targets, so rows stay aligned with X_train_
y_train_ = pd.concat((y_train, y_valid), ignore_index=True)

In [13]:
# Number of stacked train + validation targets
y_train_.shape[0]

16108

In [14]:
# Creating the validation fold:
# one marker per stacked row, -1 for each training row followed by 0 for each validation row
valid_fold = np.repeat([-1.0, 0.0], [len(X_train), len(X_valid)])

In [15]:
# Sanity check: the first entries should be -1 (training rows) and the last ones 0 (validation rows).
valid_fold[:10], valid_fold[-10:]

(array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [16]:
# Predefined split built from valid_fold (-1 marks training rows, 0 marks validation rows).
# NOTE(review): `ps` is not referenced by any of the later cells shown here — confirm it is still needed.
ps = PredefinedSplit(test_fold=valid_fold)

In [17]:
#!pip install hyperopt

In [18]:
# Space
# Candidate values per hyperparameter; every entry becomes an hp.choice over its options,
# labelled with the hyperparameter's own name.
choice_grid = {
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1],
    'max_depth': range(3, 21, 3),
    'gamma': [step / 10.0 for step in range(0, 10)],
    'colsample_bytree': [step / 10.0 for step in range(3, 10)],
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 10, 100],
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 10, 100],
}
space = {name: hp.choice(name, options) for name, options in choice_grid.items()}

In [19]:
# Objective function
def objective(params):
    """Fit one XGBoost candidate and score it for hyperopt.

    The classifier is built from ``params`` plus the module-level ``SEED``,
    trained on ``X_train``/``y_train`` and scored with F1 on the validation split.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``XGBClassifier``.

    Returns
    -------
    dict
        ``loss`` (negative validation F1), the ``params`` that were evaluated,
        and ``status`` set to ``STATUS_OK``.
    """
    candidate = XGBClassifier(seed=SEED, random_state=SEED, **params)
    candidate.fit(X_train, y_train)

    valid_f1 = f1_score(y_valid, candidate.predict(X_valid))

    # The score is negated so that a lower loss corresponds to a higher F1.
    return {'loss': -valid_f1, 'params': params, 'status': STATUS_OK}

In [20]:
# Run NUM_SIMS independent hyperopt searches, keeping the result and wall-clock time of each.
best_models, times_train = [], []
rule = "-" * 80
for sim_id in range(NUM_SIMS):
    print(rule)
    print(rule)
    print(f"Sim id: {sim_id}")
    print(rule)

    tic = default_timer()
    # objective() reads the module-level SEED, so it has to be updated before each search.
    SEED = SEED_ORIG + sim_id
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials(),
        rstate=rstate,
    )
    best_models.append(best)

    elapsed = default_timer() - tic
    times_train.append(elapsed)
    print(f"Training time: {elapsed:.2f}s")

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 0
--------------------------------------------------------------------------------
100%|██████████| 10/10 [02:45<00:00, 16.51s/trial, best loss: -0.8980891719745223]
Training time: 165.14s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 1
--------------------------------------------------------------------------------
100%|██████████| 10/10 [00:35<00:00,  3.53s/trial, best loss: -0.8996763754045307]
Training time: 35.36s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 2
--------------------------------------------------------------------------------
100%|██████████| 10/10 [01:28<

In [21]:
# Result of the LAST simulation's search only (the per-simulation results are in best_models).
# The numbers shown are positions within each hp.choice option list, not the
# hyperparameter values themselves; space_eval(space, best) maps them back to values.
best

{'colsample_bytree': 5,
 'gamma': 5,
 'learning_rate': 3,
 'max_depth': 4,
 'reg_alpha': 2,
 'reg_lambda': 1}

In [22]:
def _score_predictions(y_true, y_pred):
    """Compute the reported metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        Keys ``recall``, ``f1score``, ``mcc``, ``accuracy`` and ``precision``.
    """
    return {
        "recall": recall_score(y_true, y_pred),
        "f1score": f1_score(y_true, y_pred),
        "mcc": matthews_corrcoef(y_true, y_pred),
        "accuracy": accuracy_score(y_true, y_pred),
        # NOTE(review): precision is macro-averaged while recall and F1 use
        # scikit-learn's default averaging — confirm this mix is intended.
        "precision": precision_score(y_true, y_pred, average='macro'),
    }


# Refit the best configuration of every simulation and score it on the
# validation and test sets.
results_valid = []
results_test = []
times_eval = []
for i in range(NUM_SIMS):
    print(80*"-")
    print(80*"-")
    print(f"Sim id: {i}")
    print(80*"-")

    # Same seed the corresponding search used for this simulation.
    SEED = SEED_ORIG + i

    # best_models[i] holds hp.choice indices; decode them to real values once
    # per simulation instead of once per hyperparameter.
    best_params = space_eval(space, best_models[i])

    # Train model using the best parameters (passed the same way objective() does)
    xgboost_bo = XGBClassifier(
        seed=SEED,
        random_state=SEED,
        **best_params
    ).fit(X_train, y_train)

    # The timer starts after fitting: it covers prediction and metric
    # computation on both the validation and the test set.
    start_time = default_timer()
    results_valid.append(_score_predictions(y_valid, xgboost_bo.predict(X_valid)))
    results_test.append(_score_predictions(y_test, xgboost_bo.predict(X_test)))
    end_time = default_timer()

    times_eval.append(end_time-start_time)
    print(f"Evaluation time: {end_time-start_time:.2f}s")

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 0
--------------------------------------------------------------------------------
Evaluation time: 0.03s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 1
--------------------------------------------------------------------------------
Evaluation time: 0.03s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 2
--------------------------------------------------------------------------------
Evaluation time: 0.03s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Sim id: 3


In [23]:
# Test-set metrics, one dict per simulation
results_test

[{'recall': 0.8895348837209303,
  'f1score': 0.9134328358208955,
  'mcc': 0.9049006223768028,
  'accuracy': 0.9837988826815642,
  'precision': 0.9634861859495251},
 {'recall': 0.8546511627906976,
  'f1score': 0.901840490797546,
  'mcc': 0.8936888405772987,
  'accuracy': 0.982122905027933,
  'precision': 0.9696321404756613},
 {'recall': 0.877906976744186,
  'f1score': 0.9041916167664672,
  'mcc': 0.8948309690307236,
  'accuracy': 0.982122905027933,
  'precision': 0.9595997512664179},
 {'recall': 0.8662790697674418,
  'f1score': 0.9030303030303031,
  'mcc': 0.8941802311286007,
  'accuracy': 0.982122905027933,
  'precision': 0.9644724187143212},
 {'recall': 0.8488372093023255,
  'f1score': 0.8984615384615385,
  'mcc': 0.8902022267257853,
  'accuracy': 0.9815642458100559,
  'precision': 0.9691828268672569},
 {'recall': 0.8895348837209303,
  'f1score': 0.9080118694362018,
  'mcc': 0.8986936416833187,
  'accuracy': 0.9826815642458101,
  'precision': 0.9577902097902098},
 {'recall': 0.8720930

In [24]:
# Wall-clock duration of each simulation's hyperopt search, as recorded by the search loop
times_train

[165.14031284999874,
 35.35648063199915,
 88.86159053099982,
 35.28142888900038,
 105.19526144699921,
 72.31242844400003,
 67.64550412899916,
 9.805392934998963,
 93.2583920470006,
 13.720882215000529]

In [25]:
# Validation-set metrics, one dict per simulation
results_valid

[{'recall': 0.8650306748466258,
  'f1score': 0.8980891719745223,
  'mcc': 0.889087162556632,
  'accuracy': 0.982122905027933,
  'precision': 0.9601760078225698},
 {'recall': 0.852760736196319,
  'f1score': 0.8996763754045307,
  'mcc': 0.8918478438664144,
  'accuracy': 0.9826815642458101,
  'precision': 0.9687281271872813},
 {'recall': 0.8773006134969326,
  'f1score': 0.9079365079365079,
  'mcc': 0.8997259313498847,
  'accuracy': 0.9837988826815642,
  'precision': 0.9642897307370992},
 {'recall': 0.8650306748466258,
  'f1score': 0.9009584664536742,
  'mcc': 0.8924120210332045,
  'accuracy': 0.9826815642458101,
  'precision': 0.9632926829268292},
 {'recall': 0.852760736196319,
  'f1score': 0.9025974025974026,
  'mcc': 0.8952939309350089,
  'accuracy': 0.9832402234636871,
  'precision': 0.9720155120008385},
 {'recall': 0.8711656441717791,
  'f1score': 0.9015873015873016,
  'mcc': 0.8927598793886906,
  'accuracy': 0.9826815642458101,
  'precision': 0.9606950067476383},
 {'recall': 0.889570

In [26]:
# Validation F1 of every simulation, in simulation order
values = [sim_result["f1score"] for sim_result in results_valid]

In [27]:
# Simulation index with the highest validation F1, shown together with that score
ind = np.argmax(values)
ind, np.max(values)

(6, 0.9177215189873417)

In [28]:
# Test-set metrics of the simulation that achieved the best validation F1 (index `ind`)
results_test[ind]

{'recall': 0.872093023255814,
 'f1score': 0.9009009009009009,
 'mcc': 0.8913411329222705,
 'accuracy': 0.9815642458100559,
 'precision': 0.9590859003542165}

In [29]:
# Average wall-clock time of one full hyperopt search (all of its trials), across simulations
np.mean(times_train)

68.65776741189966

In [30]:
# Average evaluation time per simulation: prediction plus metric computation
# on both the validation and the test set (model fitting is not included)
np.mean(times_eval)

0.03126396169936925

In [31]:
# Mean validation F1 across simulations
valid_f1score_values = [sim_result["f1score"] for sim_result in results_valid]
np.mean(valid_f1score_values)

0.9045853067050753

In [32]:
# Mean test F1 across simulations
test_f1score_values = [sim_result["f1score"] for sim_result in results_test]
np.mean(test_f1score_values)

0.904790669172925

In [33]:
# Mean test MCC across simulations
test_mcc_values = [sim_result["mcc"] for sim_result in results_test]
np.mean(test_mcc_values)

0.8960740767597084

In [34]:
# Mean test accuracy across simulations
test_acc_values = [sim_result["accuracy"] for sim_result in results_test]
np.mean(test_acc_values)

0.9824022346368715

In [35]:
# Mean test precision across simulations
test_precision_values = [sim_result["precision"] for sim_result in results_test]
np.mean(test_precision_values)

0.9641244658280808

In [36]:
# Mean test recall across simulations
test_recall_values = [sim_result["recall"] for sim_result in results_test]
np.mean(test_recall_values)

0.8709302325581396