# Tuning XGBoost

In [3]:
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Import Data

In [4]:
import pandas as pd 
import numpy as np

In [5]:
xtrain = pd.read_csv('scaled_data/xtrain.csv', index_col = 0)
xtest = pd.read_csv('scaled_data/xtest.csv', index_col = 0)
ytrain = pd.read_csv('scaled_data/ytrain.csv', index_col = 0)
ytest = pd.read_csv('scaled_data/ytest.csv', index_col = 0)

In [6]:
train = pd.read_csv('scaled_data/train_scaled.csv', index_col = 0)
test = pd.read_csv('scaled_data/test_scaled.csv', index_col = 0)

In [7]:
# for the labels and the id
train_raw = pd.read_csv('data/train_sample.csv', index_col = 0)
test_raw = pd.read_csv('data/test_sample.csv')

# Hyperparameter Tuning

In [8]:
import optuna
from optuna_integration.xgboost import XGBoostPruningCallback
from sklearn.model_selection import StratifiedKFold

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# define the objective function
def objective(trial: optuna.Trial, 
              xtrain: pd.DataFrame, 
              ytrain: pd.DataFrame, 
              xtest: pd.DataFrame, 
              ytest: pd.DataFrame) -> float:
    # define the hyperparameters to tune
    hyperparams = {
        'objective' : 'binary:logistic',
        'eval_metric' : 'auc',
        'seed' : 123,
        'n_estimators' : trial.suggest_int('n_estimators', 500, 1500),
        'learning_rate' : trial.suggest_float('learning_rate', 1e-8, 1, log=True),
        'max_depth' : trial.suggest_int('max_depth', 5, 15),
        'gamma' : trial.suggest_float('gamma', 0, 20),
        'min_child_weight' : trial.suggest_int('min_child_weight', 2, 20),
        'reg_alpha' : trial.suggest_float('reg_alpha', 1e-2, 0.1),
        'reg_lambda' : trial.suggest_float('reg_lambda', 1e-2, 0.1),
        'subsample' : trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'colsample_bylevel' : trial.suggest_float('colsample_bylevel', 0.5, 0.9),
        'colsample_bynode' : trial.suggest_float('colsample_bynode', 0.5, 0.9),
        'grow_policy' : trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    }

    # pruning callback
    pruning_callback = XGBoostPruningCallback(trial, 'validation_0-auc')

    # train model
    xgb_clf = XGBClassifier(
        **hyperparams,
        callbacks = [pruning_callback],
        early_stopping_rounds = 50
    )

    xgb_clf.fit(xtrain, ytrain, eval_set = [(xtest, ytest)], verbose = False)

    y_pred_proba = xgb_clf.predict_proba(xtest)

    roc_auc = roc_auc_score(ytest, y_pred_proba[:, 1])

    return float(roc_auc)

In [10]:
study = optuna.create_study(direction = 'maximize')
study.optimize(
    lambda trial: objective(trial, 
                            xtrain = xtrain, 
                            ytrain = ytrain, 
                            xtest = xtest, 
                            ytest = ytest), 
    n_trials=100)

[I 2024-07-28 20:58:36,848] A new study created in memory with name: no-name-2d55079d-c577-45f3-a548-f2cc04a124f3
[I 2024-07-28 20:59:11,938] Trial 0 finished with value: 0.5352235244360098 and parameters: {'n_estimators': 1479, 'learning_rate': 2.234587385400539e-08, 'max_depth': 9, 'gamma': 6.915696807760066, 'min_child_weight': 4, 'reg_alpha': 0.024620408924814357, 'reg_lambda': 0.07650010731181646, 'subsample': 0.5005594332848446, 'colsample_bytree': 0.8965430661379076, 'colsample_bylevel': 0.6167388886484589, 'colsample_bynode': 0.5930557447065157, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.5352235244360098.
[I 2024-07-28 20:59:14,867] Trial 1 finished with value: 0.861916053262346 and parameters: {'n_estimators': 884, 'learning_rate': 0.0016278441991233945, 'max_depth': 12, 'gamma': 9.361211156441847, 'min_child_weight': 13, 'reg_alpha': 0.07684795494116835, 'reg_lambda': 0.09379747545351755, 'subsample': 0.7427915713547272, 'colsample_bytree': 0.7147805950214091,

In [11]:
print('Best score: ', study.best_trial.value)
print('Best hyperparameters: ', study.best_params)

Best score:  0.8758312584694734
Best hyperparameters:  {'n_estimators': 1427, 'learning_rate': 0.045640623671124717, 'max_depth': 15, 'gamma': 2.1304843387906605, 'min_child_weight': 17, 'reg_alpha': 0.01490102698164227, 'reg_lambda': 0.053395147804257845, 'subsample': 0.8263124090290509, 'colsample_bytree': 0.8685865103401375, 'colsample_bylevel': 0.8566147867891335, 'colsample_bynode': 0.8021382585492708, 'grow_policy': 'lossguide'}


In [12]:
hyperparams = {'n_estimators': 1427, 
               'learning_rate': 0.045640623671124717, 
               'max_depth': 15, 
               'gamma': 2.1304843387906605, 
               'min_child_weight': 17, 
               'reg_alpha': 0.01490102698164227, 
               'reg_lambda': 0.053395147804257845, 
               'subsample': 0.8263124090290509, 
               'colsample_bytree': 0.8685865103401375, 
               'colsample_bylevel': 0.8566147867891335, 
               'colsample_bynode': 0.8021382585492708, 
               'grow_policy': 'lossguide'}