# Tuning XGBoost

In [13]:
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Import Data

In [14]:
import pandas as pd 
import numpy as np

In [15]:
xtrain = pd.read_csv('scaled_data/xtrain.csv', index_col = 0)
xtest = pd.read_csv('scaled_data/xtest.csv', index_col = 0)
ytrain = pd.read_csv('scaled_data/ytrain.csv', index_col = 0)
ytest = pd.read_csv('scaled_data/ytest.csv', index_col = 0)

In [16]:
train = pd.read_csv('scaled_data/train_scaled.csv', index_col = 0)
test = pd.read_csv('scaled_data/test_scaled.csv', index_col = 0)

In [17]:
# for the labels and the id
train_raw = pd.read_csv('data/train_sample.csv', index_col = 0)
test_raw = pd.read_csv('data/test_sample.csv')

# Hyperparameter Tuning

In [18]:
import optuna
from optuna_integration.xgboost import XGBoostPruningCallback
from sklearn.model_selection import StratifiedKFold

In [38]:
# define the objective function
def objective(trial: optuna.Trial, 
              xtrain: pd.DataFrame, 
              ytrain: pd.DataFrame, 
              xtest: pd.DataFrame, 
              ytest: pd.DataFrame) -> float:
    # define the hyperparameters to tune
    hyperparams = {
        'objective' : 'binary:logistic',
        'eval_metric' : 'auc',
        'seed' : 123,
        'n_estimators' : trial.suggest_int('n_estimators', 500, 1500),
        'learning_rate' : trial.suggest_float('learning_rate', 1e-8, 1, log=True),
        'max_depth' : trial.suggest_int('max_depth', 5, 15),
        'gamma' : trial.suggest_float('gamma', 0, 20),
        'min_child_weight' : trial.suggest_int('min_child_weight', 2, 20),
        'reg_alpha' : trial.suggest_float('reg_alpha', 1e-2, 0.1),
        'reg_lambda' : trial.suggest_float('reg_lambda', 1e-2, 0.1),
        'subsample' : trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'colsample_bylevel' : trial.suggest_float('colsample_bylevel', 0.5, 0.9),
        'colsample_bynode' : trial.suggest_float('colsample_bynode', 0.5, 0.9),
        'grow_policy' : trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    }

    # pruning callback
    pruning_callback = XGBoostPruningCallback(trial, 'validation_0-auc')

    # train model
    xgb_clf = XGBClassifier(
        **hyperparams,
        callbacks = [pruning_callback],
        early_stopping_rounds = 50
    )

    xgb_clf.fit(xtrain, ytrain, eval_set = [(xtest, ytest)], verbose = False)

    y_pred_proba = xgb_clf.predict_proba(xtest)

    roc_auc = roc_auc_score(ytest, y_pred_proba[:, 1])

    return float(roc_auc)

In [39]:
study = optuna.create_study(direction = 'maximize')
study.optimize(
    lambda trial: objective(trial, 
                            xtrain = xtrain, 
                            ytrain = ytrain, 
                            xtest = xtest, 
                            ytest = ytest), 
    n_trials=100)

[I 2024-07-28 19:38:22,752] A new study created in memory with name: no-name-f516ad8c-4625-4d16-b485-9a300a6eec8d
[I 2024-07-28 19:38:36,986] Trial 0 finished with value: 0.8664476950981816 and parameters: {'n_estimators': 1310, 'learning_rate': 0.12916073761548066, 'max_depth': 13, 'gamma': 18.087232208141522, 'min_child_weight': 3, 'reg_alpha': 0.05567260795753028, 'reg_lambda': 0.08796615128989305, 'subsample': 0.5087137841858322, 'colsample_bytree': 0.6888611198780488, 'colsample_bylevel': 0.5518830483556978, 'colsample_bynode': 0.7215491776686989, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.8664476950981816.
[I 2024-07-28 19:38:48,105] Trial 1 finished with value: 0.8644250294463618 and parameters: {'n_estimators': 1377, 'learning_rate': 0.5843934237334004, 'max_depth': 8, 'gamma': 15.569135052538526, 'min_child_weight': 12, 'reg_alpha': 0.0750618684694755, 'reg_lambda': 0.048757883617773515, 'subsample': 0.5388762190287629, 'colsample_bytree': 0.6228400751764208, '

In [40]:
print('Best score: ', study.best_trial.value)
print('Best hyperparameters: ', study.best_params)

Best score:  0.8718229674624506
Best hyperparameters:  {'n_estimators': 636, 'learning_rate': 0.014053001684708402, 'max_depth': 12, 'gamma': 7.418436070611824, 'min_child_weight': 5, 'reg_alpha': 0.041282529515719035, 'reg_lambda': 0.0791978132786912, 'subsample': 0.8141630488490922, 'colsample_bytree': 0.6946428137265691, 'colsample_bylevel': 0.8315296187267329, 'colsample_bynode': 0.8451838963979114, 'grow_policy': 'lossguide'}


In [1]:
hyperparams = {'n_estimators': 636, 
               'learning_rate': 0.014053001684708402, 
               'max_depth': 12, 
               'gamma': 7.418436070611824, 
               'min_child_weight': 5, 
               'reg_alpha': 0.041282529515719035, 
               'reg_lambda': 0.0791978132786912, 
               'subsample': 0.8141630488490922, 
               'colsample_bytree': 0.6946428137265691, 
               'colsample_bylevel': 0.8315296187267329, 
               'colsample_bynode': 0.8451838963979114, 
               'grow_policy': 'lossguide'}