# Tuning Stochastic Gradient Boosting

In [33]:
import pandas as pd 
import numpy as np 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import optuna

In [34]:
xtrain = pd.read_csv('scaled_data/xtrain.csv', index_col = 0)
xtest = pd.read_csv('scaled_data/xtest.csv', index_col = 0)
ytrain = pd.read_csv('scaled_data/ytrain.csv', index_col = 0)
ytest = pd.read_csv('scaled_data/ytest.csv', index_col = 0)

In [35]:
ytest = ytest.to_numpy().flatten()
ytest.shape

(172572,)

In [36]:
ytrain = ytrain.to_numpy().flatten()
ytrain.shape

(402668,)

In [37]:
train = pd.read_csv('scaled_data/train_scaled.csv', index_col = 0)
test = pd.read_csv('scaled_data/test_scaled.csv', index_col = 0)

In [38]:
# for the labels and the id
train_raw = pd.read_csv('data/train_sample.csv', index_col = 0)
test_raw = pd.read_csv('data/test_sample.csv')

In [39]:
# define the objective function
def objective(trial: optuna.Trial, 
              xtrain: pd.DataFrame, 
              ytrain: pd.DataFrame, 
              xtest: pd.DataFrame, 
              ytest: pd.DataFrame) -> float:
    
    # define the hyperparameters to tune
    hyperparams = {
        'random_state' : 123,
        'n_estimators' : trial.suggest_int('n_estimators', 500, 1500),
        'learning_rate' : trial.suggest_float('learning_rate', 1e-8, 1, log=True),
        'max_depth' : trial.suggest_int('max_depth', 5, 15),
        'min_samples_split' : trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 1, 20),
        'subsample' : trial.suggest_float('subsample', 0.5, 0.9),
        'max_features' : trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    }

    # train model
    sgbm_clf = GradientBoostingClassifier(**hyperparams)

    sgbm_clf.fit(xtrain, 
                 ytrain, 
                 # eval_set = [(xtest, ytest)], 
                 # early_stopping_rounds = 50
    )

    y_pred_proba = sgbm_clf.predict_proba(xtest)

    roc_auc = roc_auc_score(ytest, y_pred_proba[:, 1])

    return float(roc_auc)

In [40]:
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, 
                                       xtrain = xtrain,
                                       xtest = xtest,
                                       ytrain = ytrain, 
                                       ytest = ytest),
                n_trials = 50)

[I 2024-07-28 20:01:36,609] A new study created in memory with name: no-name-93946c97-a321-47b9-b30e-a6b32aacd080
[I 2024-07-28 20:12:31,857] Trial 0 finished with value: 0.8534201132450707 and parameters: {'n_estimators': 1094, 'learning_rate': 0.10418244058466201, 'max_depth': 14, 'min_samples_split': 8, 'min_samples_leaf': 1, 'subsample': 0.5347283795166015, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8534201132450707.
[I 2024-07-28 20:37:09,073] Trial 1 finished with value: 0.8648653078709446 and parameters: {'n_estimators': 1494, 'learning_rate': 9.141062667777569e-05, 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 5, 'subsample': 0.8850497055161204, 'max_features': 'log2'}. Best is trial 1 with value: 0.8648653078709446.
[W 2024-07-28 20:37:09,078] Trial 2 failed with parameters: {'n_estimators': 1190, 'learning_rate': 0.00021526578595847646, 'max_depth': 6, 'min_samples_split': 19, 'min_samples_leaf': 18, 'subsample': 0.6878176517517516, 'max_features':

InvalidParameterError: The 'max_features' parameter of GradientBoostingClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.

In [None]:
hyperparams = {
     'n_estimators': 1494, 
     'learning_rate': 9.141062667777569e-05, 
     'max_depth': 15, 
     'min_samples_split': 10, 
     'min_samples_leaf': 5, 
     'subsample': 0.8850497055161204, 
     'max_features': 'log2'
}