In [1]:
import time
from pathlib import Path
import pandas as pd
import numpy as np

import optuna as opt
from optuna.samplers import TPESampler
# suppress info logs
opt.logging.set_verbosity(opt.logging.WARNING)

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error

from xgboost import XGBRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Experiment configuration ---
RANDOM_SEED = 760
DATA_DIR = Path("../../../ready_data")

N_OPTUNA_TRIALS = 50  # number of Optuna trials requested per study
K_FOLDS = 3  # number of folds, used for both the outer and the inner CV
T_ES = 20  # early-stopping threshold: consecutive trials without improvement

# --- Load the pre-split train / test tables ---
df_train = pd.read_parquet(DATA_DIR / "100K18F_train_main.parquet.snappy")
df_test = pd.read_parquet(DATA_DIR / "100K18F_test_main.parquet.snappy")

# 'r_useful' is the regression target; 'r_id' is excluded from the features too.
X_train = df_train.drop(columns=['r_useful', 'r_id']).values
y_train = df_train['r_useful'].values
X_test = df_test.drop(columns=['r_useful', 'r_id']).values
y_test = df_test['r_useful'].values

print(f"Shape of the training data : {X_train.shape}")
print(f"Shape of the test data : {X_test.shape}")

Shape of the training data : (80000, 18)
Shape of the test data : (20000, 18)


In [3]:
# Name of the estimator step inside the pipeline; hyperparameters are addressed
# through it as "<MODEL_PREFIX>__<param>".
MODEL_PREFIX = "xgb"

# Gradient-boosted tree regressor trained and evaluated on RMSE.
# NOTE(review): "gpu_hist" / "gpu_predictor" may be deprecated in newer XGBoost
# releases -- confirm against the installed version.
XGB_model = XGBRegressor(
    objective="reg:squarederror",
    eval_metric=["rmse"],
    booster="gbtree",
    tree_method="gpu_hist",  # use GPU
    predictor="gpu_predictor",
    n_jobs=-1,  # use all CPUs
    random_state=RANDOM_SEED,
)

# Preprocessing (imputation with SimpleImputer defaults, then standardisation)
# followed by the regressor.
model_pipe = Pipeline(
    steps=[
        ("imp", SimpleImputer()),
        ("ss", StandardScaler()),
        (MODEL_PREFIX, XGB_model),
    ]
)

In [4]:
def hp_appender(hp_dict, prefix=None):
    '''Return a new dict whose keys are ``hp_dict``'s keys prefixed with ``<prefix>__``.

    This produces the ``<step>__<param>`` names that ``Pipeline.set_params``
    expects for parameters of a named pipeline step.

    Parameters
    ----------
    hp_dict : dict
        Mapping of bare hyperparameter name -> value. It is not modified.
    prefix : str, optional
        Step name to put in front of every key. Defaults to the module-level
        ``MODEL_PREFIX`` when omitted, which is the original behaviour.

    Returns
    -------
    dict
        A new dictionary ``{"<prefix>__<key>": value, ...}``.
    '''
    if prefix is None:
        # Resolved at call time so later changes to MODEL_PREFIX are honoured.
        prefix = MODEL_PREFIX
    return {f"{prefix}__{key}": val for key, val in hp_dict.items()}

In [5]:
def optuna_objective(trial, model, X, y):
    '''Optuna objective: sample hyperparameters and score them with the inner CV.

    Parameters
    ----------
    trial : Optuna trial object used to sample the hyperparameters.
    model : pipeline whose ``MODEL_PREFIX`` step receives the sampled values
        (it is modified in place through ``set_params``).
    X, y : features and targets used for the inner cross-validation.

    Returns
    -------
    The mean RMSE over the ``K_FOLDS`` inner folds (lower is better).
    '''
    timestamp = time.strftime('%H:%M:%S', time.localtime())
    print(f"{timestamp} | Running Optuna Trial: {trial.number}")

    # Sample hyperparameters from optuna. The order of the suggest calls is kept
    # fixed so that a seeded sampler draws the same values.
    hyperparams = {}
    hyperparams["n_estimators"] = trial.suggest_int('n_estimators', 1, 501, step=5)
    hyperparams["learning_rate"] = trial.suggest_float('learning_rate', 0.001, 0.5, log=True)
    hyperparams["max_depth"] = trial.suggest_int("max_depth", 2, 20)
    # fraction of observations sampled for each iteration
    hyperparams["subsample"] = trial.suggest_float("subsample", 0.5, 1, step=0.1)
    # fraction of features sampled for each iteration
    hyperparams["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.5, 1, step=0.1)
    hyperparams["gamma"] = trial.suggest_float("gamma", 0, 1)
    print(f"with hyperparameters: {hyperparams}")

    # Route the sampled values to the estimator step of the pipeline.
    model.set_params(**hp_appender(hyperparams))

    # Inner CV Loop: the scorer returns negated RMSE, so flip the sign back.
    inner_cv = KFold(K_FOLDS)
    fold_scores = cross_val_score(
        model, X, y, scoring="neg_root_mean_squared_error", cv=inner_cv
    )
    avg_score = -fold_scores.mean()
    print(f"complete! average cv RMSE: {avg_score}")
    return avg_score

In [6]:
class early_stopping_check_callback:
    '''Optuna callback that stops a study after a run of non-improving trials.

    Parameters
    ----------
    threshold : int
        Number of consecutive trials without a new best value after which
        ``study.stop()`` is called.

    The callback keeps no per-study state, so one instance can be reused
    across several studies.
    '''

    def __init__(self, threshold):
        self.threshold = threshold

    def __call__(self, study, trial):
        '''Stop ``study`` if the best trial is at least ``threshold`` trials old.'''
        try:
            best_number = study.best_trial.number
        except ValueError:
            # ``best_trial`` raises ValueError when the study has no completed
            # trial yet (e.g. the first trials failed or were pruned). There is
            # nothing to compare against, so let the study continue.
            return

        # Stop the study if the number of consecutive trials with no
        # improvement is at least the threshold.
        if trial.number - best_number >= self.threshold:
            print("==== EARLY STOPPING ACTIVATED ====")
            study.stop()

In [7]:
early_stopping_cb = early_stopping_check_callback(T_ES)
out_cv = KFold(K_FOLDS)
# Per-outer-fold scores and the hyperparameters selected for that fold.
outer_cv_results = {
    "rmse": [],
    "mae": [],
    "opt_hp": []
}

# Outer CV Loop
for cv_train_ii, cv_val_ii in out_cv.split(X_train, y_train):
    # extract outer cv data for this fold
    cv_X_train, cv_y_train = X_train[cv_train_ii], y_train[cv_train_ii]
    cv_X_val, cv_y_val = X_train[cv_val_ii], y_train[cv_val_ii]

    # Optuna Loop (encloses Inner CV Loop)
    # note: the inner cv must run inside the study, since the hyperparameter
    # sampling changes based on model performance, which changes based on data.
    # A fresh, identically seeded study is created for every outer fold.
    study = opt.create_study(direction='minimize', sampler=TPESampler(seed=RANDOM_SEED))
    study.optimize(
        lambda trial: optuna_objective(trial, model_pipe, cv_X_train, cv_y_train),
        n_trials=N_OPTUNA_TRIALS,
        callbacks=[early_stopping_cb]) # early stopping
    best_params = hp_appender(study.best_params)

    # evaluate model with best hyperparameters according to Inner CV Loop
    # (the pipeline still holds the last trial's values, so reset them first)
    model_pipe.set_params(**best_params)
    # fit on all training data for this fold
    model_pipe.fit(cv_X_train, cv_y_train)
    # predict on all validation data for this fold
    y_preds = model_pipe.predict(cv_X_val)

    # calculate scores
    # RMSE is taken as sqrt(MSE) rather than via the `squared=False` argument,
    # which is deprecated and removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(cv_y_val, y_preds))
    mae = mean_absolute_error(cv_y_val, y_preds)

    # save results for this iteration
    outer_cv_results["rmse"].append(rmse)
    outer_cv_results["mae"].append(mae)
    outer_cv_results["opt_hp"].append(best_params)

16:08:31 | Running Optuna Trial: 0
with hyperparameters: {'n_estimators': 61, 'learning_rate': 0.0021542480108802785, 'max_depth': 5, 'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0.8606400505757477}
complete! average cv RMSE: 4.197786098204431
16:08:31 | Running Optuna Trial: 1
with hyperparameters: {'n_estimators': 81, 'learning_rate': 0.3336605688950222, 'max_depth': 14, 'subsample': 1.0, 'colsample_bytree': 0.5, 'gamma': 0.6755211563859722}
complete! average cv RMSE: 3.370926961845576
16:08:38 | Running Optuna Trial: 2
with hyperparameters: {'n_estimators': 221, 'learning_rate': 0.006094075338394031, 'max_depth': 20, 'subsample': 0.5, 'colsample_bytree': 0.7, 'gamma': 0.6522814819365693}
complete! average cv RMSE: 3.2634134811423414
16:10:59 | Running Optuna Trial: 3
with hyperparameters: {'n_estimators': 141, 'learning_rate': 0.07136913218405677, 'max_depth': 11, 'subsample': 0.8, 'colsample_bytree': 0.5, 'gamma': 0.9369507683216501}
complete! average cv RMSE: 3.049039972749

In [8]:
# Raw per-fold results first, then the average of each metric over the outer folds.
print(outer_cv_results)
for metric in ("rmse", "mae"):
    print(f"mean nested CV {metric.upper()}: {np.mean(outer_cv_results[metric])}")

{'rmse': [3.1575323846398047, 2.9676442742129256, 2.8074688162979955], 'mae': [1.330561903606014, 1.3387561220663553, 1.3254657899221154], 'opt_hp': [{'xgb__n_estimators': 441, 'xgb__learning_rate': 0.0177769529067458, 'xgb__max_depth': 9, 'xgb__subsample': 0.8, 'xgb__colsample_bytree': 0.9, 'xgb__gamma': 0.5853027873801265}, {'xgb__n_estimators': 406, 'xgb__learning_rate': 0.016799033020970314, 'xgb__max_depth': 8, 'xgb__subsample': 0.6, 'xgb__colsample_bytree': 1.0, 'xgb__gamma': 0.7215751121387509}, {'xgb__n_estimators': 316, 'xgb__learning_rate': 0.03941859847561101, 'xgb__max_depth': 8, 'xgb__subsample': 0.7, 'xgb__colsample_bytree': 1.0, 'xgb__gamma': 0.7395488321838755}]}
mean nested CV RMSE: 2.977548491716908
mean nested CV MAE: 1.3315946051981615
