In [None]:
import time
from pathlib import Path
import pandas as pd
import numpy as np

import optuna as opt
from optuna.samplers import TPESampler
# suppress info logs
opt.logging.set_verbosity(opt.logging.WARNING)

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error

from xgboost import XGBRegressor

In [None]:
# Seed reused by the Optuna sampler and the XGBoost model for reproducibility.
RANDOM_SEED = 760
# Directory holding the prepared parquet splits (relative to this notebook).
DATA_DIR = Path("../../../ready_data")

# Upper bound on the number of Optuna trials run per study.
N_OPTUNA_TRIALS = 50
# Number of folds, shared by the outer and the inner cross-validation.
K_FOLDS = 3
# Early-stopping threshold: number of consecutive trials without improvement
# after which a study is stopped.
T_ES = 20

df_train = pd.read_parquet(DATA_DIR / "100K18F_train_main.parquet.snappy")
df_test = pd.read_parquet(DATA_DIR / "100K18F_test_main.parquet.snappy")

print(f"Shape of the training data : {df_train.shape}")
print(f"Shape of the test data : {df_test.shape}")

# Features are every column except the target (`r_useful`) and `r_id`
# (presumably a row identifier -- confirm against the data dictionary).
X_train = df_train.drop(columns=['r_useful', 'r_id']).values
y_train = df_train['r_useful'].values
X_test = df_test.drop(columns=['r_useful', 'r_id']).values
y_test = df_test['r_useful'].values

In [None]:
# Gradient-boosted tree regressor with squared-error objective and RMSE as
# its evaluation metric.
# NOTE(review): `tree_method="gpu_hist"` and `predictor="gpu_predictor"` are
# deprecated in XGBoost >= 2.0 in favour of `device="cuda"` with
# `tree_method="hist"` -- confirm against the installed XGBoost version.
XGB_model = XGBRegressor(
    objective="reg:squarederror",
    eval_metric=["rmse"],
    booster="gbtree",
    tree_method="gpu_hist",  # build trees on the GPU
    predictor="gpu_predictor",  # predict on the GPU
    n_jobs=-1,  # use all CPUs
    random_state=RANDOM_SEED,
)

# Preprocessing + model: mean imputation (SimpleImputer default), then
# standardisation, then the regressor. The step name "xgb" is the prefix used
# when setting the regressor's hyperparameters through the pipeline.
XGB_pipe = Pipeline(steps=[
    ("imp", SimpleImputer()),
    ("ss", StandardScaler()),
    ("xgb", XGB_model),
])

In [None]:
def XGB_hp_appender(hp_dict):
    '''Return a new dictionary whose keys are those of `hp_dict` prefixed with "xgb__".

    The prefix routes each hyperparameter to the pipeline step named "xgb"
    when the result is passed to `set_params`. The input dictionary is not
    modified and key order is preserved.
    '''
    return {"xgb__" + name: value for name, value in hp_dict.items()}

In [None]:
def optuna_objective(trial, model, X, y):
    '''Optuna objective: score one sampled hyperparameter set with the inner CV.

    Parameters:
        trial: Optuna trial used to sample the hyperparameters.
        model: pipeline whose "xgb" step receives the sampled values; its
            parameters are overwritten in place via `set_params`.
        X, y: features and target of the current outer-fold training split.

    Returns:
        The mean RMSE over the `K_FOLDS` inner folds (lower is better).
    '''
    timestamp = time.strftime('%H:%M:%S', time.localtime())
    print(f"{timestamp} | Running Optuna Trial: {trial.number}")

    # Draw the candidate hyperparameters. The order of the suggest calls is
    # kept fixed so a seeded sampler reproduces the same sequence.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 1, 501, step=25),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
        max_depth=trial.suggest_int("max_depth", 2, 20),
        # fraction of observations sampled for each boosting iteration
        subsample=trial.suggest_float("subsample", 0.5, 1, step=0.1),
        # fraction of features sampled for each tree
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1, step=0.1),
        gamma=trial.suggest_float("gamma", 0, 1),
    )
    print(f"with hyperparameters: {sampled}")
    model.set_params(**XGB_hp_appender(sampled))

    # Inner CV loop: the scorer returns negated RMSE, so flip the sign of the
    # fold average to obtain a quantity to minimise.
    inner_cv = KFold(K_FOLDS)
    fold_scores = cross_val_score(
        model, X, y, scoring="neg_root_mean_squared_error", cv=inner_cv)
    avg_score = -fold_scores.mean()
    print(f"complete! average cv RMSE: {avg_score}")
    return avg_score

In [None]:
class early_stopping_check_callback:
    '''Optuna callback that stops a study once it has stopped improving.

    The study is stopped when the number of trials elapsed since the best
    trial reaches `threshold`.
    '''

    def __init__(self, threshold):
        '''Store the number of non-improving trials tolerated before stopping.'''
        self.threshold = threshold

    def __call__(self, study, trial):
        '''Stop `study` if `trial` is at least `threshold` trials past the best one.'''
        trials_since_best = trial.number - study.best_trial.number
        if trials_since_best < self.threshold:
            return
        print("==== EARLY STOPPING ACTIVATED ====")
        study.stop()

In [None]:
# Nested cross-validation: the outer loop estimates generalisation error,
# while each outer training split is tuned by its own Optuna study (which
# runs the inner CV inside `optuna_objective`).
early_stopping_cb = early_stopping_check_callback(T_ES)
out_cv = KFold(K_FOLDS)
# Per-outer-fold results: validation RMSE, validation MAE and the selected
# hyperparameters (keys carry the "xgb__" pipeline prefix).
outer_cv_results = {
    "rmse": [],
    "mae": [],
    "opt_hp": []
}

# Outer CV Loop
for cv_train_ii, cv_val_ii in out_cv.split(X_train, y_train):
    # extract outer cv data for this fold
    cv_X_train, cv_y_train = X_train[cv_train_ii], y_train[cv_train_ii]
    cv_X_val, cv_y_val = X_train[cv_val_ii], y_train[cv_val_ii]

    # Optuna Loop (encloses Inner CV Loop)
    # note: inner cv should be within since the hyperparameter sampling changes
    # based on model performance, which changes based on data.
    # A fresh, identically seeded study is created for every outer fold.
    study = opt.create_study(direction='minimize', sampler=TPESampler(seed=RANDOM_SEED))
    study.optimize(
        lambda trial: optuna_objective(trial, XGB_pipe, cv_X_train, cv_y_train),
        n_trials=N_OPTUNA_TRIALS,
        callbacks=[early_stopping_cb]) # early stopping
    best_params = XGB_hp_appender(study.best_params)

    # evaluate model with best hyperparameters according to Inner CV Loop
    # (the pipeline still holds the last trial's values, so reset it explicitly)
    XGB_pipe.set_params(**best_params)
    # fit on all training data for this fold
    XGB_pipe.fit(cv_X_train, cv_y_train)
    # predict on all validation data for this fold
    y_preds = XGB_pipe.predict(cv_X_val)

    # calculate scores
    # RMSE is taken as sqrt(MSE) rather than `squared=False`: that keyword is
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(cv_y_val, y_preds))
    mae = mean_absolute_error(cv_y_val, y_preds)

    # save results for this iteration
    outer_cv_results["rmse"].append(rmse)
    outer_cv_results["mae"].append(mae)
    outer_cv_results["opt_hp"].append(best_params)

In [None]:
# Raw per-fold results, followed by the average of each metric over the
# outer folds.
print(outer_cv_results)
for metric in ("rmse", "mae"):
    print(f"mean nested CV {metric.upper()}: {np.mean(outer_cv_results[metric])}")