In [1]:
import pickle
import os
import pandas as pd
from pathlib import Path
from ml_assemblr.main_components.data_pod import DataPod
from ml_assemblr.main_components.data_pod_list import DataPodList
from home_credit_helper.config import cfg
from home_credit_helper.constant import *
import warnings
warnings.filterwarnings("ignore", category=Warning)

In [2]:
from ml_assemblr.transfromer.column_type.features_setter import TopDownFeaturesSetter
from ml_assemblr.transfromer.column_type.column_type_setter import ColumnTypeSetter
from ml_assemblr.transfromer.cross_validator.cross_validator import CrossValidator, get_cv_folds
from sklearn.model_selection import ShuffleSplit

In [3]:
import xgboost as xgb
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler

In [4]:
pd.set_option("display.max_columns", None)

In [5]:
with open(cfg.research_cache_path / "03_dp.pkl", "rb") as f:
    dp: DataPod = pickle.load(f)

In [6]:
selected_features_path = cfg.research_cache_path / "06_selected_features.pkl"
is_full_features = not os.path.exists(selected_features_path)

if not is_full_features:
    with open(selected_features_path, "rb") as f:
        selected_features: list[str] = pickle.load(f)
    
    feature_setters = ColumnTypeSetter(column_type_map={"features": selected_features})
else:
    feature_setters = TopDownFeaturesSetter()

dp = dp.fit_transform(feature_setters)

In [7]:
# config hyperparams tuning

study_name_suffix = "-supset-features"

eval_metric = 'auc'
is_maximize_metric = True
xgb_training_objective="binary:logistic"
early_stopping_rounds = 50

optuna_study_path = f"sqlite:///{cfg.research_cache_path / "optuna_studies.db"}"
# optuna_study_path = "sqlite:///.cache/optuna_studies.db"
study_name = "xgb-hyperparam-tuning" + study_name_suffix
sampler = TPESampler(n_startup_trials=40)
pruner = MedianPruner(n_warmup_steps=20)

In [8]:
cross_validator = CrossValidator(sklearn_cv=ShuffleSplit(n_splits=3, test_size=0.2, random_state=42))
dp = dp.fit_transform(cross_validator)
folds = get_cv_folds(dp)

In [9]:
df = dp.slice_df(split=set(["train", "valid"]), columns=None, table_name=APPLICATIONS)
feature_cols = dp.main_column_type.features
label_col = dp.main_column_type.labels

In [10]:
dtrain = xgb.DMatrix(df[feature_cols], label=df[label_col[0]])

In [11]:
def objective(trial: optuna.trial):
    params = dict(
        # first tier
        learning_rate=trial.suggest_float("param_learning_rate", 0.00001, 1),
        max_depth=trial.suggest_int("param_max_depth", 1, 10),
        subsample=trial.suggest_float("param_subsample", 0.2, 1.0),
        objective=xgb_training_objective,
        # second tier
        num_parallel_tree=trial.suggest_int("param_num_parallel_tree", 1, 10),
        min_child_weight=trial.suggest_float("param_min_child_weight", 0, 20),
        max_delta_step=trial.suggest_float("param_max_delta_step", 0, 10),
        colsample_bylevel=trial.suggest_float("param_colsample_bylevel", 0, 1),
        colsample_bynode=trial.suggest_float("param_colsample_bynode", 0, 1),
        colsample_bytree=trial.suggest_float("param_colsample_bytree", 0, 1),
        min_split_loss=trial.suggest_float("param_min_split_loss", 0, 5),
        max_leaves=trial.suggest_int("param_max_leaves", 3, 50),
        reg_alpha=trial.suggest_float("param_reg_alpha", 0, 5),
        reg_lambda=trial.suggest_float("param_reg_lambda", 0, 5),
    )

    trial.set_user_attr("eval_metric", eval_metric)
    trial.set_user_attr("param_objective", xgb_training_objective)

    num_rounds = trial.suggest_int("num_rounds", 20, 500, log=False)

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, f"test-{eval_metric}")

    cv_results: pd.DataFrame = xgb.cv(
        params=params,
        dtrain=dtrain,
        folds=folds,
        num_boost_round=num_rounds,
        metrics=eval_metric,
        callbacks=[pruning_callback],
        early_stopping_rounds=early_stopping_rounds,
        maximize=is_maximize_metric,
        verbose_eval=False,
        as_pandas=True,
    )

    best_iter = int(cv_results[f"test-{eval_metric}-mean"].argmax())

    trial.set_user_attr("is_maximize_metric", is_maximize_metric)

    trial.set_user_attr("num_rounds", best_iter)
    trial.set_user_attr(
        "train_metric_mean_at_trial_end",
        float(cv_results[f"train-{eval_metric}-mean"].iloc[best_iter]),
    )
    trial.set_user_attr(
        "train_std_at_trial_end",
        float(cv_results[f"train-{eval_metric}-std"].iloc[best_iter]),
    )
    trial.set_user_attr(
        "val_std_at_trial_end",
        float(cv_results[f"test-{eval_metric}-std"].iloc[best_iter]),
    )

    metric = float(cv_results[f"test-{eval_metric}-mean"].iloc[best_iter])
    return metric

In [12]:
study = optuna.create_study(
    storage=optuna_study_path,
    study_name=study_name,
    sampler=sampler,
    pruner=pruner,
    load_if_exists=True,
    direction="maximize",
)

[I 2024-04-21 22:18:30,767] Using an existing study with name 'xgb-hyperparam-tuning-supset-features' instead of creating a new one.


In [13]:
raise

RuntimeError: No active exception to reraise

In [None]:
study.optimize(
    objective,
    n_trials=100,
    n_jobs=2,
    # n_jobs=-1,
)

In [14]:
study.best_trial.params

{'param_learning_rate': 0.20072683838790215,
 'param_max_depth': 10,
 'param_subsample': 0.7148295489790479,
 'param_num_parallel_tree': 7,
 'param_min_child_weight': 8.100500759676423,
 'param_max_delta_step': 9.857091261659203,
 'param_colsample_bylevel': 0.4826516588867641,
 'param_colsample_bynode': 0.7289133194453435,
 'param_colsample_bytree': 0.7238114214160121,
 'param_min_split_loss': 0.524271049652073,
 'param_max_leaves': 34,
 'param_reg_alpha': 1.3968990877191585,
 'param_reg_lambda': 2.885833987262361,
 'num_rounds': 245}

In [15]:
study.best_trial.user_attrs

{'eval_metric': 'auc',
 'is_maximize_metric': True,
 'num_rounds': 209,
 'param_objective': 'binary:logistic',
 'train_metric_mean_at_trial_end': 0.8956501191099603,
 'train_std_at_trial_end': 0.00037159148216916153,
 'val_std_at_trial_end': 0.0012020103476224286}

In [16]:
study.best_trial._values

[0.7866482656529904]