# Hyper-parameter tuning – Regression Dataset  
This notebook finds the best parameters for:

* **Bagging** RandomForestRegressor  
* **Boosting** XGBRegressor  
* **Stack meta learner** Ridge α in the RF+XGB+OLS stack

The search spaces are small enough for GridSearchCV; swap in Optuna if you
need more power. Results are saved to `configs/regression.yaml`.

## Setup & Imports

In [None]:
import yaml, pathlib, json, optuna

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.metrics import mean_squared_error
from optuna.visualization import plot_optimization_history

from bagging_boosting_stacking_study.constants import SEED
from bagging_boosting_stacking_study.data.loaders import load_dataset

DATASET_NAME = "regression"

In [2]:
# dark theme across plots
pio.templates.default = "plotly_dark"
plt.style.use("dark_background")

## Load Clean Dataset

In [3]:
df = load_dataset(DATASET_NAME, raw=False)

### train test split

In [4]:
train_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)

In [5]:
train_df.shape

(900, 10)

In [6]:
test_df.shape

(100, 10)

## Bagging

### pipeline



### hyper-parameter tuning

In [7]:
# Split X and y
y = train_df["target"].values
X = train_df.drop(columns="target").values

# Cross-validation strategy: 5-fold, repeated twice (10 scores / trial)
cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=SEED)


# Objective function for Optuna
def objective(trial):
    # Hyper-parameter search space

    # (a) Ensemble size
    # NOTE: log=True lets the TPE sampler prefer multiplicative jumps (50 -> 100 ->
    # 200 -> 400 -> 600) instead of linear ones.
    n_estimators = trial.suggest_categorical("n_estimators", [50, 100, 200, 400, 600])

    # (b) Depth: cap or unlimited
    # NOTE: A binary switch (“cap or not”) keeps unlimited depth in play but also
    # explores explicit caps 3 – 20. Optuna learns which branch is better and spends
    # more trials there.
    if trial.suggest_categorical("cap_depth", [True, False]):
        max_depth = trial.suggest_int("max_depth", 3, 20)
    else:
        max_depth = None

    # (c) Tree-shape regularisation
    # NOTE: Bigger numbers prune small, noisy branches.
    min_samples_split = trial.suggest_int("min_samples_split", 2, 16)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 8)

    # (d) Feature subsampling (9 predictors -> all predictors)
    max_features = trial.suggest_int("max_features", 3, 9, step=1)

    # (e) Bootstrap & row subsampling
    # NOTE:
    # - `bootstrap=False`: Every tree sees the entire training set once (no resampling).
    # - `bootstrap=True`: The tree is trained on a bootstrap sample: 900 rows are drawn
    # with replacement from training set (some rows appear multiple times, ≈36 % are
    # left out). Bagging de-correlates the trees, usually lowering variance. The
    # “left-out” rows automatically form an out-of-bag (OOB) set, that can be later used
    # to calculate quick generalisation score by setting oob_score=True (it's not
    # possible when `bootstrap=False`).
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])
    max_samples = (
        trial.suggest_float("max_samples", 0.6, 1.0, step=0.1) if bootstrap else None
    )

    # (f) Impurity, pruning
    criterion = trial.suggest_categorical(
        "criterion", ["squared_error", "absolute_error"]
    )
    # NOTE: `ccp_alpha` removes branches whose benefit in impurity reduction is smaller 
    # than an alpha-weighted penalty for model size. Larger values -> more nodes pruned, 
    # shallower trees.
    ccp_alpha = trial.suggest_float("ccp_alpha", 1e-6, 1e-3, log=True)

    # (g) Out-of-bag score only legal when bootstrap
    oob_score = bootstrap and trial.suggest_categorical("oob_score", [True, False])

    # Model
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        max_samples=max_samples,
        criterion=criterion,
        ccp_alpha=ccp_alpha,
        oob_score=oob_score,
        n_jobs=-1,
        random_state=SEED,
    )

    # CV evaluation
    # NOTE: scikit-learn returns negative RMSE so I negate “neg_root_mean_squared_error”
    # again to get a positive value to minimise.
    rmse = -cross_val_score(
        rf, X=X, y=y, cv=cv, scoring="neg_root_mean_squared_error", n_jobs=-1
    ).mean()

    return rmse


# Create & run the study
study_rf = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=SEED),  # reproducible search
    pruner=HyperbandPruner(),  # aggressive early stopping
    study_name="RF_reproducible",
)

# Robust search: either n_trials or timeout
study_rf.optimize(objective, n_trials=200, show_progress_bar=True)
# study.optimize(objective, timeout=3600, show_progress_bar=True)

# Inspect the result
print("Best RMSE :", study_rf.best_value)
print("Best parameters:", study_rf.best_params)

[I 2025-05-19 23:01:32,069] A new study created in memory with name: RF_reproducible


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-05-19 23:01:34,803] Trial 0 finished with value: 97.56223968355566 and parameters: {'n_estimators': 100, 'cap_depth': False, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 9, 'bootstrap': False, 'criterion': 'squared_error', 'ccp_alpha': 3.338880395528818e-06}. Best is trial 0 with value: 97.56223968355566.
[I 2025-05-19 23:01:42,871] Trial 1 finished with value: 86.96821861269207 and parameters: {'n_estimators': 600, 'cap_depth': True, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 4, 'bootstrap': False, 'criterion': 'absolute_error', 'ccp_alpha': 5.4410212245202764e-06}. Best is trial 1 with value: 86.96821861269207.
[I 2025-05-19 23:01:43,467] Trial 2 finished with value: 63.615905176593955 and parameters: {'n_estimators': 200, 'cap_depth': False, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 4, 'bootstrap': False, 'criterion': 'squared_error', 'ccp_alpha': 1.3435855122067693e-06}. Best is trial 2 with value: 6

In [21]:
fig = plot_optimization_history(study_rf)
fig.show()

In [8]:
print("Best RMSE :", study_rf.best_value)

Best RMSE : 60.56165065016895


In [9]:
study_rf.best_params

{'n_estimators': 600,
 'cap_depth': False,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 7,
 'bootstrap': True,
 'max_samples': 1.0,
 'criterion': 'squared_error',
 'ccp_alpha': 0.00048444934038558216,
 'oob_score': True}

In [17]:
# Copy Optuna’s best parameters
best_rf_params = study_rf.best_params.copy()
cap_flag  = best_rf_params.pop("cap_depth")          # remove it; returns True/False
max_depth = best_rf_params.pop("max_depth", None)    # may or may not exist

best_rf_params["max_depth"] = max_depth if cap_flag else None
best_rf_params.update(
    {
        "random_state": SEED,   # full reproducibility
        "n_jobs": -1,           # or 1 for bit-perfect runs
    }
)

# Fit on all 900 training rows
rf_final = RandomForestRegressor(**best_rf_params)
rf_final.fit(X, y)

## Boosting

In [10]:
# Split X and y
y = train_df["target"].values
X = train_df.drop(columns="target").values

cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=SEED)


# Objective function
def objective(trial):
    # Core boosting configuration
    # NOTE: total number of trees; too many -> overfitting & longer training
    n_estimators = trial.suggest_int("n_estimators", 100, 2000, step=50)
    # NOTE: shrinkage of each tree's contribution; smaller -> slower but more accurate
    learning_rate = trial.suggest_float("learning_rate", 0.005, 0.3, log=True)
    # NOTE: maximum depth of individual trees; deeper -> more complex models
    max_depth = trial.suggest_int("max_depth", 2, 10)  # depth of each tree
    # NOTE: minimum sum of instance weight needed in a child; higher -> more
    # conservative splits
    min_child_weight = trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True)

    # Sub-sampling for variance reduction
    # NOTE: fraction of rows sampled per tree; lower -> prevents overfitting
    subsample = trial.suggest_float("subsample", 0.5, 1.0, step=0.1)
    # NOTE: fraction of features sampled per tree; lower -> more diverse trees
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0, step=0.1)

    # Regularisation
    # NOTE: minimum loss reduction to make a split; larger -> more conservative
    gamma = trial.suggest_float("gamma", 0.0, 5.0)  # minimum loss-reduction to split
    # NOTE: L2 regularization term on weights; larger -> prevents large weights
    reg_lambda = trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True)
    # NOTE: L1 regularization term on weights; larger -> induces sparsity
    reg_alpha = trial.suggest_float("reg_alpha", 0.0, 5.0)

    # Control for imbalanced data
    # NOTE: maximum delta step we allow each tree's weight estimation; helps with class
    # imbalance
    max_delta_step = trial.suggest_int("max_delta_step", 0, 10)

    # Tree growth algorithm
    # NOTE: 'depthwise' grows balanced trees; 'lossguide' focuses on high-loss regions
    grow_policy = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if grow_policy == "lossguide":
        max_depth = 0  # NOTE: depth limit disabled when optimizing by leaf count
        # NOTE: maximum number of leaves; more leaves -> more complex splits
        max_leaves = trial.suggest_int("max_leaves", 16, 256, step=16)
    else:
        max_leaves = 0  # NOTE: not used in 'depthwise' mode

    # Build the model
    xgb = XGBRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        max_leaves=max_leaves,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        max_delta_step=max_delta_step,
        grow_policy=grow_policy,
        objective="reg:squarederror",  # regression objective minimizing squared error
        tree_method="hist",  # fast histogram grower
        random_state=SEED,
        n_jobs=-1,
    )

    # CV evaluation (neg RMSE -> minimise RMSE)
    rmse = -cross_val_score(
        xgb, X, y, cv=cv, scoring="neg_root_mean_squared_error", n_jobs=-1
    ).mean()

    return rmse


# Create & run the Optuna study
study_xgb = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=SEED),
    pruner=HyperbandPruner(),  # stops weak configs early
    study_name="XGB_reproducible",
)

study_xgb.optimize(objective, n_trials=2000, show_progress_bar=True)
# study.optimize(objective, timeout=3600, show_progress_bar=True)

print("Best RMSE :", study_xgb.best_value)
print("Best parameters:", study_xgb.best_params)

[I 2025-05-19 23:09:18,156] A new study created in memory with name: XGB_reproducible


  0%|          | 0/2000 [00:00<?, ?it/s]

[I 2025-05-19 23:09:21,417] Trial 0 finished with value: 52.76386470116263 and parameters: {'n_estimators': 1150, 'learning_rate': 0.09889051819023266, 'max_depth': 2, 'min_child_weight': 0.020958259416674875, 'subsample': 0.7, 'colsample_bytree': 0.5, 'gamma': 0.5226509408623542, 'reg_lambda': 0.00245329663113718, 'reg_alpha': 1.2270165725677717, 'max_delta_step': 9, 'grow_policy': 'lossguide', 'max_leaves': 64}. Best is trial 0 with value: 52.76386470116263.
[I 2025-05-19 23:09:22,581] Trial 1 finished with value: 169.99430341423405 and parameters: {'n_estimators': 450, 'learning_rate': 0.01021685383394356, 'max_depth': 5, 'min_child_weight': 5.847260906636588, 'subsample': 0.7, 'colsample_bytree': 1.0, 'gamma': 4.999068831521338, 'reg_lambda': 0.8040232616705578, 'reg_alpha': 2.421143372711623, 'max_delta_step': 1, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 52.76386470116263.
[I 2025-05-19 23:09:23,377] Trial 2 finished with value: 170.88783491739446 and parameters: {'

In [None]:
fig = plot_optimization_history(study_xgb)
fig.show()

In [12]:
print("Best RMSE :", study_xgb.best_value)

Best RMSE : 17.409483708876472


In [13]:
study_xgb.best_params

{'n_estimators': 1950,
 'learning_rate': 0.06101246351814552,
 'max_depth': 2,
 'min_child_weight': 0.0026112722871177462,
 'subsample': 0.5,
 'colsample_bytree': 0.5,
 'gamma': 0.10122997931140776,
 'reg_lambda': 0.0012464460745862215,
 'reg_alpha': 2.4725037721438725,
 'max_delta_step': 0,
 'grow_policy': 'depthwise'}

In [14]:
best_params = study_xgb.best_params.copy()
best_params.update(
    {
        "objective": "reg:squarederror",
        "tree_method": "hist",
        "random_state": SEED,
        "n_jobs": -1,
    }
)
xgb_final = XGBRegressor(**best_params)
xgb_final.fit(X, y)

## Stacking

In [20]:
y = train_df["target"].values
X = train_df.drop(columns="target").values

# Fixed base learners (best params from earlier Optuna runs)
rf_best = study_rf.best_params.copy()
cap_flag  = rf_best.pop("cap_depth")          # remove it; returns True/False
max_depth = rf_best.pop("max_depth", None)    # may or may not exist
rf_best["max_depth"] = max_depth if cap_flag else None
rf_best.update(dict(random_state=SEED, n_jobs=-1))
rf_base = RandomForestRegressor(**rf_best)

xgb_best = study_xgb.best_params.copy()
xgb_best.update(
    dict(objective="reg:squarederror", tree_method="hist", random_state=SEED, n_jobs=-1)
)
xgb_base = XGBRegressor(**xgb_best)

ols_base = Pipeline([("sc", StandardScaler()), ("ols", LinearRegression())])

base_estimators = [
    ("rf", rf_base),
    ("xgb", xgb_base),
    ("ols", ols_base),
]

# Cross-validation strategy
cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=SEED)


# Optuna objective – search only meta alpha
def objective(trial):
    alpha = trial.suggest_float("alpha", 1e-4, 1e2, log=True)

    meta = Pipeline(
        [("sc", StandardScaler()), ("ridge", Ridge(alpha=alpha, random_state=SEED))]
    )

    stack = StackingRegressor(
        estimators=base_estimators,
        final_estimator=meta,
        passthrough=False,
        n_jobs=-1,
    )

    rmse = -cross_val_score(
        stack, X, y, cv=cv, scoring="neg_root_mean_squared_error", n_jobs=-1
    ).mean()

    return rmse



# Run the Optuna study
study_stack = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=SEED),
    pruner=HyperbandPruner(),
    study_name="stack_alpha",
)

study_stack.optimize(objective, n_trials=100, show_progress_bar=True)
# study_stack.optimize(objective, timeout=3600, show_progress_bar=True)

print("Best RMSE :", study_stack.best_value)
print("Best lambda (alpha):", study_stack.best_params["alpha"])

[I 2025-05-19 23:37:02,558] A new study created in memory with name: stack_alpha


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-05-19 23:37:21,193] Trial 0 finished with value: 3.012831200793709 and parameters: {'alpha': 0.18186390922332088}. Best is trial 0 with value: 3.012831200793709.
[I 2025-05-19 23:37:39,951] Trial 1 finished with value: 4.241783014790185 and parameters: {'alpha': 2.364309718481539}. Best is trial 0 with value: 3.012831200793709.
[I 2025-05-19 23:37:58,669] Trial 2 finished with value: 2.984459844033179 and parameters: {'alpha': 0.00012626665521941718}. Best is trial 2 with value: 2.984459844033179.
[I 2025-05-19 23:38:17,520] Trial 3 finished with value: 2.9849686373003417 and parameters: {'alpha': 0.009594731314082687}. Best is trial 2 with value: 2.984459844033179.
[I 2025-05-19 23:38:36,240] Trial 4 finished with value: 2.9854000418428974 and parameters: {'alpha': 0.016305380928534015}. Best is trial 2 with value: 2.984459844033179.
[I 2025-05-19 23:38:54,321] Trial 5 finished with value: 2.984463107789968 and parameters: {'alpha': 0.00019490455752180455}. Best is trial 2 wit

In [22]:
fig = plot_optimization_history(study_stack)
fig.show()

In [23]:
print("Best RMSE :", study_stack.best_value)

Best RMSE : 2.984458626683236


In [24]:
print("Best lambda (alpha):", study_stack.best_params["alpha"])

Best lambda (alpha): 0.00010063175752594249


In [25]:
best_alpha = study_stack.best_params["alpha"]

meta_final = Pipeline(
    [("sc", StandardScaler()), ("ridge", Ridge(alpha=best_alpha, random_state=SEED))]
)

stack_final = StackingRegressor(
    estimators=base_estimators,
    final_estimator=meta_final,
    passthrough=False,
    n_jobs=-1,
)

stack_final.fit(X, y)

## Save Best Params

In [None]:
# dataset_name = "regression"
# best_params = {
#     dataset_name: {
#         "bagging": {"rf": study_rf.best_params},
#         "boosting": {"xgb": study_xgb.best_params},
#         "stacking": {"stack": {"ridge_alpha": best_alpha}},
#     }
# }
# cfg_path = pathlib.Path(f"configs/{dataset_name}.yaml")
# cfg_path.write_text(yaml.safe_dump(best_params[dataset_name]))
# print(f"Best parameters for {dataset_name} successfully saved.")