# 06_hpo_all_models.ipynb

This notebook summarizes the hyperparameter optimization (HPO) for three gradient boosting models used in the Home Credit Default Risk project:

- LightGBM (with Hyperband)
- XGBoost
- CatBoost

All models are optimized using 5-fold stratified cross-validation and AUC as the evaluation metric. The HPO logic is standardized via a shared utility function (`run_oof_cv`) to ensure consistency and comparability.

Final parameters will be used in separate training and ensembling steps.

> Note: HPO was initially run in separate legacy notebooks for speed and testing. This consolidated version is intended for clarity and reproducibility in the GitHub project. The logs from those legacy runs are stored in `logs/hpo/`; their file names start with `legacy`.
---

## LightGBM HPO (TPE sampler + Hyperband pruner)

In [None]:
import os, sys, logging, pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
from sklearn.model_selection import StratifiedKFold

from lightgbm import early_stopping, log_evaluation

from creditutils.path_utils import get_project_root

# Get the root directory of the project so that every path below is
# independent of the notebook's current working directory.
proj_root = get_project_root()

from creditutils.hpo_utils import run_oof_cv

# Set up logging.
# force=True (Python 3.8+) removes any handlers already attached to the root
# logger before installing the file handler. Without it, basicConfig() is a
# silent no-op whenever the kernel has configured logging before (a re-run of
# this cell, or another HPO cell of this notebook executed earlier), and the
# messages of this session would be written to the wrong destination.
log_path = os.path.join(proj_root, "logs", "hpo", "lightgbm_hyperband.log")
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(
    filename=log_path, filemode='a',
    level=logging.INFO,
    format='%(asctime)s  %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    force=True,
)
logging.info("=== New Optuna session started (LightGBM + Hyperband) ===")

# Load dataset and split it into target and feature matrix.
# SK_ID_CURR is dropped together with the target so it is not used as a feature.
df = pd.read_csv(os.path.join(proj_root, "outputs", "03_train_features_autosearch_baseline.csv"))
y = df['TARGET']
X = df.drop(columns=['SK_ID_CURR','TARGET'])

# Convert object columns to the pandas 'category' dtype for LightGBM.
# A single astype() call builds a new frame instead of mutating X column by
# column, which keeps the cell idempotent on re-execution.
object_cols = X.select_dtypes(include=['object']).columns
X = X.astype({col: 'category' for col in object_cols})

# 5-fold stratified CV setup (fixed seed so every trial sees the same folds)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the Optuna objective function
def objective(trial):
    """Evaluate one LightGBM hyperparameter configuration for Optuna.

    Samples the tunable hyperparameters from ``trial``, builds an
    ``LGBMClassifier`` and passes it to ``run_oof_cv`` together with the
    module-level ``X``, ``y`` and ``cv``.

    Args:
        trial: The Optuna trial used to sample hyperparameters; it is also
            forwarded to ``run_oof_cv``.

    Returns:
        The value returned by ``run_oof_cv``, unchanged.
    """
    # Settings shared by every trial.
    base_settings = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'data_sample_strategy': 'goss',
    }

    # Tunable hyperparameters. The order of the suggest_* calls is kept stable
    # because the seeded sampler draws values sequentially.
    # NOTE(review): 'subsample' is LightGBM's bagging fraction, which the
    # LightGBM docs describe as only effective when bagging_freq
    # ('subsample_freq') > 0. That is not set here and GOSS sampling is
    # selected, so this dimension may have no effect on the model -- confirm.
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0030, 0.0055, log=True),
        'max_depth': trial.suggest_int('max_depth', 9, 13),
        'num_leaves': trial.suggest_int('num_leaves', 98, 160),
        'min_child_samples': trial.suggest_int('min_child_samples', 165, 195),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.005, 0.05, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 0.02, log=True),
        'subsample': trial.suggest_float('subsample', 0.75, 0.91),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.40, 0.51),
        'n_estimators': trial.suggest_int('n_estimators', 4000, 6000),
    }

    # Class balancing, seed and silenced library output.
    run_settings = {
        'class_weight': 'balanced',
        'random_state': 42,
        'verbosity': -1,
    }

    # Merge in the same key order as the settings are listed above, so the
    # logged parameter dictionary keeps a stable layout.
    params = {**base_settings, **sampled, **run_settings}

    classifier = lgb.LGBMClassifier(**params)

    # Early stopping after 100 rounds; log_evaluation(0) keeps stdout quiet.
    stop_early = early_stopping(100)
    quiet = log_evaluation(0)

    # Cross-validate and keep the score reported by the shared helper.
    score = run_oof_cv(
        classifier, X, y, cv, trial,
        log_prefix="LGBM",
        fit_params={'eval_metric': 'auc', 'callbacks': [stop_early, quiet]},
    )

    # Only reached when run_oof_cv returned normally.
    logging.info(f"LGBM Trial {trial.number} complete | Params: {params}\n")
    return score

# Set up Optuna study with TPE sampler and Hyperband pruner.
# The sampler is seeded so the sequence of suggested configurations is repeatable.
# NOTE(review): max_resource=5 presumably corresponds to the 5 CV folds being
# reported as pruning steps by run_oof_cv -- confirm against that helper.
study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    pruner=HyperbandPruner(min_resource=1, max_resource=5, reduction_factor=3)
)

# Start optimization.
# catch=(Exception,) makes Optuna mark a raising trial as failed and continue
# with the next one instead of aborting the whole study.
study.optimize(objective, n_trials=200, catch=(Exception,))

# Print best results.
# Because trial failures are caught above, it is possible that no trial
# completed at all; study.best_trial would then raise an opaque ValueError.
# Check explicitly and report that situation clearly instead.
completed_trials = [
    t for t in study.trials
    if t.state == optuna.trial.TrialState.COMPLETE
]
if completed_trials:
    print("Best trial parameters:", study.best_trial.params)
    print("Best CV AUC:", study.best_value)
else:
    print(
        f"No trial completed: all {len(study.trials)} trials failed or were pruned. "
        "Check the Optuna warnings above for the underlying errors."
    )

ModuleNotFoundError: No module named 'src'

## XGBoost HPO

XGBoost uses `MedianPruner` instead of Hyperband because of its much longer training time.

In [None]:
import os, sys, logging, pandas as pd, numpy as np, xgboost as xgb, optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from sklearn.model_selection import StratifiedKFold

from creditutils.path_utils import get_project_root

# Get project root directory so that every path below is independent of the
# notebook's current working directory.
proj_root = get_project_root()

from creditutils.hpo_utils import run_oof_cv

# Set up logging.
# force=True (Python 3.8+) removes any handlers already attached to the root
# logger before installing the file handler. Without it, basicConfig() is a
# silent no-op once logging has been configured in this kernel (e.g. by the
# LightGBM cell), and the XGBoost messages would be appended to the previously
# configured log file instead of xgboost_median.log.
log_path = os.path.join(proj_root, "logs", "hpo", "xgboost_median.log")
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(
    filename=log_path, filemode='a',
    level=logging.INFO,
    format='%(asctime)s  %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    force=True,
)
logging.info("=== New Optuna session started (XGBoost + MedianPruner) ===")

# Load dataset and split it into target and feature matrix.
# SK_ID_CURR is dropped together with the target so it is not used as a feature.
df = pd.read_csv(os.path.join(proj_root, "outputs", "03_train_features_autosearch_baseline.csv"))
y = df['TARGET']
X = df.drop(columns=['SK_ID_CURR','TARGET'])

# Convert object columns to the pandas 'category' dtype.
# A single astype() call builds a new frame instead of mutating X column by
# column, which keeps the cell idempotent on re-execution.
object_cols = X.select_dtypes(include=['object']).columns
X = X.astype({col: 'category' for col in object_cols})

# Compute class imbalance ratio (negatives per positive) on the full data set.
# The two-value unpacking relies on TARGET containing exactly the labels 0 and 1.
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos

# 5-fold stratified cross-validation (fixed seed so every trial sees the same folds)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the Optuna objective function
def objective(trial):
    """Evaluate one XGBoost hyperparameter configuration for Optuna.

    Samples the tunable hyperparameters from ``trial``, builds an
    ``XGBClassifier`` and passes it to ``run_oof_cv`` together with the
    module-level ``X``, ``y`` and ``cv``.

    Args:
        trial: The Optuna trial used to sample hyperparameters; it is also
            forwarded to ``run_oof_cv``.

    Returns:
        The value returned by ``run_oof_cv``, unchanged.
    """
    params = {
        # --- search space (suggest_* order kept stable for the seeded sampler) ---
        'n_estimators': trial.suggest_int('n_estimators', 800, 1200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 1.0),
        'gamma': trial.suggest_float('gamma', 2.5, 4.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 2.5, 4.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.5, 3.0),
        # --- fixed settings ---
        'scale_pos_weight': scale_pos_weight,  # handle class imbalance
        'enable_categorical': True,  # X carries pandas 'category' columns
        # 'use_label_encoder' was removed here: it is deprecated in the
        # XGBoost releases that accept 'early_stopping_rounds' in the
        # constructor, and newer releases report it as an unused parameter
        # on every fit.
        'tree_method': 'hist',  # fast histogram-based training
        'eval_metric': 'auc',
        # NOTE(review): early stopping needs a validation set at fit time;
        # presumably run_oof_cv passes one -- confirm against that helper.
        'early_stopping_rounds': 75,
        'random_state': 42,
        'n_jobs': 3
    }

    # Create XGBoost model
    model = xgb.XGBClassifier(**params)

    # Keep per-iteration evaluation output off stdout during fitting.
    fit_params = {'verbose': False}

    # Run CV and return the score reported by the shared helper.
    avg = run_oof_cv(model, X, y, cv, trial, log_prefix="XGB", fit_params=fit_params)

    # Only reached when run_oof_cv returned normally.
    logging.info(f"XGB Trial {trial.number} complete | Params: {params}\n")
    return avg

# Set up Optuna study with TPE sampler and Median pruner.
# The sampler is seeded so the sequence of suggested configurations is repeatable.
study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=42),
    pruner=MedianPruner(n_warmup_steps=2)
)

# Start optimization.
# catch=(Exception,) makes Optuna mark a raising trial as failed and continue
# with the next one instead of aborting the whole study.
study.optimize(objective, n_trials=60, catch=(Exception,))

# Output best results.
# Because trial failures are caught above, it is possible that no trial
# completed at all; study.best_trial would then raise an opaque ValueError.
# Check explicitly and report that situation clearly instead.
completed_trials = [
    t for t in study.trials
    if t.state == optuna.trial.TrialState.COMPLETE
]
if completed_trials:
    print("Best trial parameters:", study.best_trial.params)
    print("Best CV AUC:", study.best_value)
else:
    print(
        f"No trial completed: all {len(study.trials)} trials failed or were pruned. "
        "Check the Optuna warnings above for the underlying errors."
    )

## CatBoost HPO

CatBoost also uses `MedianPruner` instead of Hyperband because of its much longer training time.

In [None]:
import os, sys, logging, pandas as pd, optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

from creditutils.path_utils import get_project_root

# Get project root directory so that every path below is independent of the
# notebook's current working directory.
proj_root = get_project_root()

from creditutils.hpo_utils import run_oof_cv

# Set up logging.
# force=True (Python 3.8+) removes any handlers already attached to the root
# logger before installing the file handler. Without it, basicConfig() is a
# silent no-op once logging has been configured in this kernel (e.g. by the
# LightGBM or XGBoost cell), and the CatBoost messages would be appended to the
# previously configured log file instead of catboost_median.log.
log_path = os.path.join(proj_root, "logs", "hpo", "catboost_median.log")
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(
    filename=log_path, filemode='a',
    level=logging.INFO,
    format='%(asctime)s  %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    force=True,
)
logging.info("=== New Optuna session started (CatBoost + MedianPruner) ===")

# Load dataset and split it into target and feature matrix.
# SK_ID_CURR is dropped together with the target so it is not used as a feature.
df = pd.read_csv(os.path.join(proj_root, "outputs", "03_train_features_autosearch_baseline.csv"))
y = df['TARGET']
X = df.drop(columns=['SK_ID_CURR','TARGET'])

# Identify categorical columns; the list is also passed to CatBoost as
# 'cat_features' inside the objective.
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
# A single astype() call builds a new frame instead of mutating X column by
# column, which keeps the cell idempotent on re-execution.
X = X.astype({col: 'category' for col in cat_cols})

# 5-fold stratified cross-validation (fixed seed so every trial sees the same folds)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the Optuna objective function
def objective(trial):
    """Evaluate one CatBoost hyperparameter configuration for Optuna.

    Samples the tunable hyperparameters from ``trial``, builds a
    ``CatBoostClassifier`` and passes it to ``run_oof_cv`` together with the
    module-level ``X``, ``y`` and ``cv``.

    Args:
        trial: The Optuna trial used to sample hyperparameters; it is also
            forwarded to ``run_oof_cv``.

    Returns:
        The value returned by ``run_oof_cv``, unchanged.
    """
    # Tunable hyperparameters. The order of the suggest_* calls is kept stable
    # because the seeded sampler draws values sequentially.
    sampled = {
        'iterations': trial.suggest_int('iterations', 1000, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.07, log=True),
        'depth': trial.suggest_int('depth', 5, 7),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 4.5, 8.5),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.3, 0.8),
        'border_count': trial.suggest_int('border_count', 50, 130),
    }

    # Settings shared by every trial.
    fixed = {
        'auto_class_weights': 'Balanced',  # handle class imbalance
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': 0,
        'task_type': 'CPU',
        'early_stopping_rounds': 75,
        'cat_features': cat_cols,
    }

    # Sampled values first, fixed settings second, so the logged parameter
    # dictionary keeps a stable layout.
    params = {**sampled, **fixed}

    classifier = CatBoostClassifier(**params)

    # Ask CatBoost to keep the best iteration found during fitting.
    # NOTE(review): this needs a validation set at fit time; presumably
    # run_oof_cv passes one -- confirm against that helper.
    score = run_oof_cv(
        classifier, X, y, cv, trial,
        log_prefix="Cat",
        fit_params={'use_best_model': True},
    )

    # Only reached when run_oof_cv returned normally.
    logging.info(f"Cat Trial {trial.number} complete | Params: {params}\n")
    return score

# Set up Optuna study with TPE sampler and Median pruner.
# The sampler is seeded so the sequence of suggested configurations is repeatable.
study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=42),
    pruner=MedianPruner(n_warmup_steps=2)
)

# Start optimization.
# catch=(Exception,) makes Optuna mark a raising trial as failed and continue
# with the next one instead of aborting the whole study.
study.optimize(objective, n_trials=60, catch=(Exception,))

# Output best results.
# Because trial failures are caught above, it is possible that no trial
# completed at all; study.best_trial would then raise an opaque ValueError.
# Check explicitly and report that situation clearly instead.
completed_trials = [
    t for t in study.trials
    if t.state == optuna.trial.TrialState.COMPLETE
]
if completed_trials:
    print("Best trial parameters:", study.best_trial.params)
    print("Best CV AUC:", study.best_value)
else:
    print(
        f"No trial completed: all {len(study.trials)} trials failed or were pruned. "
        "Check the Optuna warnings above for the underlying errors."
    )

[I 2025-07-12 00:53:02,370] A new study created in memory with name: no-name-75ba9bab-0320-44df-873d-0d9fb307651e
[W 2025-07-12 00:56:51,048] Trial 0 failed with parameters: {'iterations': 1187, 'learning_rate': 0.06713701822915177, 'depth': 7, 'l2_leaf_reg': 6.894633936788146, 'bagging_temperature': 0.37800932022121825, 'border_count': 62} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\Users\tgruenecker\AppData\Local\Programs\Python\Python312\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\tgruenecker\AppData\Local\Temp\ipykernel_6152\1442551076.py", line 51, in objective
    avg = run_oof_cv(model, X, y, cv, trial, log_prefix="Cat", fit_params=fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\tgruenecker\OneDrive\Desktop\Master_Studium\3. Semester\Home_Credit_Pr

KeyboardInterrupt: 