# Hyperparameter Tuning with Optuna

This notebook performs hyperparameter optimization for the LightGBM model using Optuna.
We optimize all relevant hyperparameters to find the best configuration for credit risk prediction.

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from optuna.integration import LightGBMPruningCallback
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    log_loss,
    brier_score_loss,
)
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

from credit_risk_xai.config import FEATURE_CACHE_PATH
from credit_risk_xai.features.engineer import prepare_modeling_data
from credit_risk_xai.modeling.utils import split_train_validation

## Load and Prepare Data

In [None]:
# Read the cached feature table and apply the same row filters as the 05a notebook.
# All four conditions are AND-ed together inside a single .loc selection.
df = pd.read_parquet(FEATURE_CACHE_PATH).loc[
    lambda frame: (
        frame["ser_aktiv"].eq(1)
        & frame["sme_category"].isin(["Small", "Medium"])
        & frame["knc_kncfall"].eq(1)
        & frame["bransch_borsbransch_konv"].ne("40.0")
    )
]

# Build the feature matrix / target pair, then report its size and class balance.
X, y = prepare_modeling_data(df)
print(f"Dataset shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts(normalize=True)}")

In [None]:
# Partition X / y into training and validation sets with the same fixed
# arguments as the training run (test_size=0.2 is presumably the validation
# fraction -- confirm against split_train_validation).
X_train, X_val, y_train, y_val = split_train_validation(X, y, test_size=0.2, random_state=42)

# Report row counts and the mean of the target for each partition.
print(
    f"Training set: {X_train.shape[0]} samples",
    f"Validation set: {X_val.shape[0]} samples",
    f"Training positive rate: {y_train.mean():.4f}",
    f"Validation positive rate: {y_val.mean():.4f}",
    sep="\n",
)

## Define Optuna Objective Function

We optimize all relevant LightGBM hyperparameters:
- **Tree structure**: `num_leaves`, `max_depth`, `min_child_samples`, `min_child_weight`
- **Regularization**: `reg_alpha` (L1), `reg_lambda` (L2), `min_split_gain`
- **Sampling**: `subsample` (bagging), `colsample_bytree` (feature fraction), `subsample_freq`
- **Learning**: `learning_rate`, `n_estimators` (with early stopping)

In [None]:
def objective(trial: optuna.Trial) -> float:
    """
    Optuna objective for LightGBM hyperparameter optimization.

    Samples one hyperparameter configuration, fits an ``LGBMClassifier`` with
    early stopping on each of 5 stratified folds of the notebook-level
    ``X_train`` / ``y_train``, and scores every fold by log loss on its
    held-out part.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample hyperparameters and to report intermediate
        values to the study's pruner.

    Returns
    -------
    float
        Mean log loss over the 5 folds (the study minimizes this value).

    Raises
    ------
    optuna.TrialPruned
        Raised by the pruning callback during the first fold when the pruner
        decides the trial is not promising.
    """
    # Hyperparameter search space
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "random_state": 42,
        "n_jobs": -1,

        # Learning rate and boosting rounds
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": 10_000,  # Upper bound only; early stopping picks the actual count

        # Tree structure
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),

        # Regularization
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),

        # Sampling (per the LightGBM docs, subsample_freq=0 disables bagging,
        # in which case the sampled `subsample` value has no effect)
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 0, 10),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),

        # Class imbalance handling (fixed, not searched)
        "is_unbalance": False,
    }

    # Cross-validation with stratified folds
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        X_fold_train, y_fold_train = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_fold_val, y_fold_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=False)]

        # The pruning callback reports the validation log loss to the trial
        # with step == boosting iteration. Optuna ignores a report for a step
        # that was already reported, so attaching the callback to every fold
        # would make folds 2..5 take pruning decisions on stale first-fold
        # values (or on a curve stitched together from several folds).
        # Only the first fold therefore drives pruning; the remaining folds
        # always run to early stopping.
        if fold == 0:
            callbacks.append(LightGBMPruningCallback(trial, "binary_logloss"))

        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_fold_train,
            y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            callbacks=callbacks,
        )

        # Evaluate on the fold's held-out part (second predict_proba column)
        y_pred_proba = model.predict_proba(X_fold_val)[:, 1]
        cv_scores.append(log_loss(y_fold_val, y_pred_proba))

    # Plain Python float, matching the declared return type
    return float(np.mean(cv_scores))

## Run Hyperparameter Optimization

In [None]:
# Set up a minimization study (the objective returns the mean CV log loss).
# The TPE sampler is seeded for reproducible suggestions; the median pruner is
# configured with n_startup_trials=10 and n_warmup_steps=20.
study = optuna.create_study(
    study_name="lightgbm_credit_risk",
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=20),
)

# Launch the search. Trials run one at a time (n_jobs=1) because every
# LightGBM fit inside a trial already uses n_jobs=-1. Adjust n_trials to the
# available compute time.
study.optimize(objective, n_trials=100, n_jobs=1, show_progress_bar=True)

In [None]:
# Summarize the winning trial: objective value first, then each sampled hyperparameter.
print("Best trial:")
print(f"  Value (Log Loss): {study.best_trial.value:.4f}")
print("\nBest hyperparameters:")
for name, setting in study.best_trial.params.items():
    print(f"  {name}: {setting}")

# Sanity check: start from the fixed settings, layer the best trial's
# hyperparameters on top, and refit once on the full training split with
# early stopping monitored on the validation split.
best_params_check = dict(
    objective="binary",
    metric="binary_logloss",
    verbosity=-1,
    random_state=42,
    n_jobs=-1,
    n_estimators=10_000,
)
best_params_check.update(study.best_trial.params)

model_check = lgb.LGBMClassifier(**best_params_check).fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
)

# Second predict_proba column is used as the score for the metrics below.
y_pred_check = model_check.predict_proba(X_val)[:, 1]
print("\nValidation metrics with best params:")
print(f"  ROC-AUC: {roc_auc_score(y_val, y_pred_check):.4f}")
print(f"  Log Loss: {log_loss(y_val, y_pred_check):.4f}")
print(f"  Best iteration: {model_check.best_iteration_}")

## Visualize Optimization Results

In [None]:
# Objective value per trial over the course of the study.
# `plotly` is imported but not referenced by name in this cell
# (presumably kept as an explicit dependency check -- confirm).
import plotly
fig = optuna.visualization.plot_optimization_history(study=study)
fig.show()

In [None]:
# Importance of each searched hyperparameter for the objective value.
fig = optuna.visualization.plot_param_importances(study=study)
fig.show()

In [None]:
# Parallel-coordinate view of the trials across all searched hyperparameters.
fig = optuna.visualization.plot_parallel_coordinate(study=study)
fig.show()

In [None]:
# Slice plots restricted to five of the searched hyperparameters.
fig = optuna.visualization.plot_slice(
    study=study,
    params=["learning_rate", "num_leaves", "max_depth", "reg_alpha", "reg_lambda"],
)
fig.show()

## Train Final Model with Best Parameters

In [None]:
# Assemble the final-model configuration: fixed settings first, then the best
# trial's sampled hyperparameters layered on top.
best_params = dict(
    objective="binary",
    metric="binary_logloss",
    verbosity=-1,
    random_state=42,
    n_jobs=-1,
    n_estimators=10_000,
)
best_params.update(study.best_trial.params)

print("Best parameters for final model:")
for name, setting in best_params.items():
    print(f"  {name}: {setting}")

In [None]:
# Fit the final classifier on the whole training split. Both the training and
# the validation split are passed as eval sets; an evaluation line is logged
# every 100 rounds and early stopping uses a patience of 50 rounds.
final_model = lgb.LGBMClassifier(**best_params).fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50, verbose=False),
        lgb.log_evaluation(period=100),
    ],
)

print(f"\nBest iteration: {final_model.best_iteration_}")

In [None]:
# Score the validation split with the final model (second predict_proba column).
y_pred_proba = final_model.predict_proba(X_val)[:, 1]

# Apply each scorer to the same (y_val, y_pred_proba) pair, keyed by display name.
metrics = {
    label: score_fn(y_val, y_pred_proba)
    for label, score_fn in [
        ("ROC-AUC", roc_auc_score),
        ("PR-AUC", average_precision_score),
        ("Log Loss", log_loss),
        ("Brier Score", brier_score_loss),
    ]
}

print("\nFinal Model Validation Metrics:")
for metric, value in metrics.items():
    print(f"  {metric}: {value:.4f}")

## Export Best Parameters

In [None]:
# Persist the best parameters as JSON so other notebooks can load them.
import json
from credit_risk_xai.config import PROJ_ROOT

output_path = PROJ_ROOT / "models" / "best_lgbm_params.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Leave out the two runtime-only settings (verbosity, n_jobs); every other
# entry of best_params is written unchanged.
params_to_save = {
    name: setting
    for name, setting in best_params.items()
    if name not in ("verbosity", "n_jobs")
}

with output_path.open("w") as f:
    json.dump(params_to_save, f, indent=2)

print(f"Best parameters saved to: {output_path}")

In [None]:
# Emit the best trial's hyperparameters as a Python dict literal for copy/paste.
# Floats are shortened to 6 significant digits, strings are double-quoted, and
# everything else (ints, bools) is printed as-is.
print("\n# Copy these parameters to use in run_lightgbm_training():")
print("params = {")
for k, v in study.best_trial.params.items():
    if isinstance(v, float):
        print(f'    "{k}": {v:.6g},')
    elif isinstance(v, str):
        print(f'    "{k}": "{v}",')
    else:
        print(f'    "{k}": {v},')
print("}")

## Compare with Default Parameters

In [None]:
# Baseline for comparison: a hand-picked configuration trained and early-stopped
# the same way as the tuned model (patience of 50 rounds on the validation split).
default_params = dict(
    objective="binary",
    n_estimators=10_000,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
    is_unbalance=False,
    metric="binary_logloss",
)

default_model = lgb.LGBMClassifier(**default_params).fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
)

# Second predict_proba column is used as the score for the metrics below.
y_pred_default = default_model.predict_proba(X_val)[:, 1]

# Same four validation metrics as computed for the tuned model.
default_metrics = {
    label: score_fn(y_val, y_pred_default)
    for label, score_fn in [
        ("ROC-AUC", roc_auc_score),
        ("PR-AUC", average_precision_score),
        ("Log Loss", log_loss),
        ("Brier Score", brier_score_loss),
    ]
}

In [None]:
# One row per model ("Default", "Optimized"), one column per metric.
comparison = pd.DataFrame(
    {
        "Default": default_metrics,
        "Optimized": metrics,
    }
).transpose()

# Differences relative to the "Default" row (so that row's deltas are 0).
comparison["Δ ROC-AUC"] = comparison["ROC-AUC"].sub(comparison.at["Default", "ROC-AUC"])
comparison["Δ Log Loss"] = comparison["Log Loss"].sub(comparison.at["Default", "Log Loss"])

print("\nComparison: Default vs Optimized Parameters")
print("="*60)
print(comparison.round(4).to_string())

## Summary

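The Optuna optimization searched over:
- **Learning rate**: 0.01 - 0.3 (log scale), with `n_estimators` capped at 10,000 and the actual number of rounds chosen by early stopping
- **Tree structure**: num_leaves (8-256), max_depth (3-12), min_child_samples (5-100), min_child_weight (1e-3 - 10, log scale)
- **Regularization**: L1 (reg_alpha) and L2 (reg_lambda), both 1e-8 - 10 on a log scale; min_split_gain (0 - 1)
- **Sampling**: subsample (0.5-1.0), subsample_freq (0-10), colsample_bytree (0.5-1.0)
- **Class imbalance**: not tuned — `is_unbalance` is fixed to `False` and `scale_pos_weight` is not set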

The best parameters have been saved to `models/best_lgbm_params.json` for use in other notebooks.