# Model selection + Hyperparameter tuning

In practice, model selection and hyperparameter tuning are typically combined with:
* **clean train/validation/test** discipline (or nested CV - see below),
* **pipelines** (preprocessing + model in one unit, cross-validated end-to-end),
* **cross-validation** with multiple metrics,
* a **final hold-out test** to report unbiased performance.

## Outline (playbook)
1. **Frame the problem & metric(s)** (e.g., for classification: ROC AUC (ranking quality), F1 (class imbalance), accuracy (sanity check), log loss (probability quality)).
2. **Split once → train / test** (keep test untouched until the very end).
3. **Build a Pipeline** = preprocessing (e.g., imputation, scaling) + model.
4. Define **candidate models + param grids**.
5. **Cross-validated grid search per model**; collect CV metrics.
6. **Compare models on CV**; pick the best (by primary metric).
7. Refit winner on full train; evaluate on test; save artifacts.
8. (Optional) Calibrate probabilities, threshold tuning for business tradeoffs.

In [4]:
# ======================================
# Model Selection + Hyperparameter Tuning (Classification)
# ======================================

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, f1_score, accuracy_score, log_loss, classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# --------------------------
# 0) Data: built-in binary classification dataset, loaded as DataFrame / Series
# --------------------------
X, y = load_breast_cancer(as_frame=True, return_X_y=True)

# Optional sanity check:
# print(X.head(), y.value_counts())

# Carve off a stratified 20% test set; it is reserved for the final evaluation only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

# --------------------------
# 1) Preprocessing
# --------------------------
# Every column here is numeric, so one scaling branch covers them all. The
# ColumnTransformer wrapper is kept so other column types can be added later.
num_features = list(X_train.columns)
numeric_transformer = Pipeline([("scaler", StandardScaler())])

preprocess = ColumnTransformer(
    [("num", numeric_transformer, num_features)],
    remainder="drop",
)

# --------------------------
# 2) Candidate models and parameter grids
# --------------------------
# Grid keys use the "model__" prefix so they address the pipeline step named
# "model" when the estimator is wrapped in a Pipeline.
logreg_grid = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__penalty": ["l2"],
    "model__class_weight": [None, "balanced"],
}
rf_grid = {
    "model__n_estimators": [200, 500],
    "model__max_depth": [None, 5, 10, 20],
    "model__min_samples_leaf": [1, 2, 5],
    "model__class_weight": [None, "balanced"],
}
gb_grid = {
    "model__n_estimators": [200, 500],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [2, 3],
    "model__min_samples_leaf": [1, 2, 5],
}

# Each entry: (short name, unfitted estimator, parameter grid to search).
models_and_grids = [
    ("logreg", LogisticRegression(solver="lbfgs", max_iter=1000), logreg_grid),
    ("rf", RandomForestClassifier(random_state=42), rf_grid),
    ("gb", GradientBoostingClassifier(random_state=42), gb_grid),
]

# --------------------------
# 3) Cross-validation setup & scoring
# --------------------------
# Shuffled, stratified 5-fold splitter with a fixed seed, shared by all searches.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ROC AUC is the primary metric; the remaining scorers are collected for context.
# Each key doubles as the scikit-learn scorer name.
scoring = {
    metric: metric
    for metric in ("roc_auc", "f1", "accuracy", "neg_log_loss")
}

# --------------------------
# 4) Run GridSearchCV for each model
# --------------------------
# One row of cross-validated metrics per candidate model (at its best parameters).
comparison_rows = []
# Model name -> pipeline refit by GridSearchCV on all of X_train with the best params.
best_estimators = {}

for name, estimator, param_grid in models_and_grids:
    # Preprocessing lives inside the pipeline so it is re-fit on every CV training
    # fold and never sees the corresponding validation fold.
    pipe = Pipeline(steps=[
        ("preprocess", preprocess),
        ("model", estimator)
    ])

    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring=scoring,
        refit="roc_auc",            # refit the best params per model using ROC AUC
        cv=cv,
        n_jobs=-1,
        verbose=0,
        return_train_score=False
    )
    grid.fit(X_train, y_train)

    # Store the refit best estimator
    best_estimators[name] = grid.best_estimator_

    # Mean CV scores of the best (by ROC AUC) parameter combination.
    # Values are kept at full precision: rounding here would let two models that
    # differ only beyond the 4th decimal tie in the ranking below.
    results = grid.cv_results_
    best = grid.best_index_
    comparison_rows.append({
        "Model": name,
        "Best Params": grid.best_params_,
        "CV ROC AUC": results["mean_test_roc_auc"][best],
        "CV F1": results["mean_test_f1"][best],
        "CV Accuracy": results["mean_test_accuracy"][best],
        "CV LogLoss": -results["mean_test_neg_log_loss"][best],  # flip sign
    })

# --------------------------
# 5) Compare models by CV ROC AUC (primary)
# --------------------------
# Rank on the unrounded scores; round only for display.
comparison_df = pd.DataFrame(comparison_rows).sort_values(by="CV ROC AUC", ascending=False)
print("\n=== Cross-validated Model Comparison (sorted by ROC AUC) ===")
print(comparison_df.round(4).to_string(index=False))

# Pick the winner (first row after the descending sort)
winner_name = comparison_df.iloc[0]["Model"]
winner = best_estimators[winner_name]
print(f"\nSelected winner by CV ROC AUC: {winner_name}")

# --------------------------
# 6) Final evaluation on the untouched test set
# --------------------------
# `winner` is a GridSearchCV best_estimator_. With refit="roc_auc" the search has
# already refit it on all of X_train using the best parameters, so fitting it
# again here would only repeat that work.
proba_test = winner.predict_proba(X_test)[:, 1]   # probability of class 1
pred_test  = (proba_test >= 0.5).astype(int)  # simple 0.5 threshold; tune if needed

# Ranking and probability metrics use the probabilities; F1 and accuracy use the
# thresholded labels.
final_metrics = {
    "Test ROC AUC": roc_auc_score(y_test, proba_test),
    "Test F1": f1_score(y_test, pred_test),
    "Test Accuracy": accuracy_score(y_test, pred_test),
    "Test LogLoss": log_loss(y_test, proba_test)
}
print("\n=== Final Test Metrics (Winner) ===")
for k, v in final_metrics.items():
    print(f"{k}: {v:.4f}")

print("\n=== Classification Report (Winner @0.5 threshold) ===")
print(classification_report(y_test, pred_test, digits=4))


=== Cross-validated Model Comparison (sorted by ROC AUC) ===
 Model                                                                                                             Best Params  CV ROC AUC  CV F1  CV Accuracy  CV LogLoss
logreg                                                    {'model__C': 1, 'model__class_weight': None, 'model__penalty': 'l2'}      0.9959 0.9825       0.9780      0.0723
    gb          {'model__learning_rate': 0.1, 'model__max_depth': 2, 'model__min_samples_leaf': 5, 'model__n_estimators': 200}      0.9948 0.9789       0.9736      0.1090
    rf {'model__class_weight': 'balanced', 'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__n_estimators': 200}      0.9924 0.9700       0.9626      0.1185

Selected winner by CV ROC AUC: logreg

=== Final Test Metrics (Winner) ===
Test ROC AUC: 0.9954
Test F1: 0.9861
Test Accuracy: 0.9825
Test LogLoss: 0.0777

=== Classification Report (Winner @0.5 threshold) ===
              precision    recall  f1-score

# A note about the order and avoiding data leakage between train & test
The test set must match the structure of the training data (e.g., same scaling, same encoding). But if you fit your imputer/scaler/encoder *before splitting*, you’ve already used information from the test set during preprocessing. **That’s leakage**.

**Example**:
* Suppose you scale features with StandardScaler. If you compute the mean and std on the whole dataset before splitting, the scaler has already seen the test distribution.
* Then your test evaluation is no longer a true “unseen” evaluation.

**Therefore, use the correct order:**
1. Split once into train and test.
   * The test set is put aside and not touched until the end.
2. Build a pipeline that contains:
   * Imputation, scaling, encoding, feature selection, etc.
   * Followed by the model.
3. Fit the pipeline only on training data.
   * The scaler, imputer, encoder all “learn” their parameters (e.g., mean, std, category mapping) only from the training set.
4. Apply the trained pipeline to the test set.
   * The pipeline reuses the learned transformations on the test data.


## Example Code

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_breast_cancer

# Load data
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# Split FIRST. Stratify so train and test keep the class proportions of the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build preprocessing (all columns are treated as numeric)
num_features = X_train.columns.tolist()
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features)
])

# Build pipeline = preprocessing + model
pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

# Fit on train only: imputer medians and scaler statistics come from X_train alone
pipe.fit(X_train, y_train)

# Evaluate on test: the fitted transformations are re-applied, not re-learned.
# For a classifier pipeline, score() reports mean accuracy.
test_score = pipe.score(X_test, y_test)
print(f"Test accuracy: {test_score:.4f}")

# The Problem with “Normal” Cross-Validation + GridSearch

If you want the most rigorous performance estimate, wrap the grid search in nested CV (outer CV for performance estimation, inner CV for hyperparameter tuning).

**When you run GridSearchCV (or RandomizedSearchCV):**
* You split data into folds.
* For each candidate hyperparameter set, you train on (k–1) folds and validate on the held-out fold.
* You pick the hyperparameters with the best mean CV score.

⚠️ **But here’s the catch:**
* Because the same CV folds are used both to choose the hyperparameters and to estimate performance, the best mean CV score is a slightly optimistic estimate: the winning configuration was selected precisely because it scored best on those validation folds.

## The Nested CV Solution

**Nested CV introduces two layers of cross-validation**:
1. **Outer loop (evaluation loop)**:
    * Split the dataset into outer folds.
    * Each outer fold acts as a test set once.
2. **Inner loop (tuning loop)**:
    * Inside each training split from the outer loop, run GridSearchCV/RandomizedSearchCV to select the best hyperparameters.
    * Refit the best configuration on that entire outer-training split.
3. **Evaluate**:
    * Test the tuned model on the outer test fold.
    * Collect the scores across all outer folds → unbiased estimate of true performance.

**Why It Matters**
* Normal CV = good enough when you just want to pick a model.
* Nested CV = necessary if you want an unbiased estimate of generalization error after tuning.

That’s why in research papers and benchmarking, nested CV is the standard.

**ℹ️ In business practice, many teams skip nested CV because it’s expensive, and instead:**
* Keep a hold-out test set (like I showed earlier), or
* Use cross-validation for tuning, then evaluate once on the untouched test set.

### Simple Example Sketch

In [7]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
import warnings
from sklearn.exceptions import ConvergenceWarning

# (Optional) silence only LR convergence warnings during CV
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Data
X, y = load_breast_cancer(return_X_y=True)

# Pipeline: scale -> logistic regression
# random_state pins the data shuffling used by the stochastic solvers in the grid
# (e.g. saga), so repeated runs of this cell produce the same scores.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=5000, random_state=42))  # higher ceiling helps convergence
])

# Hyperparameter grid (legal combos only): one sub-grid per solver, all with the
# L2 penalty and the same C values, listed in the order lbfgs, liblinear, saga.
#   lbfgs     - fast, stable for L2
#   liblinear - good for small/binary problems; L2 here to keep parity
#   saga      - scalable; also supports l1/elasticnet if you want to extend
c_values = np.logspace(-3, 3, 7)   # 0.001 ... 1000
param_grid = [
    {
        "lr__solver": [solver],
        "lr__penalty": ["l2"],
        "lr__C": c_values,
    }
    for solver in ("lbfgs", "liblinear", "saga")
]

# Inner loop (tuning): runs inside every outer training split
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=inner_cv,
    scoring="roc_auc",
    n_jobs=-1,
    refit=True,   # refit the best configuration on the whole outer training split
)

# Outer loop (unbiased evaluation): each outer fold scores a freshly tuned model
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
nested_scores = cross_val_score(grid, X, y, cv=outer_cv, scoring="roc_auc", n_jobs=-1)

print("Nested CV ROC AUC scores:", nested_scores)
print("Mean performance (unbiased):", np.mean(nested_scores))
print("Std:", np.std(nested_scores))

Nested CV ROC AUC scores: [0.99836227 0.99705208 0.98511905 0.99966931 0.99865862]
Mean performance (unbiased): 0.9957722649419342
Std: 0.005391754587558533
