# 08 — Bias–Variance: Learning & Validation Curves

**Goal:** Diagnose under/overfitting and choose the right fix.


In [None]:
import warnings
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing, make_regression
from sklearn.model_selection import train_test_split

def load_regression_data(random_state=42):
    """Load a regression dataset and return ``(X, y, feature_names)``.

    California Housing is attempted first. If loading it raises for any
    reason (e.g., no network access), a warning is issued and a synthetic
    ``make_regression`` problem is generated instead.

    ``random_state`` only seeds the synthetic fallback. ``X`` and ``y`` are
    numpy arrays; ``feature_names`` is a list of column names.
    """
    target_col = "MedHouseVal"
    try:
        frame = fetch_california_housing(as_frame=True).frame.copy()
        # Everything except the target column is a feature.
        features = frame.drop(columns=[target_col])
        X = features.values
        y = frame[target_col].values
        feature_names = list(features.columns)
    except Exception as e:
        # Deliberate best-effort: any failure switches to synthetic data.
        warnings.warn(f"California Housing fetch failed: {e}. Falling back to synthetic make_regression.")
        X, y = make_regression(
            n_samples=5000,
            n_features=8,
            n_informative=6,
            noise=8.5,
            random_state=random_state,
        )
        feature_names = [f"x{idx}" for idx in range(X.shape[1])]
    return X, y, feature_names

def train_val_test_split(X, y, random_state=42):
    """Split ``X``/``y`` into 60% train, 20% validation and 20% test.

    Returns three ``(X_part, y_part)`` tuples in train, val, test order.
    The same ``random_state`` seeds both underlying splits.
    """
    # Stage 1: keep 60% for training, set 40% aside as a holdout pool.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.4, random_state=random_state
    )
    # Stage 2: halve the holdout pool into validation and test (20% each).
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=random_state
    )
    train_part = (X_train, y_train)
    val_part = (X_val, y_val)
    test_part = (X_test, y_test)
    return train_part, val_part, test_part

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Both inputs are converted to float arrays before any arithmetic, so
    plain Python lists are accepted, small integer dtypes cannot overflow
    when differences are squared, and pandas Series are compared by
    position rather than aligned on index labels.

    Returns a Python ``float``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Both inputs are converted to float arrays before subtracting, so plain
    Python lists are accepted, unsigned-integer arrays cannot wrap around
    on negative differences, and pandas Series are compared by position
    rather than aligned on index labels.

    Returns a Python ``float``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs(y_true - y_pred)))

def r2(y_true, y_pred):
    """Return the coefficient of determination (R^2) of the predictions.

    Computed as ``1 - ss_res / ss_tot`` where ``ss_res`` is the residual sum
    of squares and ``ss_tot`` is the total sum of squares of ``y_true``
    around its mean. Inputs are converted to float arrays first, so lists
    and integer arrays are handled safely.

    When ``y_true`` is constant (``ss_tot == 0``) the ratio is undefined;
    in that case 1.0 is returned for a perfect fit and 0.0 otherwise,
    instead of nan / -inf.

    Returns a Python ``float``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Constant target: avoid 0/0 or x/0 and return a finite score.
        return 1.0 if ss_res == 0 else 0.0
    return float(1 - ss_res / ss_tot)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

X, y, feature_names = load_regression_data()
(X_train, y_train), (X_val, y_val), (X_test, y_test) = train_val_test_split(X, y)

# Both diagnostics below cross-validate on the training portion only, so the
# held-out validation/test sets stay untouched for final model assessment.
# The split above has already shuffled the rows, which matters because an
# integer cv means plain (unshuffled) K-fold for a regressor.
pipe = Pipeline([("scaler", StandardScaler()), ("model", Ridge(alpha=1.0))])

# Learning curve: error as a function of the number of training samples.
train_sizes, train_scores, val_scores = learning_curve(
    pipe, X_train, y_train, train_sizes=np.linspace(0.1, 1.0, 6), cv=5,
    scoring="neg_root_mean_squared_error", n_jobs=-1, shuffle=True, random_state=42
)

# The scorer returns negated RMSE (higher is better): flip the sign and
# average across the CV folds (axis 1).
train_rmse = -train_scores.mean(axis=1)
val_rmse = -val_scores.mean(axis=1)

plt.figure()
plt.plot(train_sizes, train_rmse, marker="o", label="train")
plt.plot(train_sizes, val_rmse, marker="o", label="val")
plt.title("Learning curve (Ridge)")
plt.xlabel("train size")
plt.ylabel("RMSE")
plt.legend()

# Validation curve for alpha: error as a function of regularization strength.
# The estimator is cloned and model__alpha overridden for every grid value,
# so reusing `pipe` (alpha=1.0) is equivalent to passing a fresh pipeline.
# NOTE: train_scores / val_scores are rebound here to the validation-curve
# results; the learning-curve means remain available as train_rmse / val_rmse.
alphas = np.logspace(-3, 3, 13)
train_scores, val_scores = validation_curve(
    pipe, X_train, y_train, param_name="model__alpha", param_range=alphas,
    cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1
)

vc_train_rmse = -train_scores.mean(axis=1)
vc_val_rmse = -val_scores.mean(axis=1)

plt.figure()
plt.semilogx(alphas, vc_train_rmse, marker="o", label="train")
plt.semilogx(alphas, vc_val_rmse, marker="o", label="val")
plt.title("Validation curve (Ridge alpha)")
plt.xlabel("alpha")
plt.ylabel("RMSE")
plt.legend()


In [None]:
# TODO: In 3-5 sentences, identify the underfitting and overfitting regions in the curves above and state the appropriate fix for each (e.g., more data, a simpler or more flexible model, stronger or weaker regularization).
