In [8]:
# Report the version of the interpreter this kernel is running.
# A shell call to `python -V` would instead report whichever `python`
# executable is first on PATH, which may be a different installation.
import platform

print(f"Python {platform.python_version()}")

Python 3.12.6


In [9]:
from __future__ import annotations
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from typing import Dict, Any, Tuple
import numpy as np
import pandas as pd

### Add new models to the registry below, then re-run all cells:

In [10]:
# Values that more than one registry entry relies on.
_SEED = 42
_CV_FOLDS = 3
_CV_SCORING = "neg_mean_squared_error"


def _rf_search_space() -> Dict[str, Any]:
    """Return a fresh copy of the random-forest hyperparameter candidates."""
    return {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    }


def _entry(version, estimator_cls, description, /, **estimator_kwargs):
    """Build one registry record.

    The three leading arguments are positional-only so that a keyword named
    ``estimator`` can be forwarded as a constructor argument. Every keyword
    argument ends up, in order, in the record's ``estimator_kwargs``.
    """
    return {
        "version": version,
        "estimator": estimator_cls,
        "estimator_kwargs": estimator_kwargs,
        "description": description,
    }


# Maps a model key to its version tag, estimator class, constructor arguments
# and a human-readable description. Iteration order is the order shown here.
MODEL_REGISTRY: Dict[str, Dict[str, Any]] = {
    "linear": _entry(
        "v0.1",
        LinearRegression,
        "StandardScaler + LinearRegression",
    ),
    # Ridge with a fixed penalty and no cross-validation.
    "ridge_base": _entry(
        "v0.2",
        Ridge,
        "StandardScaler + Ridge(alpha=1.0)",
        alpha=1.0,
    ),
    "ridge_cv": _entry(
        "v0.2",
        GridSearchCV,
        "StandardScaler + Ridge with GridSearchCV",
        estimator=Ridge(),
        param_grid={"alpha": [0.01, 0.1, 1.0, 10.0, 100.0]},
        cv=_CV_FOLDS,
        scoring=_CV_SCORING,
        n_jobs=-1,
    ),
    "random_forest": _entry(
        "v0.3",
        RandomForestRegressor,
        "StandardScaler + RandomForestRegressor(n_estimators=100)",
        n_estimators=100,
        random_state=_SEED,
    ),
    # Exhaustive search over the random-forest candidates.
    "rf_grid_cv": _entry(
        "v0.4",
        GridSearchCV,
        "StandardScaler + RandomForestRegressor with GridSearchCV",
        estimator=RandomForestRegressor(random_state=_SEED),
        param_grid=_rf_search_space(),
        cv=_CV_FOLDS,
        scoring=_CV_SCORING,
        n_jobs=-1,
    ),
    # Same candidates, but only `n_iter` sampled combinations are tried.
    "rf_random_cv": _entry(
        "v0.5",
        RandomizedSearchCV,
        "StandardScaler + RandomForestRegressor with RandomizedSearchCV",
        estimator=RandomForestRegressor(random_state=_SEED),
        param_distributions=_rf_search_space(),
        n_iter=10,
        cv=_CV_FOLDS,
        scoring=_CV_SCORING,
        n_jobs=-1,
        random_state=_SEED,
    ),
}

In [11]:
# Fetch the diabetes dataset as pandas objects and separate features from target.
Xy = load_diabetes(as_frame=True)
X, y = Xy.data, Xy.target

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training rows only,
# then the same fitted scaler transforms both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
def train_and_evaluate_model(
    model_key: str,
    X_train_scaled: np.ndarray,
    y_train: np.ndarray,
    X_test_scaled: np.ndarray,
    y_test: np.ndarray,
    registry: Dict[str, Dict[str, Any]] | None = None,
) -> Tuple[str, float, float, float]:
    """Fit one registered model on the training data and score it on the test data.

    Args:
        model_key: Name of the entry to look up in the registry.
        X_train_scaled: Training feature matrix, passed to ``fit`` as given.
        y_train: Training targets.
        X_test_scaled: Test feature matrix, passed to ``predict`` as given.
        y_test: Test targets that the predictions are compared against.
        registry: Mapping of model key to configuration. When omitted, the
            notebook-level ``MODEL_REGISTRY`` is used.

    Returns:
        ``(description, rmse, mae, r2)``; the three metrics are plain Python
        floats.

    Raises:
        KeyError: If ``model_key`` is not a key of the registry.
    """
    models = MODEL_REGISTRY if registry is None else registry
    if model_key not in models:
        # Same exception type as a plain lookup, but with an actionable message.
        raise KeyError(
            f"Unknown model key {model_key!r}; available keys: {sorted(models)}"
        )
    config = models[model_key]

    # Build a new estimator from the registered class and constructor arguments.
    estimator_cls = config["estimator"]
    estimator_kwargs = config.get("estimator_kwargs", {})
    model = estimator_cls(**estimator_kwargs)
    model.fit(X_train_scaled, y_train)

    preds = model.predict(X_test_scaled)

    # Cast every metric to a built-in float so the return type matches the
    # annotation regardless of what scalar type the metric functions return.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    mae = float(mean_absolute_error(y_test, preds))
    r2 = float(r2_score(y_test, preds))

    return config["description"], rmse, mae, r2

In [13]:
# Train and score every registered model, producing one record per model.
# The record keys are paired positionally with the tuple returned by
# train_and_evaluate_model: (description, rmse, mae, r2).
results = [
    dict(
        zip(
            ("Model", "RMSE", "MAE", "R2"),
            train_and_evaluate_model(
                model_name,
                X_train_scaled,
                y_train,
                X_test_scaled,
                y_test,
            ),
        )
    )
    for model_name in MODEL_REGISTRY
]

# Round the three metric columns to four decimals; "Model" is left as is.
results_df = pd.DataFrame(results).round({"RMSE": 4, "MAE": 4, "R2": 4})

# Emit the comparison as a left-aligned markdown table.
print(results_df.to_markdown(index=False, numalign="left", stralign="left"))

| Model                                                          | RMSE    | MAE     | R2     |
|:---------------------------------------------------------------|:--------|:--------|:-------|
| StandardScaler + LinearRegression                              | 53.8534 | 42.7941 | 0.4526 |
| StandardScaler + Ridge(alpha=1.0)                              | 53.7775 | 42.812  | 0.4541 |
| StandardScaler + Ridge with GridSearchCV                       | 53.8429 | 42.7962 | 0.4528 |
| StandardScaler + RandomForestRegressor(n_estimators=100)       | 54.3984 | 44.1712 | 0.4415 |
| StandardScaler + RandomForestRegressor with GridSearchCV       | 53.7869 | 43.6515 | 0.454  |
| StandardScaler + RandomForestRegressor with RandomizedSearchCV | 53.9302 | 43.7319 | 0.451  |
