# Notebook 8: Cross-Validation & Hyperparameter Tuning

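Performance measured on the training set is often misleading, because a sufficiently flexible model can fit the training noise. To estimate how well a model generalizes, we use **cross-validation**, which lets us choose the model complexity (here, the regularization strength) and guard against overfitting.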

Goals:
- Understand train/test splits and cross-validation
- Use `GridSearchCV` to tune regularization strength
- Compare Ridge and Lasso performance
- Visualize error vs alpha curves


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

## Step 1: Generate Noisy Polynomial Dataset

In [None]:
# Seed NumPy's global RNG so the noise term below is reproducible.
np.random.seed(42)

# 100 evenly spaced inputs on [0, 10], stored as a single-feature column.
X = np.linspace(0, 10, 100).reshape(-1, 1)

# Cubic signal 0.5*x^3 - x^2 + 2*x + 3 plus Gaussian noise (mean 0, std 20).
noise = np.random.normal(loc=0, scale=20, size=100)
y = 0.5 * X[:, 0]**3 - X[:, 0]**2 + 2 * X[:, 0] + 3 + noise

# Hold out 30% of the samples for the final test-set evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Step 2: Ridge Regression with GridSearchCV

In [None]:
# Candidate regularization strengths: 20 values log-spaced from 1e-4 to 1e4.
# `alphas` is reused by the plotting cells and by the Lasso search below.
alphas = np.logspace(-4, 4, 20)

# Degree-10 polynomial expansion -> standardization -> Ridge regression.
# Wrapping the steps in a pipeline means the scaler is re-fit on each CV
# training fold rather than on the full training set.
ridge_pipeline = make_pipeline(PolynomialFeatures(degree=10), StandardScaler(), Ridge())

# make_pipeline names the step after the lowercased class, hence 'ridge__alpha'.
param_grid = {'ridge__alpha': alphas}
grid_ridge = GridSearchCV(ridge_pipeline, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_ridge.fit(X_train, y_train)

# Use general ('g') formatting: the grid reaches down to 1e-4, where a fixed
# '.4f' would round small alphas to a single significant digit.
print(f"Best alpha for Ridge: {grid_ridge.best_params_['ridge__alpha']:.4g}")

In [None]:
# Tabulate the Ridge grid-search results (one row per candidate alpha).
results = pd.DataFrame(grid_ridge.cv_results_)

# The search was scored with negated MSE, so flip the sign to plot plain MSE.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(alphas, -results['mean_test_score'], marker='o')
ax.set_xscale('log')  # alphas are log-spaced
ax.set_xlabel('alpha')
ax.set_ylabel('Cross-validated MSE')
ax.set_title('Ridge: Alpha vs CV Error')
ax.grid(True)
plt.show()

## Step 3: Lasso Regression with GridSearchCV

In [None]:
# Same preprocessing as the Ridge pipeline, with an L1-penalized model.
# max_iter is set to 10000 to give the Lasso solver more iterations to converge.
lasso_pipeline = make_pipeline(PolynomialFeatures(degree=10), StandardScaler(), Lasso(max_iter=10000))

# Search the same `alphas` grid that was defined for the Ridge search.
param_grid = {'lasso__alpha': alphas}
grid_lasso = GridSearchCV(lasso_pipeline, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_lasso.fit(X_train, y_train)

# Use general ('g') formatting so small alphas from the log-spaced grid keep
# four significant digits instead of being rounded by a fixed '.4f'.
print(f"Best alpha for Lasso: {grid_lasso.best_params_['lasso__alpha']:.4g}")

In [None]:
# Tabulate the Lasso grid-search results (one row per candidate alpha).
results = pd.DataFrame(grid_lasso.cv_results_)

# 'mean_test_score' is plotted with its sign flipped, matching the
# 'Cross-validated MSE' axis label.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(alphas, -results['mean_test_score'], marker='o', color='darkred')
ax.set_xscale('log')  # alphas are log-spaced
ax.set_xlabel('alpha')
ax.set_ylabel('Cross-validated MSE')
ax.set_title('Lasso: Alpha vs CV Error')
ax.grid(True)
plt.show()

## Step 4: Evaluate on Test Set

In [None]:
# Best pipeline found by each grid search.
ridge_best = grid_ridge.best_estimator_
lasso_best = grid_lasso.best_estimator_

# Predict on the held-out test split, which was not passed to either search.
ridge_test_pred = ridge_best.predict(X_test)
lasso_test_pred = lasso_best.predict(X_test)

mse_ridge_test = mean_squared_error(y_test, ridge_test_pred)
mse_lasso_test = mean_squared_error(y_test, lasso_test_pred)

print(f"Ridge Test MSE: {mse_ridge_test:.2f}")
print(f"Lasso Test MSE: {mse_lasso_test:.2f}")