# 04 — Basis Functions & Polynomial Features

**Goal:** Expand the feature space with polynomial and interaction terms, and observe how model complexity leads to under- and over-fitting.


In [None]:
import warnings
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing, make_regression
from sklearn.model_selection import train_test_split

def load_regression_data(random_state=42):
    """Load a regression dataset as ``(X, y, feature_names)``.

    California Housing is tried first: ``MedHouseVal`` is the target and every
    other column of the frame is a feature. If fetching or unpacking fails for
    any reason (e.g., offline), a warning is emitted and a synthetic
    ``make_regression`` problem seeded with ``random_state`` is returned
    instead, with features named ``x0``, ``x1``, ...

    ``X`` and ``y`` are numpy arrays; ``feature_names`` is a list of strings.
    """
    target_col = "MedHouseVal"
    try:
        frame = fetch_california_housing(as_frame=True).frame.copy()
        # Build the feature frame once and reuse it for both values and names.
        features = frame.drop(columns=[target_col])
        X = features.values
        y = frame[target_col].values
        feature_names = list(features.columns)
    except Exception as e:
        # Best-effort fallback so the notebook still runs without the dataset.
        warnings.warn(f"California Housing fetch failed: {e}. Falling back to synthetic make_regression.")
        X, y = make_regression(
            n_samples=5000,
            n_features=8,
            n_informative=6,
            noise=8.5,
            random_state=random_state,
        )
        feature_names = [f"x{i}" for i in range(X.shape[1])]
    return X, y, feature_names

def train_val_test_split(X, y, random_state=42):
    """Partition ``(X, y)`` into 60% train, 20% validation and 20% test.

    Returns three ``(X_part, y_part)`` tuples in the order train, val, test.
    The same ``random_state`` seeds both underlying splits.
    """
    # Step 1: keep 60% for training and set the remaining 40% aside.
    train_X, held_X, train_y, held_y = train_test_split(
        X, y, test_size=0.4, random_state=random_state
    )
    # Step 2: cut the held-out 40% in half -> 20% validation, 20% test.
    val_X, test_X, val_y, test_y = train_test_split(
        held_X, held_y, test_size=0.5, random_state=random_state
    )
    train_part = (train_X, train_y)
    val_part = (val_X, val_y)
    test_part = (test_X, test_y)
    return train_part, val_part, test_part

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values (lists, tuples, numpy arrays or pandas
        Series) with matching shapes.

    Returns
    -------
    float
        ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    # Coerce to plain float arrays: list inputs would otherwise fail on `-`,
    # and pandas Series would be aligned by index label instead of position.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values (lists, tuples, numpy arrays or pandas
        Series) with matching shapes.

    Returns
    -------
    float
        ``mean(|y_true - y_pred|)``.
    """
    # Coerce to plain float arrays: list inputs would otherwise fail on `-`,
    # and pandas Series would be aligned by index label instead of position.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs(y_true - y_pred)))

def r2(y_true, y_pred):
    """Return the coefficient of determination (R^2).

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values (lists, tuples, numpy arrays or pandas
        Series) with matching shapes.

    Returns
    -------
    float
        ``1 - SS_res / SS_tot``. When ``y_true`` is constant (``SS_tot == 0``)
        the ratio is undefined; following scikit-learn's ``r2_score``
        convention this returns ``1.0`` for a perfect fit and ``0.0``
        otherwise. Empty input returns ``nan``.
    """
    # Coerce to plain float arrays: list inputs would otherwise fail on `-`,
    # and pandas Series would be aligned by index label instead of position.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        # Nothing to score; keep the undefined result explicit.
        return float("nan")
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Constant targets: avoid 0/0 or x/0 and the accompanying RuntimeWarning.
        return 1.0 if ss_res == 0 else 0.0
    return float(1 - ss_res / ss_tot)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

X, y, feature_names = load_regression_data()
(X_train, y_train), (X_val, y_val), (X_test, y_test) = train_val_test_split(X, y)

# Baseline: standardized raw features + ordinary least squares, scored on the
# validation split.
baseline = Pipeline([("scaler", StandardScaler()), ("lr", LinearRegression())])
baseline.fit(X_train, y_train)
base_rmse = rmse(y_val, baseline.predict(X_val))

# Validation curve: fit one polynomial pipeline per degree and record its
# validation RMSE. Degree 1 is included as a reference point next to the
# baseline above.
degrees = [1, 2, 3, 4, 5]
val_rmse = []
for d in degrees:
    pipe = Pipeline([
        ("poly", PolynomialFeatures(degree=d, include_bias=False)),
        # with_mean=False rescales each expanded column without centering it;
        # the regression's own intercept absorbs the offsets. (Dense input
        # yields dense polynomial features, so this is a choice rather than a
        # sparsity requirement -- confirm against the PolynomialFeatures docs.)
        ("scaler", StandardScaler(with_mean=False)),
        ("lr", LinearRegression()),
    ])
    pipe.fit(X_train, y_train)
    # Deliberately NOT named `rmse`: rebinding that name would overwrite the
    # rmse() helper defined earlier and break every later call to it.
    degree_rmse = rmse(y_val, pipe.predict(X_val))
    val_rmse.append(degree_rmse)
    print(f"Degree {d} RMSE(val): {degree_rmse:.4f}")

print("Baseline linear RMSE(val):", base_rmse)

plt.figure()
plt.plot(degrees, val_rmse, marker="o")
plt.title("Validation curve: Polynomial degree vs RMSE")
plt.xlabel("degree")
plt.ylabel("RMSE")
# Explicit show so the figure also renders when run outside an inline notebook.
plt.show()
