# 03 — Gradient Descent (GD) vs Stochastic Gradient Descent (SGD)

**Goal:** Implement full-batch GD and mini-batch SGD for ordinary least squares (OLS), then explore how the learning rate affects convergence.


In [None]:
import warnings
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing, make_regression
from sklearn.model_selection import train_test_split

def load_regression_data(random_state=42):
    """Load a regression dataset as plain numpy arrays.

    The California Housing data is tried first. If fetching it raises for
    any reason (for instance when working offline), a warning is issued and
    a synthetic ``make_regression`` problem with 8 features is generated
    instead; ``random_state`` seeds only that synthetic fallback.

    Returns:
        A tuple ``(X, y, feature_names)``: the feature matrix, the target
        vector, and the list of feature column names.
    """
    target_col = "MedHouseVal"
    try:
        frame = fetch_california_housing(as_frame=True).frame.copy()
        features = frame.drop(columns=[target_col])
        X, y = features.values, frame[target_col].values
        feature_names = list(features.columns)
    except Exception as e:
        # Deliberate best-effort: any failure switches to synthetic data.
        warnings.warn(f"California Housing fetch failed: {e}. Falling back to synthetic make_regression.")
        X, y = make_regression(
            n_samples=5000,
            n_features=8,
            n_informative=6,
            noise=8.5,
            random_state=random_state,
        )
        feature_names = [f"x{i}" for i in range(X.shape[1])]
    return X, y, feature_names

def train_val_test_split(X, y, random_state=42):
    """Split ``(X, y)`` into train/validation/test parts (60/20/20).

    40% of the rows are held out first, then that held-out portion is cut
    in half to form the validation and test sets. The same ``random_state``
    seeds both splits.

    Returns:
        ``((X_train, y_train), (X_val, y_val), (X_test, y_test))``.
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.4, random_state=random_state
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=random_state
    )
    train, val, test = (X_train, y_train), (X_val, y_val), (X_test, y_test)
    return train, val, test

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        ``sqrt(mean((y_true - y_pred)**2))`` as a Python float.
    """
    # Coerce to float arrays so plain lists work and integer inputs
    # (notably small/unsigned dtypes) cannot wrap around in the arithmetic.
    residual = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(residual**2)))

def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        ``mean(|y_true - y_pred|)`` as a Python float.
    """
    # Coerce to float arrays so plain lists work and unsigned integer
    # inputs cannot wrap around in the subtraction.
    residual = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs(residual)))

def r2(y_true, y_pred):
    """Return the coefficient of determination ``1 - SS_res / SS_tot``.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        The R^2 score as a Python float. When ``y_true`` is constant the
        ratio is undefined; in that case 1.0 is returned for a perfect
        prediction and 0.0 otherwise (the scikit-learn ``r2_score``
        convention). Empty input yields NaN.
    """
    # Float coercion lets plain lists through and prevents integer
    # wraparound in the squared differences.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        return float("nan")
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0.0:
        # Constant target: avoid 0/0 (NaN) and x/0 (-inf).
        return 1.0 if ss_res == 0.0 else 0.0
    return float(1 - ss_res / ss_tot)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

X, y, feature_names = load_regression_data()
(X_train, y_train), (X_val, y_val), (X_test, y_test) = train_val_test_split(X, y)

# Standardize features using statistics of the training split only, so no
# information from the validation data leaks into the fit. The California
# Housing columns (e.g. population vs. median income) differ in scale by
# orders of magnitude; on the raw values the fixed learning rates used for
# GD/SGD blow up instead of converging.
feat_mean = X_train.mean(axis=0)
feat_std = X_train.std(axis=0)
feat_std[feat_std == 0] = 1.0  # constant column: leave it unscaled
X_train_s = (X_train - feat_mean) / feat_std
X_val_s = (X_val - feat_mean) / feat_std

# Add intercept
Xtr_i = np.c_[np.ones((X_train.shape[0], 1)), X_train_s]
Xval_i = np.c_[np.ones((X_val.shape[0], 1)), X_val_s]

def mse_grad(Xi, y, beta):
    """Gradient of the mean squared error ``mean((Xi @ beta - y)**2)`` in ``beta``.

    Evaluates ``(2 / n) * Xi.T @ (Xi @ beta - y)``, where ``n`` is the
    number of rows of ``Xi``.
    """
    residual = Xi @ beta - y
    scale = 2.0 / Xi.shape[0]
    return scale * (Xi.T @ residual)

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        ``sqrt(mean((y_true - y_pred)**2))`` as a Python float.
    """
    # Coerce to float arrays so plain lists work and integer inputs
    # (notably small/unsigned dtypes) cannot wrap around in the arithmetic.
    residual = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(residual**2)))

# TODO: Implement GD
def gd(Xi, y, lr=1e-2, iters=1000, beta0=None):
    """Fit least-squares coefficients with full-batch gradient descent.

    Args:
        Xi: Design matrix of shape (n, d). No intercept column is added
            here; include one in ``Xi`` if it is wanted.
        y: Targets of length n.
        lr: Step size.
        iters: Number of gradient steps.
        beta0: Optional starting coefficients (array-like of length d);
            zeros when omitted. It is copied, never modified.

    Returns:
        ``(beta, losses)`` where ``losses[t]`` is the full-data MSE
        measured right after step ``t``'s update.
    """
    d = Xi.shape[1]
    # Copy as float: a plain .copy() of an integer beta0 keeps the integer
    # dtype and the in-place update below then fails with a casting error.
    beta = np.zeros(d) if beta0 is None else np.array(beta0, dtype=float)
    losses = []
    for _ in range(iters):
        beta -= lr * mse_grad(Xi, y, beta)
        losses.append(np.mean((Xi @ beta - y) ** 2))
    return beta, np.array(losses)

# TODO: Implement mini-batch SGD
def sgd(Xi, y, lr=1e-2, iters=2000, batch_size=64, beta0=None, seed=42):
    """Fit least-squares coefficients with mini-batch SGD.

    Every step draws a mini-batch of rows without replacement and takes a
    gradient step on that batch's mean squared error.

    Args:
        Xi: Design matrix of shape (n, d). No intercept column is added
            here; include one in ``Xi`` if it is wanted.
        y: Targets of length n (indexable with an integer array).
        lr: Step size.
        iters: Number of mini-batch steps.
        batch_size: Rows per mini-batch. Capped at n, so a batch size
            larger than the dataset degrades to full-batch steps instead
            of raising.
        beta0: Optional starting coefficients (array-like of length d);
            zeros when omitted. It is copied, never modified.
        seed: Seed for the mini-batch sampler.

    Returns:
        ``(beta, losses)``. The full-data MSE is recorded only every
        ``max(1, iters // 200)`` steps, so ``losses`` is shorter than
        ``iters`` for long runs.
    """
    rng = np.random.default_rng(seed)
    n, d = Xi.shape
    # Sampling without replacement cannot draw more rows than exist.
    bs = min(batch_size, n)
    # Copy as float: a plain .copy() of an integer beta0 keeps the integer
    # dtype and the in-place update below then fails with a casting error.
    beta = np.zeros(d) if beta0 is None else np.array(beta0, dtype=float)
    log_every = max(1, iters // 200)
    losses = []
    for t in range(iters):
        idx = rng.choice(n, size=bs, replace=False)
        Xi_b, y_b = Xi[idx], y[idx]
        grad = (2.0 / bs) * (Xi_b.T @ (Xi_b @ beta - y_b))
        beta -= lr * grad
        # Full-data loss is comparatively expensive, so sample it sparsely.
        if t % log_every == 0:
            losses.append(np.mean((Xi @ beta - y) ** 2))
    return beta, np.array(losses)

# Run and compare
gd_iters = 800
sgd_iters = 5000
beta_gd, losses_gd = gd(Xtr_i, y_train, lr=1e-2, iters=gd_iters)
beta_sgd, losses_sgd = sgd(Xtr_i, y_train, lr=5e-3, iters=sgd_iters, batch_size=128)

# gd() records the loss after every step, whereas sgd() records it only
# every max(1, iters // 200) steps. Plot each curve against the step at
# which its losses were recorded so the shared "Step" axis is accurate.
gd_steps = np.arange(len(losses_gd))
sgd_steps = np.arange(len(losses_sgd)) * max(1, sgd_iters // 200)

plt.figure()
plt.plot(gd_steps, losses_gd, label="GD")
plt.plot(sgd_steps, losses_sgd, label="SGD")
plt.title("Training loss vs steps")
plt.xlabel("Step")
plt.ylabel("MSE")
plt.legend()


In [None]:
# TODO: Evaluate RMSE on val for both; try different lrs and batch sizes; document what works and why.
# Validation predictions from each fitted coefficient vector, then the
# validation RMSE of each.
yhat_gd, yhat_sgd = (Xval_i @ coef for coef in (beta_gd, beta_sgd))
print("GD RMSE (val):", rmse(y_true=y_val, y_pred=yhat_gd))
print("SGD RMSE (val):", rmse(y_true=y_val, y_pred=yhat_sgd))
