In [8]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

# ============================================================
# Problem 6.2 — Ridge Gradient Descent
# ============================================================

def ridge_gradient_descent(Xb, y, alpha, num_iters, lam):
    """Fit ridge-regularised linear regression with batch gradient descent.

    The objective being minimised is

        (1/N) * ||Xb @ theta - y||^2  +  (lam/N) * sum(theta[1:] ** 2)

    i.e. mean squared error plus an L2 penalty that skips ``theta[0]``.

    Parameters
    ----------
    Xb : array-like, shape (N, d)
        Design matrix. Column 0 is treated as the intercept column and is
        excluded from the penalty.
    y : array-like with N elements
        Targets. A column vector of shape (N, 1) is flattened to (N,).
    alpha : float
        Step size (learning rate).
    num_iters : int
        Number of gradient steps to take.
    lam : float
        Ridge penalty strength; ``lam=0`` gives ordinary least squares.

    Returns
    -------
    numpy.ndarray, shape (d,)
        Parameter vector after ``num_iters`` updates, starting from zeros.

    Raises
    ------
    ValueError
        If ``Xb`` is not 2-D or ``y`` does not contain exactly N values.
    """
    Xb = np.asarray(Xb, dtype=float)
    if Xb.ndim != 2:
        raise ValueError(f"Xb must be 2-D (N, d); got shape {Xb.shape}")

    # Flatten y so that an (N, 1) column vector cannot broadcast `preds - y`
    # into an (N, N) matrix.
    y = np.asarray(y, dtype=float).ravel()

    N, d = Xb.shape
    if y.shape[0] != N:
        # Without this check a length-1 y would silently broadcast against
        # every prediction and train on the wrong targets.
        raise ValueError(
            f"y has {y.shape[0]} values but Xb has {N} rows"
        )

    theta = np.zeros(d, dtype=float)

    for _ in range(num_iters):
        preds = Xb @ theta

        # Standard MSE gradient (lecture formula)
        grad_mse = (2.0 / N) * (Xb.T @ (preds - y))

        # Ridge penalty gradient
        grad_ridge = (2.0 * lam / N) * theta
        grad_ridge[0] = 0.0  # Do NOT penalize intercept

        theta -= alpha * (grad_mse + grad_ridge)

    return theta


In [12]:
import numpy as np

# Fix the legacy global NumPy seed so the simulated sample is reproducible.
np.random.seed(0)

# ------------------------------------------------------------
# Problem 6.3: simulate y = 1 + 2*x + noise
# ------------------------------------------------------------
N = 1000
X = np.random.uniform(-2, 2, size=(N, 1))  # drawn first: one feature on [-2, 2)
e = np.random.normal(0, 2, size=N)         # drawn second: noise with std 2
y = 1 + 2 * X[:, 0] + e

# add_intercept is defined elsewhere in the notebook.
Xb = add_intercept(X)

# Optimiser settings shared by every fit below.
gd_settings = dict(alpha=0.05, num_iters=5000)


def _print_fit_report(title, theta, y_true, y_hat):
    """Print theta, the slope (theta[1]), MSE and R2 for one fitted model."""
    print(title)
    print("theta:", theta)
    print("slope:", theta[1])
    print("MSE:", mse(y_true, y_hat))
    print("R2:", r2_score(y_true, y_hat))
    print()


# ------------------------------------------------------------
# Baseline: unpenalised fit (lambda = 0)
# ------------------------------------------------------------
theta_lin = ridge_gradient_descent(Xb, y, lam=0, **gd_settings)
y_hat_lin = predict(Xb, theta_lin)
_print_fit_report("=== Linear Regression ===", theta_lin, y, y_hat_lin)

# ------------------------------------------------------------
# Ridge fits over increasing penalty strengths
# ------------------------------------------------------------
lambdas = [1, 10, 100, 1000, 10000]

for lam in lambdas:
    theta_ridge = ridge_gradient_descent(Xb, y, lam=lam, **gd_settings)
    y_hat_ridge = predict(Xb, theta_ridge)
    _print_fit_report(f"=== Ridge (lambda={lam}) ===", theta_ridge, y, y_hat_ridge)


=== Linear Regression ===
theta: [1.05729684 1.95148117]
slope: 1.9514811687620026
MSE: 3.7307480978868752
R2: 0.5796981110971798

=== Ridge (lambda=1) ===
theta: [1.05727329 1.95003794]
slope: 1.950037939516078
MSE: 3.73075091223866
R2: 0.57969779403547

=== Ridge (lambda=10) ===
theta: [1.05706295 1.9371443 ]
slope: 1.9371443025179327
MSE: 3.7310258236744818
R2: 0.5796668228219961

=== Ridge (lambda=100) ===
theta: [1.055103   1.81700412]
slope: 1.8170041192766677
MSE: 3.755182633173177
R2: 0.5769453438060533

=== Ridge (lambda=1000) ===
theta: [1.04375622 1.12147441]
slope: 1.1214744107581074
MSE: 4.661579437744542
R2: 0.4748316982150782

=== Ridge (lambda=10000) ===
theta: [1.0292502  0.23229066]
slope: 0.23229065857591738
MSE: 7.72426705617281
R2: 0.12979275230667153

