# 09 — Probability Review + Maximum Likelihood Estimation

**Goal:** Derive the MLE of (μ, σ²) for a Normal distribution, and show that the OLS estimate of β coincides with the MLE under i.i.d. Gaussian noise.


In [None]:
import numpy as np

# PART A: MLE for Normal
# TODO: Derive by hand (pen+paper), then verify with code below.
# Simulate an i.i.d. Normal sample with known parameters so the estimates can be checked.
TRUE_MU, TRUE_SIGMA, N_SAMPLES = 3.0, 2.5, 2000
np.random.seed(0)  # fixed seed -> the same draws on every run
x = TRUE_MU + TRUE_SIGMA * np.random.randn(N_SAMPLES)  # true mu=3, sigma=2.5

def mle_normal(x):
    """Return the maximum-likelihood estimates (mu_hat, sigma2_hat) for a Normal sample.

    x may be any array-like of numbers (list, tuple, ndarray, ...). mu_hat is the
    sample mean and sigma2_hat is the biased (1/n) sample variance; both are
    returned as Python floats.
    """
    # Coerce first: plain lists/tuples work with np.mean but not with `x - mu_hat`.
    x = np.asarray(x, dtype=float)
    mu_hat = float(np.mean(x))
    sigma2_hat = float(np.mean((x - mu_hat) ** 2))  # MLE uses 1/n, not 1/(n-1)
    return mu_hat, sigma2_hat

# Run the estimator on the sample x and display both estimates.
estimates = mle_normal(x)
mu_hat, sigma2_hat = estimates
print("μ_hat:", mu_hat, "σ²_hat:", sigma2_hat)

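# PART B: OLS as MLE with Gaussian noise
# Model: y = Xβ + ε, with ε ~ N(0, σ² I).
# Log-likelihood: log L(β, σ²) = -(n/2) log(2πσ²) - ||y - Xβ||² / (2σ²).
# For any fixed σ² > 0 only the last term depends on β, so maximizing the likelihood over β
# is the same as minimizing the sum of squared residuals (equivalently, the MSE).
# TODO: Write a short explanation (markdown cell) and then verify numerically below.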


In [None]:
import warnings
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing, make_regression
from sklearn.model_selection import train_test_split

def load_regression_data(random_state=42):
    """Load a regression dataset, preferring California Housing.

    If loading California Housing raises for any reason (e.g., offline), a warning
    is emitted and a synthetic ``make_regression`` problem is returned instead.

    Args:
        random_state: Seed forwarded to ``make_regression``; only used by the fallback.

    Returns:
        (X, y, feature_names): feature matrix and target vector as numpy arrays,
        plus the feature names as a list.
    """
    target_col = "MedHouseVal"
    try:
        frame = fetch_california_housing(as_frame=True).frame.copy()
        # Everything except the target column is a feature.
        features = frame.drop(columns=[target_col])
        X, y = features.values, frame[target_col].values
        feature_names = list(features.columns)
    except Exception as e:
        # Deliberately broad: any failure here switches to the synthetic data.
        warnings.warn(f"California Housing fetch failed: {e}. Falling back to synthetic make_regression.")
        X, y = make_regression(n_samples=5000, n_features=8, n_informative=6, noise=8.5, random_state=random_state)
        feature_names = [f"x{idx}" for idx in range(X.shape[1])]
    return X, y, feature_names

def train_val_test_split(X, y, random_state=42):
    """Split (X, y) into train/validation/test parts in 60/20/20 proportions.

    Two chained ``train_test_split`` calls: 40% is held out first, then that
    hold-out is halved into validation and test. Both calls use ``random_state``.

    Returns:
        ((X_train, y_train), (X_val, y_val), (X_test, y_test))
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.4, random_state=random_state
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=random_state
    )
    train_part = (X_train, y_train)
    val_part = (X_val, y_val)
    test_part = (X_test, y_test)
    return train_part, val_part, test_part

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions as a float.

    Inputs may be any array-likes of matching shape (lists, tuples, ndarrays, ...).
    They are converted to float arrays first, so the difference is always taken
    element by element in positional order.
    """
    err = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(err ** 2)))

def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions as a float.

    Inputs may be any array-likes of matching shape (lists, tuples, ndarrays, ...).
    They are converted to float arrays first, so the difference is always taken
    element by element in positional order.
    """
    err = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs(err)))

def r2(y_true, y_pred):
    """Return the coefficient of determination R² = 1 - SS_res / SS_tot as a float.

    Inputs may be any array-likes of matching shape; they are converted to float
    arrays so the residuals are taken positionally.

    When ``y_true`` is constant (SS_tot == 0) the ratio is undefined. Following the
    convention of scikit-learn's ``r2_score``, this returns 1.0 for a perfect fit
    and 0.0 otherwise, rather than nan / -inf. Empty input returns nan.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        return float("nan")  # nothing to score
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Constant target: avoid dividing by zero.
        return 1.0 if ss_res == 0 else 0.0
    return float(1 - ss_res / ss_tot)


In [None]:
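# Numerical check: for fixed σ², the Gaussian log-likelihood equals a constant minus RSS/(2σ²),
# so the beta that minimizes the OLS objective (RSS) is also the beta that maximizes the likelihood.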
import numpy as np
from numpy.linalg import pinv

X, y, feature_names = load_regression_data()
splits = train_val_test_split(X, y)
(X_train, y_train), (X_val, y_val), (X_test, y_test) = splits

# Design matrix with a leading column of ones, so beta_hat[0] is the intercept.
intercept_col = np.ones((X_train.shape[0], 1))
Xtr_i = np.column_stack([intercept_col, X_train])
# Least-squares solution via the pseudoinverse: beta_hat = pinv(X) y.
beta_hat = pinv(Xtr_i) @ y_train

# TODO: Implement a function that evaluates Gaussian log-likelihood for a given beta and sigma^2 (use MLE sigma^2).
def gaussian_loglik(Xi, y, beta, sigma2=None):
    """Gaussian log-likelihood of y = Xi @ beta + eps, up to an additive constant.

    Evaluates  -(n/2) * log(sigma2) - RSS / (2 * sigma2),  where n = Xi.shape[0]
    and RSS = sum((y - Xi @ beta)**2). The dropped constant is -(n/2) * log(2*pi).

    Args:
        Xi: Design matrix (array-like, one row per observation).
        y: Targets (array-like).
        beta: Coefficient vector (array-like).
        sigma2: Noise variance. If None (default), the MLE of sigma² given beta,
            mean(resid**2), is used.

    Returns:
        The log-likelihood value. When sigma2 is None and the fit is exact
        (RSS == 0) the likelihood is unbounded as sigma² -> 0, so +inf is returned
        instead of the nan that log(0) and 0/0 would otherwise produce.

    Raises:
        ValueError: If an explicit sigma2 is not strictly positive.
    """
    Xi = np.asarray(Xi, dtype=float)
    resid = np.asarray(y, dtype=float) - Xi @ np.asarray(beta, dtype=float)
    n = Xi.shape[0]
    rss = np.sum(resid ** 2)
    if sigma2 is None:
        if n > 0 and rss == 0:
            return np.float64(np.inf)
        sigma2 = np.mean(resid ** 2)  # MLE of σ² given beta
    elif sigma2 <= 0:
        raise ValueError("sigma2 must be strictly positive")
    # log-lik up to additive constant: -(n/2) * log(sigma^2) - (1/(2*sigma^2)) * sum(resid^2)
    return -0.5 * n * np.log(sigma2) - 0.5 * rss / sigma2

# Log-likelihood (up to an additive constant) at the OLS coefficients on the training data.
ll_at_beta_hat = gaussian_loglik(Xi=Xtr_i, y=y_train, beta=beta_hat)
print("Log-likelihood at OLS beta:", ll_at_beta_hat)

# TODO: Randomly sample other betas and show none has higher log-lik than OLS within tolerance.


**Write-up (required):**  
Explain in your own words why OLS is the MLE under Gaussian i.i.d. noise, and when that assumption breaks.
