In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

# =========================
# Problem 4: Polynomial Regression (Least Squares)
# Feature: sqft_living only
# =========================

def polynomial_features(X, p):
    """
    Build the polynomial feature matrix [X, X^2, ..., X^p] (no intercept column).

    X: shape (N, 1); a 1-D array of shape (N,) is treated as a single column
    p: highest power to include, an integer >= 1
    returns: shape (N, p) = [X, X^2, ..., X^p]
             (if X has d columns, the p power blocks are placed side by side,
             giving shape (N, d * p))
    raises: ValueError if p < 1
    """
    if p < 1:
        raise ValueError(f"p must be >= 1, got {p}")

    X = np.asarray(X)
    if X.ndim == 1:
        # Without this, hstack would join the 1-D powers end to end into one
        # length N*p vector instead of producing N rows of p features.
        X = X.reshape(-1, 1)

    return np.hstack([X**i for i in range(1, p + 1)])

def add_intercept(X):
    """Prepend a column of ones to X so that theta_0 acts as the intercept."""
    n_rows = X.shape[0]
    bias = np.ones(n_rows)
    return np.column_stack((bias, X))

def fit_closed_form_least_squares(X, y):
    """
    Fit linear least squares with an intercept and return the coefficients.

    An intercept column is added to X, then np.linalg.lstsq finds the theta
    minimising ||Xb @ theta - y||^2. This is the same objective as the normal
    equations, solved without forming Xb^T Xb explicitly.

    returns: theta, with the intercept as its first entry
    """
    design = add_intercept(X)
    # lstsq returns (solution, residuals, rank, singular values); only the
    # solution is needed here.
    solution, _residuals, _rank, _singular_values = np.linalg.lstsq(
        design, y, rcond=None
    )
    return solution

def predict_one(x_row, theta):
    """
    Return the model prediction for a single sample as a Python float.

    x_row: shape (p,) polynomial features in the order [x, x^2, ..., x^p],
           WITHOUT the intercept entry (a leading 1.0 is prepended here)
    theta: coefficients with the intercept first
    """
    augmented = np.r_[1.0, x_row]
    prediction = np.dot(augmented, theta)
    return float(prediction)

def predict_batch(X, theta):
    """
    Return predictions for every row of a feature matrix at once.

    X: shape (N, p), WITHOUT the intercept column (it is added here)
    theta: coefficients with the intercept first
    """
    design = add_intercept(X)
    return np.matmul(design, theta)

def evaluate(y_true, y_pred):
    """Return the pair (MSE, R^2) of y_pred measured against y_true."""
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, r2

def main(train_path="train.csv", test_path="test.csv", degrees=(1, 2, 3, 5)):
    """
    Fit polynomial least-squares models on sqft_living and print their metrics.

    train_path, test_path: CSV files containing "sqft_living" and "price" columns
    degrees: polynomial degrees to fit; the first one is also used for the
             predict_one demonstration at the end
    raises: ValueError if degrees is empty

    Prints one table row per degree (MSE and R^2 on train and test), followed
    by a single-sample prediction made with predict_one.
    """
    # Default degrees follow the assignment: at least 3 values, all <= 5
    degrees = list(degrees)
    if not degrees:
        raise ValueError("degrees must contain at least one value")

    # Load CLEANED data (price already /1000, sqft_living already standardized)
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Use ONLY sqft_living (as instructed); double brackets keep shape (N, 1)
    X_train_raw = train_df[["sqft_living"]].to_numpy(dtype=float)
    y_train = train_df["price"].to_numpy(dtype=float)

    X_test_raw = test_df[["sqft_living"]].to_numpy(dtype=float)
    y_test = test_df["price"].to_numpy(dtype=float)

    results = []
    fitted = {}  # degree -> theta, kept so the demo below need not refit

    for p in degrees:
        # Build polynomial features
        Xtr_p = polynomial_features(X_train_raw, p)
        Xte_p = polynomial_features(X_test_raw, p)

        # Fit least squares model
        theta_p = fit_closed_form_least_squares(Xtr_p, y_train)
        fitted[p] = theta_p

        # Predict
        ytr_pred = predict_batch(Xtr_p, theta_p)
        yte_pred = predict_batch(Xte_p, theta_p)

        # Metrics
        tr_mse, tr_r2 = evaluate(y_train, ytr_pred)
        te_mse, te_r2 = evaluate(y_test, yte_pred)

        results.append({
            "Degree (p)": p,
            "Train MSE": tr_mse,
            "Train R^2": tr_r2,
            "Test MSE": te_mse,
            "Test R^2": te_r2,
        })

    results_df = pd.DataFrame(results)
    print("=== Problem 4: Polynomial Regression Results (sqft_living only) ===")
    print(results_df.to_string(index=False))

    # Example: predict_one demonstration (like Problem 3 requirement)
    p_demo = degrees[0]
    x0 = X_test_raw[0:1]                          # shape (1, 1)
    x0_poly = polynomial_features(x0, p_demo)[0]  # shape (p_demo,)
    theta_demo = fitted[p_demo]  # same coefficients the loop already fitted
    pred0 = predict_one(x0_poly, theta_demo)
    print(f"\nExample predict_one (p={p_demo}) on first test sample:", pred0)

# Run the experiment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


=== Problem 4: Polynomial Regression Results (sqft_living only) ===
 Degree (p)    Train MSE  Train R^2      Test MSE  Test R^2
          1 57947.526161   0.496709  88575.978543  0.468736
          2 54822.665116   0.523849  71791.679479  0.569406
          3 53785.194716   0.532860  99833.483763  0.401216
          5 52626.111955   0.542927 570616.914821 -2.422464

Example predict_one (p=1) on first test sample: 460.81090535196427
