In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# =========================
# 1) Read the train/test splits (same CSV files as Problem 2)
# =========================
# Name of the response column; every other column is treated as a feature.
target = "price"

# Both splits are read with default parsing, train first and then test.
train_df, test_df = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

# =========================
# 2) Features/target + one-hot encode + align
# =========================
# Dummies are requested as floats: the pandas >= 2.0 default is bool, and a
# frame mixing bool with numeric columns converts to an object-dtype array,
# which np.linalg.lstsq cannot work with.
X_train = pd.get_dummies(
    train_df.drop(columns=[target]), drop_first=True, dtype=float
)
y_train = train_df[target].to_numpy()

# The test split keeps ALL of its dummy levels. Using drop_first here could
# drop a different level than in train (whenever the first training category
# is absent from the test split), silently encoding those rows as the
# baseline category. The alignment below selects the right columns instead.
X_test  = pd.get_dummies(test_df.drop(columns=[target]), dtype=float)
y_test  = test_df[target].to_numpy()

# Make sure train/test have the same columns (very important).
# join="left" keeps exactly the training columns in training order: columns
# missing from test are filled with 0, and test-only columns (including the
# level that train dropped as its baseline) are discarded.
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

# Convert to float numpy arrays so the linear-algebra routines get a
# homogeneous numeric dtype.
Xtr = X_train.to_numpy(dtype=float)
Xte = X_test.to_numpy(dtype=float)

# =========================
# 3) Closed-form (Normal Equation) from class
#    theta = (X^T X)^(-1) X^T y
#    where X carries a leading column of 1s for the intercept
# =========================
# Design matrices: [1 | features] for each split.
Xtr_b = np.hstack([np.ones((len(Xtr), 1)), Xtr])
Xte_b = np.hstack([np.ones((len(Xte), 1)), Xte])

# np.linalg.lstsq minimises ||Xtr_b @ theta - y_train||^2, which is the same
# least-squares problem the normal equation solves; its first return value is
# the coefficient vector (intercept first, then one weight per feature).
theta = np.linalg.lstsq(Xtr_b, y_train, rcond=None)[0]

# =========================
# 4) Prediction functions 
# =========================
def predict_one(x_row, theta):
    """
    Return the model's prediction for a single sample as a Python float.

    x_row: 1-D numpy array of the d feature values, ordered like the
           columns used to fit theta.
    theta: coefficient vector of length d + 1, intercept first.
    """
    # A leading 1.0 lets the intercept take part in the same dot product.
    augmented = np.r_[1.0, x_row]
    prediction = np.matmul(augmented, theta)
    return float(prediction)

def predict_batch(X, theta):
    """
    Return the model's predictions for every row of X.

    X: numpy array of shape (N, d), one sample per row.
    theta: coefficient vector of length d + 1, intercept first.
    """
    n_rows = X.shape[0]
    # Column of ones so the intercept is applied through the matrix product.
    intercept_col = np.ones((n_rows, 1))
    design = np.c_[intercept_col, X]
    return np.matmul(design, theta)

# Closed-form predictions on the training split, then the test split
y_train_pred_cf, y_test_pred_cf = (
    predict_batch(split, theta) for split in (Xtr, Xte)
)

# Closed-form metrics: mean squared error and R^2 for each split
cf_train_mse = mean_squared_error(y_train, y_train_pred_cf)
cf_train_r2  = r2_score(y_train, y_train_pred_cf)
cf_test_mse  = mean_squared_error(y_test, y_test_pred_cf)
cf_test_r2   = r2_score(y_test, y_test_pred_cf)

# Report the four scores, one per line, under a section header
print("=== Closed-form (my implementation) ===")
for _label, _value in (
    ("Training MSE:", cf_train_mse),
    ("Training R^2:", cf_train_r2),
    ("Testing  MSE:", cf_test_mse),
    ("Testing  R^2:", cf_test_r2),
):
    print(_label, _value)

# =========================
# 5) Reference fit with the package (Problem 2 sklearn LinearRegression)
# =========================
# fit() returns the estimator itself, so construction and fitting chain.
sk = LinearRegression().fit(Xtr, y_train)

# Package predictions on the training split, then the test split
y_train_pred_sk, y_test_pred_sk = (sk.predict(split) for split in (Xtr, Xte))

# Same four metrics as for the closed-form solution
sk_train_mse = mean_squared_error(y_train, y_train_pred_sk)
sk_train_r2  = r2_score(y_train, y_train_pred_sk)
sk_test_mse  = mean_squared_error(y_test, y_test_pred_sk)
sk_test_r2   = r2_score(y_test, y_test_pred_sk)

# Report the four scores, one per line, under a section header
print("\n=== sklearn LinearRegression (Problem 2 package) ===")
for _label, _value in (
    ("Training MSE:", sk_train_mse),
    ("Training R^2:", sk_train_r2),
    ("Testing  MSE:", sk_test_mse),
    ("Testing  R^2:", sk_test_r2),
):
    print(_label, _value)

# Optional sanity check: the two fits should agree up to numerical precision.
# theta[0] is the intercept; theta[1:] are the per-feature weights.
print("\nMax abs coef difference:", np.abs(theta[1:] - sk.coef_).max())
print("Abs intercept difference:", np.abs(theta[0] - sk.intercept_))

# Example of predict_one usage:
# score the first row of the test feature matrix
example_pred = predict_one(Xte[0, :], theta)
print("\nExample predict_one on first test sample:", example_pred)


=== Closed-form (my implementation) ===
Training MSE: 31043433137.295128
Training R^2: 0.7303787115834163
Testing  MSE: 58389284496.827225
Testing  R^2: 0.6497909600248234

=== sklearn LinearRegression (Problem 2 package) ===
Training MSE: 31043433137.295223
Training R^2: 0.7303787115834155
Testing  MSE: 58389284496.820786
Testing  R^2: 0.649790960024862

Max abs coef difference: 3.511621559937339e-05
Abs intercept difference: 4.008971154689789e-05

Example predict_one on first test sample: 704990.0797025487
