In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the train and test tables from CSV files in the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Columns kept out of the feature matrices; 'price' is the regression target.
drop_cols = ['Unnamed: 0', 'id', 'date', 'zipcode', 'price']

# Feature matrices: errors='ignore' skips any listed column a frame lacks.
X_train = train_df.drop(columns=drop_cols, errors='ignore')
X_test = test_df.drop(columns=drop_cols, errors='ignore')

# Targets: price divided by 1000, so MSE values below are in (price/1000)^2.
y_train = train_df['price'] / 1000
y_test = test_df['price'] / 1000

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:
# Prepend a column of ones to each scaled feature matrix so that the first
# coefficient of the fitted parameter vector plays the role of the intercept.
X_train_cf = np.column_stack([np.ones(len(X_train_scaled)), X_train_scaled])
X_test_cf = np.column_stack([np.ones(len(X_test_scaled)), X_test_scaled])

def closed_form(X, y):
    """Return the least-squares coefficients theta minimising ||X @ theta - y||^2.

    This is the same minimum-norm solution as the normal-equation formula
    pinv(X.T @ X) @ X.T @ y, but it is computed from X directly.  Forming the
    Gram matrix X.T @ X squares the condition number of the problem, which
    costs roughly half of the available floating-point precision on
    ill-conditioned (nearly collinear) designs.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Design matrix.  Include a column of ones if an intercept is wanted.
    y : array-like of shape (n_samples,) or (n_samples, n_targets)
        Response values.

    Returns
    -------
    numpy.ndarray
        Coefficients of shape (n_features,) or (n_features, n_targets).
    """
    # lstsq returns (solution, residuals, rank, singular_values); only the
    # solution is needed.  rcond=None selects the machine-precision based
    # cutoff for small singular values, so rank-deficient designs yield the
    # minimum-norm solution instead of blowing up.
    theta, *_ = np.linalg.lstsq(np.asarray(X), np.asarray(y), rcond=None)
    return theta

def predict_linear(X, theta):
    """Evaluate the linear model by returning the matrix product ``X @ theta``.

    X holds one input point per row; theta is the coefficient vector.  X must
    use the same column layout as the matrix theta was fitted on.
    """
    predictions = X @ theta
    return predictions

# Fit the coefficients on the training design matrix.
theta = closed_form(X_train_cf, y_train)

# Predictions for both splits from the same fitted coefficients.
y_train_pred_cf = predict_linear(X_train_cf, theta)
y_test_pred_cf = predict_linear(X_test_cf, theta)

# Mean squared error and R^2, grouped per split.
train_mse_cf, train_r2_cf = (
    mean_squared_error(y_train, y_train_pred_cf),
    r2_score(y_train, y_train_pred_cf),
)
test_mse_cf, test_r2_cf = (
    mean_squared_error(y_test, y_test_pred_cf),
    r2_score(y_test, y_test_pred_cf),
)

# One report line per metric, in the order train MSE, train R^2, test MSE, test R^2.
print(
    f"Closed-form Train MSE: {train_mse_cf}",
    f"Closed-form Train R^2: {train_r2_cf}",
    f"Closed-form Test MSE: {test_mse_cf}",
    f"Closed-form Test R^2: {test_r2_cf}",
    sep="\n",
)

Closed-form Train MSE: 31486.167775794882
Closed-form Train R^2: 0.7265334318706018
Closed-form Test MSE: 57628.15470567033
Closed-form Test R^2: 0.6543560876120957


The metrics for the closed-form implementation are identical to those produced by the package in Problem 2.