In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the pre-split training and test sets.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Remove columns that are not used as features; errors='ignore' skips any of
# these that happen to be absent from a file.
train_df = train_df.drop(columns=['id', 'date', 'zipcode', 'Unnamed: 0'], errors='ignore')
test_df = test_df.drop(columns=['id', 'date', 'zipcode', 'Unnamed: 0'], errors='ignore')

# Separate the regression target ('price') from the feature columns.
X_train = train_df.drop(columns=['price'])
y_train = train_df['price']
X_test = test_df.drop(columns=['price'])
y_test = test_df['price']

# Standardization statistics are computed on the training set only and then
# reused for the test set, so no test information leaks into the scaling.
train_mean = X_train.mean()
# A feature that is constant in the training set has std == 0; dividing by it
# would turn the whole column into NaN/inf and make the fit below fail.
# Using a divisor of 1 for such columns leaves them as (x - mean), i.e. all
# zeros on the training set.
train_std = X_train.std().replace(0, 1.0)

# NOTE: X_train is overwritten with its standardized version from here on.
X_train = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

# Rescale the target by 1/1000, so every MSE below is in (price / 1000)^2.
# NOTE: y_train is overwritten with its rescaled version from here on.
y_train = y_train / 1000
y_test_scaled = y_test / 1000

# --- scikit-learn reference fit ---
model = LinearRegression()
model.fit(X_train, y_train)

sk_train_pred = model.predict(X_train)
sk_test_pred = model.predict(X_test_scaled)

sk_train_mse = mean_squared_error(y_train, sk_train_pred)
sk_train_r2 = r2_score(y_train, sk_train_pred)
sk_test_mse = mean_squared_error(y_test_scaled, sk_test_pred)
sk_test_r2 = r2_score(y_test_scaled, sk_test_pred)

# model.coef_ excludes the intercept (scikit-learn stores it in model.intercept_).
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': model.coef_})
print(coef_df)
print(f"Train MSE: {sk_train_mse:.4f}")
print(f"Train R2:  {sk_train_r2:.4f}")
print(f"Test MSE:  {sk_test_mse:.4f}")
print(f"Test R2:   {sk_test_r2:.4f}")
print("-" * 30)

# --- Closed-form (normal equation) fit ---
# Prepend a column of ones so the first entry of beta is the intercept.
X_train_cf = X_train.copy()
X_test_cf = X_test_scaled.copy()
X_train_cf.insert(0, 'intercept', 1)
X_test_cf.insert(0, 'intercept', 1)

X = X_train_cf.values
y = y_train.values

XtX = X.T.dot(X)
Xty = X.T.dot(y)
# Using the pseudoinverse as mentioned in the lec5 slides:
# beta = (X^T X)^+ X^T y, which stays defined even when X^T X is singular.
beta = np.linalg.pinv(XtX).dot(Xty)

def predict_closed_form(X_new_df, beta_coeffs):
    """Predict targets from a feature frame and closed-form coefficients.

    Parameters
    ----------
    X_new_df : pandas.DataFrame
        Feature matrix. If it has no column named 'intercept', a column of
        ones with that name is prepended to a copy, so the caller's frame is
        never modified.
    beta_coeffs : array-like
        Coefficient vector with the intercept term first, followed by one
        coefficient per feature column.

    Returns
    -------
    numpy.ndarray
        The product of the design matrix and ``beta_coeffs``.
    """
    # Frame already carries its intercept column: use it unchanged.
    if 'intercept' in X_new_df.columns:
        return X_new_df.values.dot(beta_coeffs)

    design = X_new_df.copy()
    design.insert(0, 'intercept', 1)
    return design.values.dot(beta_coeffs)

# Score the closed-form solution on both splits, using the design matrices
# that already contain the intercept column.
cf_train_pred = predict_closed_form(X_train_cf, beta)
cf_test_pred = predict_closed_form(X_test_cf, beta)

cf_train_mse, cf_train_r2 = (
    mean_squared_error(y_train, cf_train_pred),
    r2_score(y_train, cf_train_pred),
)
cf_test_mse, cf_test_r2 = (
    mean_squared_error(y_test_scaled, cf_test_pred),
    r2_score(y_test_scaled, cf_test_pred),
)

# Coefficient table; row 0 is the intercept because it is the first column
# of X_train_cf.
cf_coefs = pd.DataFrame({'Feature': X_train_cf.columns, 'Coefficient': beta})
print(cf_coefs)
for metric_label, metric_value in (
    ("Train MSE: ", cf_train_mse),
    ("Train R2:  ", cf_train_r2),
    ("Test MSE:  ", cf_test_mse),
    ("Test R2:   ", cf_test_r2),
):
    print(f"{metric_label}{metric_value:.4f}")

# Side-by-side comparison of the two fits; the last column is the absolute
# difference between the scikit-learn and closed-form values.
print("\n--- Comparison of Metrics ---")
print(f"{'Metric':<15} {'Scikit-learn':<15} {'Closed-form':<15} {'Difference':<15}")
for metric_name, sk_value, cf_value in (
    ('Train MSE', sk_train_mse, cf_train_mse),
    ('Train R2', sk_train_r2, cf_train_r2),
    ('Test MSE', sk_test_mse, cf_test_mse),
    ('Test R2', sk_test_r2, cf_test_r2),
):
    print(f"{metric_name:<15} {sk_value:<15.4f} {cf_value:<15.4f} {abs(sk_value - cf_value):<15.4e}")

          Feature  Coefficient
0        bedrooms   -12.528228
1       bathrooms    18.536903
2     sqft_living    56.777233
3        sqft_lot    10.887313
4          floors     8.047746
5      waterfront    63.774795
6            view    48.224227
7       condition    12.970756
8           grade    92.277625
9      sqft_above    48.314252
10  sqft_basement    27.150611
11       yr_built   -67.676964
12   yr_renovated    17.280022
13            lat    78.414954
14           long    -1.035721
15  sqft_living15    45.600464
16     sqft_lot15   -12.936561
Train MSE: 31486.1678
Train R2:  0.7265
Test MSE:  57628.1547
Test R2:   0.6544
------------------------------
          Feature  Coefficient
0       intercept   520.414834
1        bedrooms   -12.528228
2       bathrooms    18.536903
3     sqft_living    56.777233
4        sqft_lot    10.887313
5          floors     8.047746
6      waterfront    63.774795
7            view    48.224227
8       condition    12.970756
9           grade    

In [None]:
# The results from the closed-form implementation are identical (or extremely close) to those of the scikit-learn implementation.
# This is expected, since both are trained on the same data and solve the same least-squares problem.