In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df = train_df.drop(columns=['id', 'date', 'zipcode', 'Unnamed: 0'], errors='ignore')
test_df = test_df.drop(columns=['id', 'date', 'zipcode', 'Unnamed: 0'], errors='ignore')

X_train = train_df.drop(columns=['price'])
y_train = train_df['price']
X_test = test_df.drop(columns=['price'])
y_test = test_df['price']

train_mean = X_train.mean()
train_std = X_train.std()

X_train = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

y_train = y_train / 1000
y_test_scaled = y_test / 1000

X_raw_train = train_df['sqft_living'].values
X_raw_test = test_df['sqft_living'].values

results = []

for p in range(1, 6):
    X_train = np.vstack([X_raw_train**i for i in range(1, p+1)]).T
    X_test = np.vstack([X_raw_test**i for i in range(1, p+1)]).T
    
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    
    X_train_scaled = (X_train - mean) / std
    X_test_scaled = (X_test - mean) / std
    
    X_train_bias = np.c_[np.ones(X_train_scaled.shape[0]), X_train_scaled]
    X_test_bias = np.c_[np.ones(X_test_scaled.shape[0]), X_test_scaled]
    
    XtX = X_train_bias.T.dot(X_train_bias)
    Xty = X_train_bias.T.dot(y_train)
    beta = np.linalg.pinv(XtX).dot(Xty)
    
    train_pred = X_train_bias.dot(beta)
    test_pred = X_test_bias.dot(beta)
    
    tr_mse = mean_squared_error(y_train, train_pred)
    tr_r2 = r2_score(y_train, train_pred)
    te_mse = mean_squared_error(y_test_scaled, test_pred)
    te_r2 = r2_score(y_test_scaled, test_pred)
    
    results.append({
        'Degree': p,
        'Train MSE': tr_mse,
        'Train R2': tr_r2,
        'Test MSE': te_mse,
        'Test R2': te_r2
    })

df = pd.DataFrame(results)
print(df.to_string(index=False))

 Degree    Train MSE  Train R2     Test MSE     Test R2
      1 57947.526161  0.496709 8.857598e+04    0.468736
      2 54822.665116  0.523849 7.179168e+04    0.569406
      3 53785.194716  0.532860 9.983348e+04    0.401216
      4 52795.774758  0.541453 2.509793e+05   -0.505331
      5 52626.111955  0.542927 2.865728e+07 -170.881541


In [None]:
# As the degree increase, the MSE going down and R2 going up, which is expected and make the model have a better result