In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the pre-split training and test sets from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Columns excluded from the predictors. 'price' is the regression target;
# the others look like index/identifier/date/location fields (TODO confirm).
drop_cols = ['Unnamed: 0', 'id', 'date', 'zipcode', 'price']

# errors='ignore' skips any listed column that a frame does not contain.
X_train = train_df.drop(columns=drop_cols, errors='ignore')
X_test = test_df.drop(columns=drop_cols, errors='ignore')

# Targets are the price column divided by 1000.
y_train = train_df['price'].div(1000)
y_test = test_df['price'].div(1000)

# Standardise the predictors: statistics are learned on the training set
# only and then re-used, unchanged, on the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:
def closed_form(X, y):
    """Solve ordinary least squares with the normal equations.

    Computes ``theta = pinv(X.T @ X) @ (X.T @ y)``. The pseudo-inverse is
    used rather than a plain inverse, so a singular ``X.T @ X`` (for example
    from collinear columns) still yields a least-squares solution.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Design matrix. No intercept column is added here; callers that want
        a bias term must include a column of ones themselves.
    y : array-like, shape (n_samples,) or (n_samples, n_targets)
        Target values.

    Returns
    -------
    numpy.ndarray
        Coefficients, shape (n_features,) or (n_features, n_targets).
    """
    # Work in floating point: with integer inputs X.T @ X is an integer
    # product that can silently wrap around for large feature values.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    # Form X.T @ y first so the chain never materialises the
    # (n_features, n_samples) matrix pinv(X.T @ X) @ X.T.
    return np.linalg.pinv(X.T @ X) @ (X.T @ y)

def get_polynomial_features(X, degree):
    """Build element-wise polynomial features ``[X, X**2, ..., X**degree]``.

    Parameters
    ----------
    X : array-like, 2-D (n_samples, n_features)
        Base features.
    degree : int
        Highest power to include. Values below 1 produce no columns, which
        makes ``np.hstack`` raise ``ValueError``.

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_features * degree), floating point
        The powers stacked column-wise, lowest power first. No cross terms
        and no constant column are generated.
    """
    # Cast to float before exponentiating: integer arrays wrap around
    # silently once a power exceeds the integer range (for int64, x**5
    # already overflows for x above roughly 6200).
    X = np.asarray(X, dtype=float)
    return np.hstack([X**i for i in range(1, degree + 1)])

poly_results = []

# Extract the single predictor once (it does not change between degrees) and
# cast it to float: the loop raises it up to the 5th power, which would wrap
# around silently if the column is stored as integers.
sqft_train = X_train[['sqft_living']].values.astype(float)
sqft_test = X_test[['sqft_living']].values.astype(float)

# Fit one polynomial regression per degree p = 1..5 and record its metrics.
for p in range(1, 6):
    X_train_poly = get_polynomial_features(sqft_train, p)
    X_test_poly = get_polynomial_features(sqft_test, p)

    # Standardise each power using training statistics only. The results are
    # named *_poly_scaled so the X_train_scaled / X_test_scaled arrays built
    # right after the data was loaded are not overwritten by this loop.
    scaler_poly = StandardScaler()
    X_train_poly_scaled = scaler_poly.fit_transform(X_train_poly)
    X_test_poly_scaled = scaler_poly.transform(X_test_poly)

    # Prepend a column of ones so theta_poly[0] acts as the intercept.
    X_train_final = np.hstack([np.ones((X_train_poly_scaled.shape[0], 1)), X_train_poly_scaled])
    X_test_final = np.hstack([np.ones((X_test_poly_scaled.shape[0], 1)), X_test_poly_scaled])

    theta_poly = closed_form(X_train_final, y_train)

    y_train_pred = X_train_final @ theta_poly
    y_test_pred = X_test_final @ theta_poly

    poly_results.append({
        "p": p,
        "Train MSE": mean_squared_error(y_train, y_train_pred),
        "Train R2": r2_score(y_train, y_train_pred),
        "Test MSE": mean_squared_error(y_test, y_test_pred),
        "Test R2": r2_score(y_test, y_test_pred)
    })

# One row per degree; built from the list of records in a single call.
poly_df = pd.DataFrame(poly_results)
print(poly_df)

   p     Train MSE  Train R2      Test MSE     Test R2
0  1  57947.526161  0.496709  8.857598e+04    0.468736
1  2  54822.665116  0.523849  7.179168e+04    0.569406
2  3  53785.194716  0.532860  9.983348e+04    0.401216
3  4  52795.774758  0.541453  2.509793e+05   -0.505331
4  5  52626.111955  0.542927  2.865728e+07 -170.881541


As the polynomial degree increases, the Train MSE decreases from 57947.53 at degree 1 to 52626.11 at degree 5, and the Train $R^2$ increases from 0.497 to 0.543. Increasing the degree creates a more flexible model that fits the training data more closely. However, the Test MSE decreases from degree 1 (88575.98) to degree 2 (71791.68), where it reaches its minimum, and the Test $R^2$ reaches its highest value at degree 2 (0.569). For degrees larger than 2, the Test MSE increases and the Test $R^2$ deteriorates significantly, dropping to -170.88 at degree 5. These results are a clear demonstration of overfitting, where the model becomes too complex and too sensitive to noise specific to the training data. The Test $R^2$ becomes negative at degrees 4 and 5, which indicates that the model’s predictions are so inaccurate that they are farther from the true values than a constant prediction equal to the mean of the targets would be. In conclusion, polynomial regression with degree 2 is the most effective model for this dataset: it offers the best trade-off between bias and variance, achieving the highest accuracy (the lowest Test MSE and the highest Test $R^2$) on unseen data before the model begins to overfit.