In [139]:
import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneOut

In [140]:
def build_model1(x1, x2, x3, verbose=False):
    """Assemble the design matrix of the linear model.

    The matrix has four columns, in this order: ``x1``, ``x2``, ``x3`` and a
    column of ones (the intercept term) shaped like ``x1``.

    Parameters
    ----------
    x1, x2, x3 : array-like
        The three feature vectors, stacked as columns.
    verbose : bool, optional
        When true, print the model name and its formula before building.

    Returns
    -------
    numpy.ndarray
        The column-stacked design matrix.
    """
    if verbose:
        print("\n[Model 1]")
        print("y = alpha1*x1 + alpha2*x2 + alpha3*x3 + alpha0")
    intercept = np.ones_like(x1)
    columns = (x1, x2, x3, intercept)
    return np.column_stack(columns)

def build_model2(x1, x2, x3, verbose=False):
    """Assemble the design matrix of the squared-feature model.

    The matrix has four columns, in this order: ``x1**2``, ``x2**2``,
    ``x3**2`` and a column of ones (the intercept term).

    Parameters
    ----------
    x1, x2, x3 : array-like
        The three feature vectors. They are converted to float arrays, so
        lists and integer arrays are accepted.
    verbose : bool, optional
        When true, print the model name and its formula before building.

    Returns
    -------
    numpy.ndarray
        The column-stacked design matrix, always floating point.
    """
    if verbose:
        print("\n[Model 2]")
        print("y = beta1*(x1^2) + beta2*(x2^2) + beta3*(x3^2) + beta0")
    # Cast to float BEFORE squaring: fixed-width integer arrays wrap around
    # silently on overflow, which would corrupt the features without any error.
    squared = [np.asarray(feature, dtype=float) ** 2 for feature in (x1, x2, x3)]
    return np.column_stack([*squared, np.ones_like(squared[0])])

def build_model3(x1, x2, x3, verbose=False):
    """Assemble the design matrix of the cubed-feature model.

    The matrix has four columns, in this order: ``x1**3``, ``x2**3``,
    ``x3**3`` and a column of ones (the intercept term).

    Parameters
    ----------
    x1, x2, x3 : array-like
        The three feature vectors. They are converted to float arrays, so
        lists and integer arrays are accepted.
    verbose : bool, optional
        When true, print the model name and its formula before building.

    Returns
    -------
    numpy.ndarray
        The column-stacked design matrix, always floating point.
    """
    if verbose:
        print("\n[Model 3]")
        print("y = gamma1*(x1^3) + gamma2*(x2^3) + gamma3*(x3^3) + gamma0")
    # Cast to float BEFORE cubing: fixed-width integer arrays wrap around
    # silently on overflow, and cubes reach that limit quickly.
    cubed = [np.asarray(feature, dtype=float) ** 3 for feature in (x1, x2, x3)]
    return np.column_stack([*cubed, np.ones_like(cubed[0])])

In [141]:
def OrdinaryLeastSquares_fit(X, y, ridge_eps=0.0):
    """Solve the normal equations ``(X^T X) w = X^T y`` for the weights ``w``.

    Parameters
    ----------
    X : array-like
        Design matrix; its columns are the regressors.
    y : array-like
        Target values, one per row of ``X``.
    ridge_eps : float, optional
        When greater than zero, ``ridge_eps * I`` is added to ``X^T X`` before
        solving. Note that this penalises every column of ``X`` equally,
        including an intercept column if one is present. Values less than or
        equal to zero leave the system unchanged.

    Returns
    -------
    numpy.ndarray
        The solution ``w`` of the (optionally regularised) normal equations.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the matrix being solved is singular.
    """
    # Work in floating point: with integer inputs the products X^T X and X^T y
    # would be accumulated in integer arithmetic and could wrap around silently.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    gram = X.T @ X
    if ridge_eps > 0:
        gram = gram + ridge_eps * np.eye(gram.shape[0])
    moment = X.T @ y
    return np.linalg.solve(gram, moment)

def rmse(y_true, y_pred):
    """Return the root-mean-square error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # A (n,) vector minus a (n, 1) column broadcasts to an (n, n) matrix and
    # yields a wrong score without any error. When the two shapes differ only
    # by singleton axes, drop those axes so the difference is element-wise.
    if actual.shape != predicted.shape:
        actual_flat, predicted_flat = np.squeeze(actual), np.squeeze(predicted)
        if actual_flat.shape == predicted_flat.shape:
            actual, predicted = actual_flat, predicted_flat
    residuals = actual - predicted
    return float(np.sqrt(np.mean(residuals ** 2)))

# i. Load the data

In [142]:
def load_data(csv_path, x_cols, y_col):
    """Read the feature and target columns from a CSV file.

    Column names in the file are stripped of surrounding whitespace before
    lookup. Only the requested columns are kept, and any row with a missing
    value in one of them is dropped.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file, passed to ``pandas.read_csv``.
    x_cols : sequence of str
        Names of the feature columns; the first three are returned.
    y_col : str
        Name of the target column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x1, x2, x3, y)`` taken from ``x_cols[0]``, ``x_cols[1]``,
        ``x_cols[2]`` and ``y_col`` respectively.

    Raises
    ------
    ValueError
        If any requested column is absent from the file.
    """
    table = pd.read_csv(csv_path)
    table.columns = table.columns.str.strip()

    wanted = [*x_cols, y_col]
    absent = [col for col in wanted if col not in table.columns]
    if absent:
        raise ValueError(f"Columns not found: {absent}")

    complete_rows = table[wanted].dropna().copy()
    x1, x2, x3 = (complete_rows[x_cols[pos]].to_numpy() for pos in range(3))
    y = complete_rows[y_col].to_numpy()
    return x1, x2, x3, y


# i. Fit the models on the full dataset

In [143]:
def fit_full_dataset(x1, x2, x3, y, x_cols):
    """Fit the three candidate models on all samples and print a report.

    For each model the design matrix is built, the weights are obtained with
    ``OrdinaryLeastSquares_fit``, and the formula, the fitted coefficients and
    the training RMSE are printed.

    Parameters
    ----------
    x1, x2, x3 : array-like
        The three feature vectors passed to each model builder.
    y : array-like
        Target values.
    x_cols : sequence of str
        Feature names; the first three are used to label the printed
        coefficients.

    Returns
    -------
    list of tuple
        One ``(name, builder, X, w)`` entry per model, where ``X`` is the
        design matrix built from the full data and ``w`` the fitted weights.
    """
    # Each entry: display name, design-matrix builder, coefficient symbol,
    # human-readable formula. The formulas mirror the ones in the builders.
    models = [
        ("Model 1", build_model1, "alpha", "y = alpha1*x1 + alpha2*x2 + alpha3*x3 + alpha0"),
        ("Model 2", build_model2, "beta", "y = beta1*(x1^2) + beta2*(x2^2) + beta3*(x3^2) + beta0"),
        ("Model 3", build_model3, "gamma", "y = gamma1*(x1^3) + gamma2*(x2^3) + gamma3*(x3^3) + gamma0")
    ]
    results = []
    for name, builder, sym, form in models:
        X = builder(x1, x2, x3)
        w = OrdinaryLeastSquares_fit(X, y)
        y_pred = X @ w
        train_rmse = rmse(y, y_pred)

        print(f"\n{name} {form}")
        # The label order matches the column order of the design matrix:
        # the three features first, the intercept last.
        for label, val in zip(
            [f"{sym}1 ({x_cols[0]})", f"{sym}2 ({x_cols[1]})", f"{sym}3 ({x_cols[2]})", f"{sym}0 (bias)"],
            w.flatten()
        ):
            print(f" {label}: {float(val):.6f}")
        print(f"Training RMSE: {train_rmse:.6f}")
        results.append((name, builder, X, w))
    return results

# ii. Leave-one-out cross-validation (LOOCV)

In [144]:
def loocv_evaluate(x1, x2, x3, y, results):
    """Score every model with leave-one-out cross-validation and print a report.

    For each model the design matrix is rebuilt, then each sample in turn is
    held out, the weights are refit on the remaining samples, and the held-out
    sample is predicted. The RMSE over all held-out predictions is the model's
    score. The model with the lowest score is announced at the end.

    Parameters
    ----------
    x1, x2, x3 : array-like
        The three feature vectors passed to each model builder.
    y : numpy.ndarray
        Target values; indexed with the split indices.
    results : iterable of 4-tuples
        Entries of the form ``(name, builder, _, _)``; only the name and the
        builder are used here.

    Returns
    -------
    list of tuple
        One ``(name, loocv_rmse)`` pair per model, in input order.
    """
    scores = []
    for model_name, make_design, _full_X, _full_w in results:
        design = make_design(x1, x2, x3)
        held_out_preds = np.zeros_like(y, dtype=float)

        # Refit once per sample, predicting only the row that was left out.
        for fit_rows, held_out in LeaveOneOut().split(design):
            coef = OrdinaryLeastSquares_fit(design[fit_rows], y[fit_rows])
            held_out_preds[held_out] = (design[held_out] @ coef).ravel()

        cv_rmse = rmse(y, held_out_preds)
        scores.append((model_name, cv_rmse))
        print(f"LOOCV RMSE ({model_name}): {cv_rmse:.6f}")

    winner = min(scores, key=lambda pair: pair[1])
    print(f"\nBest Model based on LOOCV RMSE: {winner[0]}")
    return scores



# Run and Print Reports

In [145]:
def main():
    """Run the whole analysis: load the data, fit the models, run LOOCV.

    Reads the real-estate CSV, prints a short description of the dataset,
    then prints the full-data fit report followed by the LOOCV report.
    """
    data_file = "HW4_data/real_estate.csv"
    feature_names = [
        "X1 house age",
        "X2 distance to the nearest MRT station",
        "X3 number of convenience stores",
    ]
    target_name = "Y house price of unit area"

    x1, x2, x3, y = load_data(data_file, feature_names, target_name)

    summary_lines = (
        "Dataset Info",
        f"X columns: {feature_names}",
        f"y column: {target_name}",
        f"Samples: {len(y)}",
    )
    for line in summary_lines:
        print(line)

    fitted = fit_full_dataset(x1, x2, x3, y, feature_names)

    loocv_evaluate(x1, x2, x3, y, fitted)


# Run the analysis when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Dataset Info
X columns: ['X1 house age', 'X2 distance to the nearest MRT station', 'X3 number of convenience stores']
y column: Y house price of unit area
Samples: 414

Model 1 y = alpha1*x1 + alpha2*x2 + alpha3*x3 + alpha0
 alpha1 (X1 house age): -0.233104
 alpha2 (X2 distance to the nearest MRT station): -0.005512
 alpha3 (X3 number of convenience stores): 1.298058
 alpha0 (bias): 42.748076
Training RMSE: 10.087057

Model 2 y = beta1*(x1^2) + beta2*(x2^2) + beta3*(x3^2) + beta0
 beta1 (X1 house age): -0.003425
 beta2 (X2 distance to the nearest MRT station): -0.000001
 beta3 (X3 number of convenience stores): 0.179033
 beta0 (bias): 37.498945
Training RMSE: 11.176234

Model 3 y = gamma*(x1^3) + gamma2*(x2^3) + gamma3*(x3^3) + gamma0
 gamma1 (X1 house age): -0.000029
 gamma2 (X2 distance to the nearest MRT station): -0.000000
 gamma3 (X3 number of convenience stores): 0.017768
 gamma0 (bias): 36.714452
Training RMSE: 12.026067
LOOCV RMSE (Model 1): 10.197973
LOOCV RMSE (Model 2): 11.3