In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Name of the response column: it is split off from the feature columns and
# used as the regression target.
TARGET: str = "price"

def _encode_features(X_train, X_test):
    """One-hot encode both feature frames using the training columns as reference.

    Args:
        X_train: Training features (target column already removed).
        X_test: Test features (target column already removed).

    Returns:
        A ``(X_train, X_test)`` tuple of encoded frames that share exactly the
        same columns in the same order (those of the encoded training frame).
    """
    X_train = pd.get_dummies(X_train, drop_first=True)

    # Do NOT pass drop_first=True here. The "first" level is chosen per frame,
    # so if the test split lacks the training baseline level, a different
    # level would be dropped and then zero-filled during alignment, silently
    # turning those rows into the training baseline category. Encoding every
    # level and reindexing to the training columns keeps the baseline fixed.
    X_test = pd.get_dummies(X_test)

    # Columns the model never saw are discarded; training columns missing from
    # the test split are added as all-zero indicators.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
    return X_train, X_test


def main(train_path="train.csv", test_path="test.csv"):
    """Fit a multiple linear regression on the cleaned data and print a report.

    Reads the train and test CSV files, separates the ``TARGET`` column from
    the features, one-hot encodes any remaining categorical columns, fits an
    ordinary least squares model, and prints train/test MSE and R^2, the
    intercept, and the 15 coefficients with the largest absolute value.

    Args:
        train_path: Path to the cleaned training CSV. Must contain ``TARGET``.
        test_path: Path to the cleaned test CSV. Must contain ``TARGET``.

    Returns:
        None. All results are written to stdout.

    Raises:
        KeyError: If the ``TARGET`` column is missing from either file.
    """
    # Already cleaned train and test data.
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Split X/y (per the cleaning step, price is already divided by 1000).
    y_train = train_df[TARGET].to_numpy(dtype=float)
    y_test = test_df[TARGET].to_numpy(dtype=float)

    # zipcode/id/date were removed during cleaning, so only the target is
    # dropped here. Any categoricals that still exist are one-hot encoded.
    X_train, X_test = _encode_features(
        train_df.drop(columns=[TARGET]),
        test_df.drop(columns=[TARGET]),
    )
    feature_names = X_train.columns.tolist()

    # Fit model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Metrics
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print("=== Problem 2: Multiple Linear Regression ===")
    print("Train MSE:", train_mse)
    print("Train R^2:", train_r2)
    print("Test  MSE:", test_mse)
    print("Test  R^2:", test_r2)

    # Coefficients, ranked by magnitude so the sign does not affect the order.
    coef_df = pd.DataFrame({
        "feature": feature_names,
        "coef": model.coef_,
    }).sort_values("coef", key=np.abs, ascending=False)

    print("\nIntercept:", model.intercept_)
    print("\nTop 15 coefficients by |value|:")
    print(coef_df.head(15).to_string(index=False))

# Run the regression report only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


=== Problem 2: Multiple Linear Regression ===
Train MSE: 31415.747916100867
Train R^2: 0.7271450489303788
Test  MSE: 58834.673978213985
Test  R^2: 0.6471195893437872

Intercept: 520.414834000001

Top 15 coefficients by |value|:
      feature       coef
        grade  92.557366
          lat  78.168947
     yr_built -68.077221
   waterfront  64.263051
  sqft_living  57.190184
   sqft_above  48.463289
         view  47.634111
sqft_living15  45.501884
sqft_basement  27.702667
    bathrooms  18.466149
 yr_renovated  17.350603
   sqft_lot15 -12.913018
     bedrooms -12.813747
    condition  12.653938
     sqft_lot  11.132906
