In [8]:
import pandas as pd # type: ignore
import numpy as np # type: ignore

from sklearn.preprocessing import PolynomialFeatures # type: ignore
from sklearn.model_selection import train_test_split, GridSearchCV # type: ignore
from sklearn.linear_model import LinearRegression, Lasso, Ridge # type: ignore
from sklearn.metrics import mean_squared_error, r2_score # type: ignore
from sklearn.pipeline import make_pipeline # type: ignore
from sklearn.linear_model import LassoCV, RidgeCV # type: ignore

import matplotlib.pyplot as plt # type: ignore

### Import data

In [21]:
df_resale = pd.read_csv('../data/cleaned/standardized_encoded_delfated_resale_price.csv')
df_resale = df_resale.dropna()

In [13]:
df_resale.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196986 entries, 0 to 201058
Data columns (total 44 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   month                       196986 non-null  object 
 1   storey_range                196986 non-null  float64
 2   floor_area_sqm              196986 non-null  float64
 3   remaining_lease             196986 non-null  float64
 4   lat                         196986 non-null  float64
 5   lon                         196986 non-null  float64
 6   nearest_mrt_distance        196986 non-null  float64
 7   nearest_bus_distance        196986 non-null  float64
 8   education_score             196986 non-null  float64
 9   shopping_score              196986 non-null  float64
 10  food_score                  196986 non-null  float64
 11  recreation_score            196986 non-null  float64
 12  healthcare_score            196986 non-null  float64
 13  inflation_rate

Select features

In [22]:
feature_names = df_resale.columns.tolist()
exclude_cols = ['month', 'lat', 'lon','deflated_resale_price']  # Replace with actual column names
features_selected = [col for col in df_resale.columns if col not in exclude_cols]
X = df_resale[features_selected]
y = df_resale['deflated_resale_price']

Split data into training and test sets

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MLE

In [25]:
# initialize and fit model

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [28]:
print("Model Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

Model Coefficients: [ 1.75964560e-01  7.31075377e-01  4.71413041e-01 -1.40588742e-01
  4.09875384e-04 -3.33911610e-02  2.11691460e-02  6.01278823e-02
  6.99392000e-02  1.14465983e-01 -3.06602995e-04 -5.23355233e-02
 -5.06989144e-02 -1.87315885e-02 -1.08361705e-01 -2.98277188e-02
  6.38606978e-01 -2.88248110e-01  3.78676685e-01 -6.80844887e-01
  1.24628926e+00  2.91516893e-01 -7.47349441e-01  3.86476908e-01
 -1.68608960e-02 -4.15121320e-01 -1.31873142e-01 -6.41573806e-01
  1.35496930e-01  1.37901113e+00 -2.76952453e-01 -7.35645268e-01
  7.10037970e-01 -8.63822733e-01 -1.10545323e+00  2.39397982e-01
  2.59886088e-02  3.25708636e-01 -7.55410311e-01 -2.79393803e-01]
Intercept: 0.29854266905633947
Mean Squared Error: 0.134170112966285
R-squared Score: 0.8664435828963502


# Lasso

In [29]:
param_grid = {'alpha': np.logspace(-3, 1, 50)}

lasso = Lasso()
lasso_cv = GridSearchCV(lasso, param_grid, cv=5, scoring='r2')
lasso_cv.fit(X_train, y_train)

best_lasso = lasso_cv.best_estimator_

y_pred = best_lasso.predict(X_test)

mse_l = mean_squared_error(y_test, y_pred)
r2_l = r2_score(y_test, y_pred)

In [30]:
print("Best Lasso Alpha: ", lasso_cv.best_params_['alpha'])
print("Model Coefficients:", best_lasso.coef_)
print("Intercept:", best_lasso.intercept_)
print("Mean Squared Error:", mse_l)
print("R-squared Score:", r2_l)

"""
Intercept: 0.24731787132493452
Mean Squared Error: 0.11497220634118055
R-squared Score: 0.8855536780438713 
"""

Best Lasso Alpha:  0.001
Model Coefficients: [ 0.18213533  0.73106664  0.45658102 -0.12300947  0.00167682 -0.05932936
  0.01616479  0.122611    0.1106052   0.13987858  0.00168095 -0.04131501
 -0.04118249 -0.01906917 -0.09657023  0.01103406  0.62007379 -0.07905421
  0.28705655 -0.42459734  0.91870611  0.         -0.46178418  0.4286543
 -0.         -0.26509975  0.         -0.48168566  0.05271409  1.23152996
 -0.02641811 -0.4627159   0.70579153 -0.5895254  -0.89637663  0.21548913
  0.22705155  0.30989705 -0.46874231 -0.03391912]
Intercept: 0.15166262794838736
Mean Squared Error: 0.13730048632566919
R-squared Score: 0.8633275279059133


'\nIntercept: 0.24731787132493452\nMean Squared Error: 0.11497220634118055\nR-squared Score: 0.8855536780438713 \n'

Test for Interaction Effect - make no sense for location-related variables

In [None]:
# Create all pairwise interaction terms (without squared terms)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Use LassoCV for automatic feature selection with regularization
model = make_pipeline(poly, LassoCV(cv=5)).fit(X, y)

# Get coefficients
lasso = model.named_steps['lassocv']
feature_names = model.named_steps['polynomialfeatures'].get_feature_names_out(X.columns)


coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso.coef_})
selected_features = coef_df[coef_df['Coefficient'] != 0]

print(selected_features)

                                              Feature  Coefficient
0                                        storey_range     0.149198
1                                      floor_area_sqm     0.697456
2                                     remaining_lease     0.373798
3                                nearest_mrt_distance    -0.060381
4                                nearest_bus_distance     0.002442
..                                                ...          ...
605  inflation_rate (x100) resident_unemployment_rate     0.003342
606               inflation_rate (x100) interest_rate    -0.004584
655                resident_unemployment_rate fx_rate    -0.015289
656   resident_unemployment_rate avg_household_income     0.020327
749                      fx_rate avg_household_income     0.002210

[99 rows x 2 columns]


# Ridge

In [32]:
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, param_grid, cv=5, scoring='r2')
ridge_cv.fit(X_train, y_train)
best_ridge = ridge_cv.best_estimator_

y_pred = best_ridge.predict(X_test)

mse_r = mean_squared_error(y_test, y_pred)
r2_r = r2_score(y_test, y_pred)

In [33]:
print("Best Ridge Alpha: ", ridge_cv.best_params_['alpha'])
print("Model Coefficients:", best_ridge.coef_)
print("Intercept:", best_ridge.intercept_)
print("Mean Squared Error:", mse_r)
print("R-squared Score:", r2_r)

"""Amenities_data_cleaning.ipynbIntercept: 0.15030498871214715
Mean Squared Error: 0.10682822058977902
R-squared Score: 0.893660413097256
"""

Best Ridge Alpha:  0.029470517025518096
Model Coefficients: [ 1.75965351e-01  7.31075347e-01  4.71410900e-01 -1.40585789e-01
  4.10171409e-04 -3.33953673e-02  2.11668941e-02  6.01365086e-02
  6.99467311e-02  1.14470069e-01 -3.06390839e-04 -5.23349083e-02
 -5.06984963e-02 -1.87316717e-02 -1.08360871e-01 -2.98092057e-02
  6.38613482e-01 -2.88209831e-01  3.78677190e-01 -6.80796426e-01
  1.24621523e+00  2.91488425e-01 -7.47296438e-01  3.86495916e-01
 -1.68533864e-02 -4.15091268e-01 -1.31836755e-01 -6.41542392e-01
  1.35500950e-01  1.37898033e+00 -2.76909612e-01 -7.35592858e-01
  7.10049660e-01 -8.63771738e-01 -1.10541218e+00  2.39407481e-01
  2.60317609e-02  3.25719218e-01 -7.55357286e-01 -2.79349284e-01]
Intercept: 0.29851088194310266
Mean Squared Error: 0.1341701364928894
R-squared Score: 0.8664435594773567


'Amenities_data_cleaning.ipynbIntercept: 0.15030498871214715\nMean Squared Error: 0.10682822058977902\nR-squared Score: 0.893660413097256\n'

Check for Multicollinearity

In [31]:
from statsmodels.stats.outliers_influence import variance_inflation_factor # type: ignore

# Compute VIF for each feature
X_train_vif = X_train.copy()  # Ensure we work with a copy
vif_data = pd.DataFrame()
vif_data["Feature"] = X_train_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_vif.values, i) for i in range(X_train_vif.shape[1])]

vif_data

Unnamed: 0,Feature,VIF
0,storey_range,1.24079
1,floor_area_sqm,1.181057
2,remaining_lease,1.779705
3,nearest_mrt_distance,1.890704
4,nearest_bus_distance,1.053948
5,education_score,2.389122
6,shopping_score,2.534772
7,food_score,5.119843
8,recreation_score,3.062364
9,healthcare_score,6.323754


In [58]:
vif_data

Unnamed: 0,Feature,VIF
0,flat_type,21.410640
1,storey_range,1.268476
2,floor_area_sqm,15.247786
3,remaining_lease,316.782651
4,nearest_mrt_distance,1.933301
...,...,...
58,flat_model_Simplified,3.268270
59,flat_model_Standard,2.836043
60,flat_model_Terrace,1.082173
61,flat_model_Type S1,1.445011


Check for Interactive Effect

In [55]:
correlations = X_train.corrwith(y_train)
print(correlations.abs().sort_values(ascending=False))

floor_area_sqm          0.646445
flat_type               0.640392
storey_range            0.372191
building_age_2025       0.350866
remaining_lease         0.350195
                          ...   
town_GEYLANG            0.015010
flat_model_3Gen         0.010742
town_HOUGANG            0.008931
town_SENGKANG           0.005379
avg_household_income    0.003691
Length: 63, dtype: float64


In [56]:
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

# Create polynomial feature transformer (degree=2 for interaction terms)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Transform the dataset
X_train_interactions = poly.fit_transform(X_train)

# Create feature names
feature_names = poly.get_feature_names_out(X_train.columns)

# Convert back to DataFrame
X_train_interactions = pd.DataFrame(X_train_interactions, columns=feature_names)


In [54]:
# Compute correlation of interaction terms with target
interaction_correlations = X_train_interactions.corrwith(y_train)

# Sort by absolute correlation value
strong_interactions = interaction_correlations.abs().sort_values(ascending=False)

# Display top 10 most relevant interaction terms
print("Top 10 Strongest Interaction Terms:")
print(strong_interactions.head(10))


Top 10 Strongest Interaction Terms:
storey_range town_CHOA CHU KANG              0.011254
town_BISHAN flat_model_Adjoined flat         0.009512
remaining_lease nearest_bus_distance         0.008969
nearest_bus_distance building_age_2025       0.008896
education_score healthcare_score             0.008872
total_unemployment_rate town_TAMPINES        0.008850
resident_unemployment_rate town_TAMPINES     0.008792
nearest_bus_distance town_PASIR RIS          0.008695
floor_area_sqm fx_rate                       0.008174
town_QUEENSTOWN flat_model_New Generation    0.007437
dtype: float64
