In [6]:
import pandas as pd # type: ignore
import numpy as np # type: ignore

from sklearn.preprocessing import PolynomialFeatures # type: ignore
from sklearn.model_selection import train_test_split, GridSearchCV # type: ignore
from sklearn.linear_model import LinearRegression, Lasso, Ridge # type: ignore
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # type: ignore
from sklearn.pipeline import make_pipeline # type: ignore
from sklearn.linear_model import LassoCV, RidgeCV # type: ignore

import matplotlib.pyplot as plt # type: ignore

### Import data

In [2]:
df_resale = pd.read_csv('../data/cleaned/standardized_encoded_delfated_resale_price.csv')
df_resale = df_resale.dropna()

In [13]:
df_resale.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196986 entries, 0 to 201058
Data columns (total 44 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   month                       196986 non-null  object 
 1   storey_range                196986 non-null  float64
 2   floor_area_sqm              196986 non-null  float64
 3   remaining_lease             196986 non-null  float64
 4   lat                         196986 non-null  float64
 5   lon                         196986 non-null  float64
 6   nearest_mrt_distance        196986 non-null  float64
 7   nearest_bus_distance        196986 non-null  float64
 8   education_score             196986 non-null  float64
 9   shopping_score              196986 non-null  float64
 10  food_score                  196986 non-null  float64
 11  recreation_score            196986 non-null  float64
 12  healthcare_score            196986 non-null  float64
 13  inflation_rate

Select features

In [3]:
feature_names = df_resale.columns.tolist()
exclude_cols = ['month', 'lat', 'lon','deflated_resale_price']  # Replace with actual column names
features_selected = [col for col in df_resale.columns if col not in exclude_cols]
X = df_resale[features_selected]
y = df_resale['deflated_resale_price']

Split data into training and test sets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MLE

In [8]:
# initialize and fit model

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# evaluation
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [None]:
print("Model Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

print("R-squared Score:", r2)
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

"""Intercept: 0.29854266905633947
Mean Squared Error: 0.134170112966285
R-squared Score: 0.8664435828963502

R-squared Score: 0.820683629869053
Mean Squared Error: 0.18179093755150225
Mean Absolute Error: 0.32170451015806445
"""

Model Coefficients: [ 1.88212170e-01  7.36915322e-01  4.37369518e-01 -9.23705179e-02
  8.27566387e-03  1.99567842e-02  7.20264049e-03  1.41679500e-01
  1.50529403e-01  3.00876776e-02  8.75853039e-03 -1.98565769e-02
 -2.66511578e-02 -2.41345107e-02 -6.23612046e-02 -3.43224617e+00
 -2.35822669e+00 -3.68098236e+00 -4.20612963e+00 -1.06708105e+01
 -5.94212820e+00 -5.23893356e-01 -8.97651616e+00  3.29833179e+00
  2.20424144e+00  5.81738455e+00  3.10529283e-01 -3.66668496e-01
 -2.16743116e-01 -5.25999246e-01 -4.22499178e-01 -7.92436186e-01
 -3.85420067e-01 -3.13692243e-01 -9.24045584e-01 -7.41808573e-01
 -4.29064664e-01 -4.38639747e-01 -2.42330995e-01 -5.42107580e-01
 -8.81892219e-01 -4.75720436e-01 -6.91884747e-01 -1.09511876e+00
 -8.73753796e-01 -1.19617507e+00 -1.35895699e+00 -7.72152630e-01
 -7.14371372e-01 -4.11068608e-01 -7.52962293e-01]
Intercept: -0.07444549865436037
R-squared Score: 0.820683629869053
Mean Squared Error: 0.18179093755150225
Mean Absolute Error: 0.32170451015806445


'Intercept: 0.29854266905633947\nMean Squared Error: 0.134170112966285\nR-squared Score: 0.8664435828963502\n\nMean Absolute Error:\nMean Squared Error: 0.18179093755150225\nR-squared Score: 0.820683629869053\n'

# Lasso

In [10]:
param_grid = {'alpha': np.logspace(-3, 1, 50)}

lasso = Lasso()
lasso_cv = GridSearchCV(lasso, param_grid, cv=5, scoring='r2')
lasso_cv.fit(X_train, y_train)

best_lasso = lasso_cv.best_estimator_

y_pred = best_lasso.predict(X_test)

mse_l = mean_squared_error(y_test, y_pred)
r2_l = r2_score(y_test, y_pred)
mae_l = mean_absolute_error(y_test, y_pred)

In [None]:
print("Best Lasso Alpha: ", lasso_cv.best_params_['alpha'])
print("Model Coefficients:", best_lasso.coef_)
print("Intercept:", best_lasso.intercept_)

print("R-squared Score:", r2_l)
print("Mean Squared Error:", mse_l)
print("Mean Absolute Error:", mae_l)


"""
Mean Squared Error: 0.11497220634118055
R-squared Score: 0.8855536780438713 

- Remove correlated variables
Mean Squared Error: 0.13730048632566919
R-squared Score: 0.8633275279059133

- Add ethnicity and religion
R-squared Score: 0.8138255990869248
Mean Squared Error: 0.18874360921627956
Mean Absolute Error: 0.3273777320063639
"""

Best Lasso Alpha:  0.001
Model Coefficients: [ 0.19727525  0.73962783  0.42736761 -0.08120503  0.00828636 -0.01898399
  0.03329117  0.22309096  0.13290406  0.03324895  0.00887151 -0.01549724
 -0.02314507 -0.02322047 -0.05769485  0.          0.         -0.
 -0.         -0.          0.         -0.         -0.          0.
 -0.          0.          0.5017501   0.16707672  0.09120191 -0.00306541
 -0.02511385  0.         -0.          0.         -0.16555428 -0.10346898
  0.02189891  0.          0.11522923 -0.04797274 -0.07575438 -0.02130215
 -0.16665414 -0.30781545 -0.35606383 -0.66161145 -0.62545877 -0.39373447
 -0.07315511 -0.19510635 -0.42347356]
Intercept: 0.14565977976953431
R-squared Score: 0.8138255990869248
Mean Squared Error: 0.18874360921627956
Mean Absolute Error: 0.3273777320063639


'\nMean Squared Error: 0.11497220634118055\nR-squared Score: 0.8855536780438713 \n\n- Remove correlated variables\nMean Squared Error: 0.13730048632566919\nR-squared Score: 0.8633275279059133\n\n- Add ethnicity and religion\nMean Squared Error: 0.18874360921627956\nR-squared Score: 0.8138255990869248\n'

Test for Interaction Effect - make no sense for location-related variables

In [None]:
# Create all pairwise interaction terms (without squared terms)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Use LassoCV for automatic feature selection with regularization
model = make_pipeline(poly, LassoCV(cv=5)).fit(X, y)

# Get coefficients
lasso = model.named_steps['lassocv']
feature_names = model.named_steps['polynomialfeatures'].get_feature_names_out(X.columns)


coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso.coef_})
selected_features = coef_df[coef_df['Coefficient'] != 0]

print(selected_features)

                                              Feature  Coefficient
0                                        storey_range     0.149198
1                                      floor_area_sqm     0.697456
2                                     remaining_lease     0.373798
3                                nearest_mrt_distance    -0.060381
4                                nearest_bus_distance     0.002442
..                                                ...          ...
605  inflation_rate (x100) resident_unemployment_rate     0.003342
606               inflation_rate (x100) interest_rate    -0.004584
655                resident_unemployment_rate fx_rate    -0.015289
656   resident_unemployment_rate avg_household_income     0.020327
749                      fx_rate avg_household_income     0.002210

[99 rows x 2 columns]


# Ridge

In [12]:
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, param_grid, cv=5, scoring='r2')
ridge_cv.fit(X_train, y_train)
best_ridge = ridge_cv.best_estimator_

y_pred = best_ridge.predict(X_test)

mse_r = mean_squared_error(y_test, y_pred)
r2_r = r2_score(y_test, y_pred)
mae_r = mean_absolute_error(y_test, y_pred)

In [None]:
print("Best Ridge Alpha: ", ridge_cv.best_params_['alpha'])
print("Model Coefficients:", best_ridge.coef_)
print("Intercept:", best_ridge.intercept_)


print("R-squared Score:", r2_r)
print("Mean Squared Error:", mse_r)
print("Mean Absolute Error:", mae_r)

"""Amenities_data_cleaning.ipynbIntercept: 0.15030498871214715
Mean Squared Error: 0.10682822058977902
R-squared Score: 0.893660413097256

Mean Squared Error: 0.1341701364928894
R-squared Score: 0.8664435594773567

R-squared Score: 0.8206842557831927
Mean Squared Error: 0.1817903029997421
Mean Absolute Error: 0.3217016858124708
"""

Best Ridge Alpha:  0.0021209508879201904
Model Coefficients: [ 1.88220366e-01  7.36922651e-01  4.37386477e-01 -9.23710880e-02
  8.27600031e-03  1.99758298e-02  7.20271113e-03  1.41711178e-01
  1.50509167e-01  3.00859612e-02  8.75825913e-03 -1.98568514e-02
 -2.66536695e-02 -2.41376641e-02 -6.23572556e-02 -3.36635397e+00
 -2.32507882e+00 -3.62266231e+00 -4.18117703e+00 -1.03302270e+01
 -5.81718207e+00 -4.98592790e-01 -8.18255634e+00  3.30661051e+00
  2.22361490e+00  5.82495402e+00  3.07869895e-01 -3.64832021e-01
 -2.16542899e-01 -5.26256561e-01 -4.22559627e-01 -7.87811358e-01
 -3.86605851e-01 -3.13898212e-01 -9.22986791e-01 -7.40683169e-01
 -4.31848336e-01 -4.37706016e-01 -2.43375983e-01 -5.43279714e-01
 -8.85568824e-01 -4.78737559e-01 -6.92967044e-01 -1.09383182e+00
 -8.74634992e-01 -1.19699397e+00 -1.35932442e+00 -7.75057863e-01
 -7.15709317e-01 -4.14811160e-01 -7.54957144e-01]
Intercept: -0.13183709786631248
R-squared Score: 0.8206842557831927
Mean Squared Error: 0.1817903029997421
Me

'Amenities_data_cleaning.ipynbIntercept: 0.15030498871214715\nMean Squared Error: 0.10682822058977902\nR-squared Score: 0.893660413097256\n\nMean Squared Error: 0.1341701364928894\nR-squared Score: 0.8664435594773567\n\nMean Squared Error: 0.1817903029997421\nR-squared Score: 0.8206842557831927\n'

Check for Multicollinearity

In [31]:
from statsmodels.stats.outliers_influence import variance_inflation_factor # type: ignore

# Compute VIF for each feature
X_train_vif = X_train.copy()  # Ensure we work with a copy
vif_data = pd.DataFrame()
vif_data["Feature"] = X_train_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_vif.values, i) for i in range(X_train_vif.shape[1])]

vif_data

Unnamed: 0,Feature,VIF
0,storey_range,1.24079
1,floor_area_sqm,1.181057
2,remaining_lease,1.779705
3,nearest_mrt_distance,1.890704
4,nearest_bus_distance,1.053948
5,education_score,2.389122
6,shopping_score,2.534772
7,food_score,5.119843
8,recreation_score,3.062364
9,healthcare_score,6.323754


In [58]:
vif_data

Unnamed: 0,Feature,VIF
0,flat_type,21.410640
1,storey_range,1.268476
2,floor_area_sqm,15.247786
3,remaining_lease,316.782651
4,nearest_mrt_distance,1.933301
...,...,...
58,flat_model_Simplified,3.268270
59,flat_model_Standard,2.836043
60,flat_model_Terrace,1.082173
61,flat_model_Type S1,1.445011


Check for Interactive Effect

In [55]:
correlations = X_train.corrwith(y_train)
print(correlations.abs().sort_values(ascending=False))

floor_area_sqm          0.646445
flat_type               0.640392
storey_range            0.372191
building_age_2025       0.350866
remaining_lease         0.350195
                          ...   
town_GEYLANG            0.015010
flat_model_3Gen         0.010742
town_HOUGANG            0.008931
town_SENGKANG           0.005379
avg_household_income    0.003691
Length: 63, dtype: float64


In [56]:
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

# Create polynomial feature transformer (degree=2 for interaction terms)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Transform the dataset
X_train_interactions = poly.fit_transform(X_train)

# Create feature names
feature_names = poly.get_feature_names_out(X_train.columns)

# Convert back to DataFrame
X_train_interactions = pd.DataFrame(X_train_interactions, columns=feature_names)


In [54]:
# Compute correlation of interaction terms with target
interaction_correlations = X_train_interactions.corrwith(y_train)

# Sort by absolute correlation value
strong_interactions = interaction_correlations.abs().sort_values(ascending=False)

# Display top 10 most relevant interaction terms
print("Top 10 Strongest Interaction Terms:")
print(strong_interactions.head(10))


Top 10 Strongest Interaction Terms:
storey_range town_CHOA CHU KANG              0.011254
town_BISHAN flat_model_Adjoined flat         0.009512
remaining_lease nearest_bus_distance         0.008969
nearest_bus_distance building_age_2025       0.008896
education_score healthcare_score             0.008872
total_unemployment_rate town_TAMPINES        0.008850
resident_unemployment_rate town_TAMPINES     0.008792
nearest_bus_distance town_PASIR RIS          0.008695
floor_area_sqm fx_rate                       0.008174
town_QUEENSTOWN flat_model_New Generation    0.007437
dtype: float64
