In [1]:
import pandas as pd # type: ignore
import numpy as np # type: ignore

from sklearn.preprocessing import PolynomialFeatures # type: ignore
from sklearn.model_selection import train_test_split, GridSearchCV # type: ignore
from sklearn.linear_model import LinearRegression, Lasso, Ridge # type: ignore
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # type: ignore
from sklearn.pipeline import make_pipeline # type: ignore
from sklearn.linear_model import LassoCV, RidgeCV # type: ignore

import matplotlib.pyplot as plt # type: ignore

### Import data

In [2]:
df_resale = pd.read_csv('../data/cleaned/standardized_encoded_delfated_resale_price.csv')
df_resale = df_resale.dropna()

In [3]:
df_resale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196986 entries, 0 to 196985
Data columns (total 58 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   month                         196986 non-null  int64  
 1   storey_range                  196986 non-null  float64
 2   floor_area_sqm                196986 non-null  float64
 3   remaining_lease               196986 non-null  float64
 4   resale_price                  196986 non-null  float64
 5   lat                           196986 non-null  float64
 6   lon                           196986 non-null  float64
 7   nearest_mrt_distance          196986 non-null  float64
 8   nearest_bus_distance          196986 non-null  float64
 9   education_score               196986 non-null  float64
 10  shopping_score                196986 non-null  float64
 11  food_score                    196986 non-null  float64
 12  recreation_score              196986 non-nul

Select features

In [4]:
feature_names = df_resale.columns.tolist()
exclude_cols = ['resale_price']  # Replace with actual column names
features_selected = [col for col in df_resale.columns if col not in exclude_cols]
X = df_resale[features_selected]
y = df_resale['resale_price']

Split data into training and test sets

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39398 entries, 64633 to 15286
Data columns (total 57 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   month                         39398 non-null  int64  
 1   storey_range                  39398 non-null  float64
 2   floor_area_sqm                39398 non-null  float64
 3   remaining_lease               39398 non-null  float64
 4   lat                           39398 non-null  float64
 5   lon                           39398 non-null  float64
 6   nearest_mrt_distance          39398 non-null  float64
 7   nearest_bus_distance          39398 non-null  float64
 8   education_score               39398 non-null  float64
 9   shopping_score                39398 non-null  float64
 10  food_score                    39398 non-null  float64
 11  recreation_score              39398 non-null  float64
 12  healthcare_score              39398 non-null  float64
 1

## Test Variable Significance

In [None]:
import statsmodels.api as sm

X_sm = sm.add_constant(X)  # manually add intercept
model_sm = sm.OLS(y, X_sm).fit()
print(model_sm.summary())

                            OLS Regression Results                            
Dep. Variable:           resale_price   R-squared:                       0.875
Model:                            OLS   Adj. R-squared:                  0.875
Method:                 Least Squares   F-statistic:                 2.417e+04
Date:                Thu, 03 Apr 2025   Prob (F-statistic):               0.00
Time:                        15:01:48   Log-Likelihood:                -74764.
No. Observations:              196986   AIC:                         1.496e+05
Df Residuals:                  196928   BIC:                         1.502e+05
Df Model:                          57                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           

# MLR

In [31]:
# initialize and fit model

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# evaluation
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [None]:
print("Model Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

print("R-squared Score:", r2)
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

"""Intercept: 0.29854266905633947
Mean Squared Error: 0.134170112966285
R-squared Score: 0.8664435828963502

R-squared Score: 0.820683629869053
Mean Squared Error: 0.18179093755150225
Mean Absolute Error: 0.32170451015806445

R-squared Score: 0.8743349394773806
Mean Squared Error: 0.1251134439544029
Mean Absolute Error: 0.2678552585899168
"""

Model Coefficients: [ 1.16596772e-02  1.70342935e-01  7.20685596e-01  4.67390551e-01
 -6.45663151e+00 -1.42767564e-01 -1.33725822e-01  6.31466040e-04
 -2.02302930e-02  1.78038843e-02  5.06194854e-02  5.93971016e-02
  1.21066624e-01 -8.27250037e-03 -1.00941436e-01 -5.92722059e-02
 -1.33900508e-03 -4.55984405e-02  1.01287847e-01  5.25472547e-01
  6.72452373e-01  3.31858492e-01  1.46359622e+00  2.06220017e-01
  4.21703669e-02  4.08765741e-01  6.92628078e-01  2.07218183e-02
 -2.15482739e-01 -1.84544143e-01 -3.42715875e-02 -2.44985961e-02
 -3.95797836e-01  7.11998290e-01 -5.02307337e-01 -2.17679414e-01
 -7.95751179e-01  1.38953810e+00 -3.12853776e-01 -8.66569527e-01
  9.05597155e-03 -4.55510119e-01 -5.11702227e-01 -5.18877908e-01
 -1.11904643e+00 -3.42566956e-01  1.13459988e+00 -4.27516484e-01
 -7.13999109e-01  2.88603417e-01 -5.04958757e-01 -1.04810700e+00
  2.87590195e-01 -4.24042469e-01  8.99076786e-02 -5.97819018e-01
 -1.42959719e-01]
Intercept: -180.63420747358478
R-squared Score: 0.87

'Intercept: 0.29854266905633947\nMean Squared Error: 0.134170112966285\nR-squared Score: 0.8664435828963502\n\nR-squared Score: 0.820683629869053\nMean Squared Error: 0.18179093755150225\nMean Absolute Error: 0.32170451015806445\n'

# Lasso

In [8]:
param_grid = {'alpha': np.logspace(-3, 1, 50)}

In [9]:
lasso = Lasso()
lasso_cv = GridSearchCV(lasso, param_grid, cv=5, scoring='r2')
lasso_cv.fit(X_train, y_train)

best_lasso = lasso_cv.best_estimator_

y_pred = best_lasso.predict(X_test)

mse_l = mean_squared_error(y_test, y_pred)
r2_l = r2_score(y_test, y_pred)
mae_l = mean_absolute_error(y_test, y_pred)

In [10]:
coef_df = pd.DataFrame({
    'Variable': X.columns,
    'Coefficient': best_lasso.coef_
})
print(coef_df)

                        Variable  Coefficient
0                          month     0.011219
1                   storey_range     0.177244
2                 floor_area_sqm     0.719895
3                remaining_lease     0.454567
4                            lat    -0.000000
5                            lon     0.000000
6           nearest_mrt_distance    -0.127188
7           nearest_bus_distance     0.002363
8                education_score    -0.053099
9                 shopping_score     0.029087
10                    food_score     0.125257
11              recreation_score     0.086317
12              healthcare_score     0.120994
13         inflation_rate (x100)    -0.005082
14    resident_unemployment_rate    -0.093861
15                 interest_rate    -0.054815
16                       fx_rate    -0.000245
17          avg_household_income    -0.037628
18                          year     0.102623
19                    NoReligion     0.026525
20                      Buddhism  

In [None]:
print("Best Lasso Alpha: ", lasso_cv.best_params_['alpha'])
print("Intercept:", best_lasso.intercept_)

print("R-squared Score:", r2_l)
print("Mean Squared Error:", mse_l)
print("Mean Absolute Error:", mae_l)


"""
Mean Squared Error: 0.11497220634118055
R-squared Score: 0.8855536780438713 

- Remove correlated variables
Mean Squared Error: 0.13730048632566919
R-squared Score: 0.8633275279059133

- Add ethnicity and religion
R-squared Score: 0.8138255990869248
Mean Squared Error: 0.18874360921627956
Mean Absolute Error: 0.3273777320063639

R-squared Score: 0.8692630825073766
Mean Squared Error: 0.13016303761331124
Mean Absolute Error: 0.2711787729697815

"""

Best Lasso Alpha:  0.001
Model Coefficients: [ 1.12191233e-02  1.77244369e-01  7.19895256e-01  4.54567486e-01
 -0.00000000e+00  0.00000000e+00 -1.27187889e-01  2.36262825e-03
 -5.30993030e-02  2.90868176e-02  1.25257153e-01  8.63166223e-02
  1.20994147e-01 -5.08196914e-03 -9.38611324e-02 -5.48154967e-02
 -2.44824589e-04 -3.76280285e-02  1.02622613e-01  2.65254679e-02
  0.00000000e+00 -1.42033619e-02 -6.69870715e-02 -5.72019753e-03
  2.74181637e-03  1.09840344e-02  1.87127717e-01  1.57656260e-02
  0.00000000e+00 -1.73959688e-02  1.19560509e-02 -3.46547083e-03
 -0.00000000e+00  0.00000000e+00 -1.41673141e-01  6.60176577e-02
 -3.41174426e-01 -0.00000000e+00  3.67830803e-02 -2.40005528e-01
  3.48105627e-02  0.00000000e+00 -3.00272553e-01  1.94380338e-01
 -4.27860279e-02  8.63706565e-02  4.09088085e-01 -1.16632063e-01
 -5.13346224e-01  9.57448647e-02 -4.68723377e-01 -7.64626325e-01
 -1.36731865e-01  3.69877458e-01  4.67422222e-02 -0.00000000e+00
  2.44093759e-01]
Intercept: -207.33825763865

'\nMean Squared Error: 0.11497220634118055\nR-squared Score: 0.8855536780438713 \n\n- Remove correlated variables\nMean Squared Error: 0.13730048632566919\nR-squared Score: 0.8633275279059133\n\n- Add ethnicity and religion\nR-squared Score: 0.8138255990869248\nMean Squared Error: 0.18874360921627956\nMean Absolute Error: 0.3273777320063639\n'

Test for Interaction Effect - make no sense for location-related variables

In [None]:
# Create all pairwise interaction terms (without squared terms)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Use LassoCV for automatic feature selection with regularization
model = make_pipeline(poly, LassoCV(cv=5)).fit(X, y)

# Get coefficients
lasso = model.named_steps['lassocv']
feature_names = model.named_steps['polynomialfeatures'].get_feature_names_out(X.columns)


coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso.coef_})
selected_features = coef_df[coef_df['Coefficient'] != 0]

print(selected_features)

                                              Feature  Coefficient
0                                        storey_range     0.149198
1                                      floor_area_sqm     0.697456
2                                     remaining_lease     0.373798
3                                nearest_mrt_distance    -0.060381
4                                nearest_bus_distance     0.002442
..                                                ...          ...
605  inflation_rate (x100) resident_unemployment_rate     0.003342
606               inflation_rate (x100) interest_rate    -0.004584
655                resident_unemployment_rate fx_rate    -0.015289
656   resident_unemployment_rate avg_household_income     0.020327
749                      fx_rate avg_household_income     0.002210

[99 rows x 2 columns]


# Ridge

In [36]:
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, param_grid, cv=5, scoring='r2')
ridge_cv.fit(X_train, y_train)
best_ridge = ridge_cv.best_estimator_

y_pred = best_ridge.predict(X_test)

mse_r = mean_squared_error(y_test, y_pred)
r2_r = r2_score(y_test, y_pred)
mae_r = mean_absolute_error(y_test, y_pred)

In [None]:
print("Best Ridge Alpha: ", ridge_cv.best_params_['alpha'])
print("Model Coefficients:", best_ridge.coef_)
print("Intercept:", best_ridge.intercept_)


print("R-squared Score:", r2_r)
print("Mean Squared Error:", mse_r)
print("Mean Absolute Error:", mae_r)

"""Amenities_data_cleaning.ipynbIntercept: 0.15030498871214715
Mean Squared Error: 0.10682822058977902
R-squared Score: 0.893660413097256

Mean Squared Error: 0.1341701364928894
R-squared Score: 0.8664435594773567

R-squared Score: 0.8206842557831927
Mean Squared Error: 0.1817903029997421
Mean Absolute Error: 0.3217016858124708

R-squared Score: 0.8743329781471458
Mean Squared Error: 0.1251153966752259
Mean Absolute Error: 0.2678416516290374
"""

Best Ridge Alpha:  0.0625055192527397
Model Coefficients: [ 1.16599177e-02  1.70352412e-01  7.20694547e-01  4.67393007e-01
 -6.39314301e+00 -1.32737702e-01 -1.33770426e-01  6.20710964e-04
 -2.03488583e-02  1.77754895e-02  5.07254178e-02  5.95458837e-02
  1.20990245e-01 -8.26901433e-03 -1.00941493e-01 -5.92835977e-02
 -1.33496468e-03 -4.55763237e-02  1.01296954e-01  4.71037671e-01
  6.09255157e-01  2.99444041e-01  1.32638930e+00  1.82681684e-01
  3.91805359e-02  3.68550448e-01  6.24342614e-01  1.89113470e-02
 -7.17437080e-02 -5.63820909e-02 -6.84442313e-03 -8.75769746e-03
 -3.92900413e-01  7.05950112e-01 -4.97117974e-01 -2.10509734e-01
 -7.92016310e-01  1.37560521e+00 -3.04945790e-01 -8.60579894e-01
  1.24114865e-02 -4.49653094e-01 -5.10863975e-01 -5.08765692e-01
 -1.10707726e+00 -3.36244497e-01  1.13055178e+00 -4.27306457e-01
 -7.13466566e-01  2.90232497e-01 -5.04120513e-01 -1.04561725e+00
  2.83633808e-01 -4.18728298e-01  9.01682782e-02 -5.92165990e-01
 -1.39131244e-01]
Intercept: -18

'Amenities_data_cleaning.ipynbIntercept: 0.15030498871214715\nMean Squared Error: 0.10682822058977902\nR-squared Score: 0.893660413097256\n\nMean Squared Error: 0.1341701364928894\nR-squared Score: 0.8664435594773567\n\nR-squared Score: 0.8206842557831927\nMean Squared Error: 0.1817903029997421\nMean Absolute Error: 0.3217016858124708\n'

Check for Multicollinearity

In [31]:
from statsmodels.stats.outliers_influence import variance_inflation_factor # type: ignore

# Compute VIF for each feature
X_train_vif = X_train.copy()  # Ensure we work with a copy
vif_data = pd.DataFrame()
vif_data["Feature"] = X_train_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_vif.values, i) for i in range(X_train_vif.shape[1])]

vif_data

Unnamed: 0,Feature,VIF
0,storey_range,1.24079
1,floor_area_sqm,1.181057
2,remaining_lease,1.779705
3,nearest_mrt_distance,1.890704
4,nearest_bus_distance,1.053948
5,education_score,2.389122
6,shopping_score,2.534772
7,food_score,5.119843
8,recreation_score,3.062364
9,healthcare_score,6.323754


In [58]:
vif_data

Unnamed: 0,Feature,VIF
0,flat_type,21.410640
1,storey_range,1.268476
2,floor_area_sqm,15.247786
3,remaining_lease,316.782651
4,nearest_mrt_distance,1.933301
...,...,...
58,flat_model_Simplified,3.268270
59,flat_model_Standard,2.836043
60,flat_model_Terrace,1.082173
61,flat_model_Type S1,1.445011


Check for Interactive Effect

In [55]:
correlations = X_train.corrwith(y_train)
print(correlations.abs().sort_values(ascending=False))

floor_area_sqm          0.646445
flat_type               0.640392
storey_range            0.372191
building_age_2025       0.350866
remaining_lease         0.350195
                          ...   
town_GEYLANG            0.015010
flat_model_3Gen         0.010742
town_HOUGANG            0.008931
town_SENGKANG           0.005379
avg_household_income    0.003691
Length: 63, dtype: float64


In [56]:
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

# Create polynomial feature transformer (degree=2 for interaction terms)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Transform the dataset
X_train_interactions = poly.fit_transform(X_train)

# Create feature names
feature_names = poly.get_feature_names_out(X_train.columns)

# Convert back to DataFrame
X_train_interactions = pd.DataFrame(X_train_interactions, columns=feature_names)


In [54]:
# Compute correlation of interaction terms with target
interaction_correlations = X_train_interactions.corrwith(y_train)

# Sort by absolute correlation value
strong_interactions = interaction_correlations.abs().sort_values(ascending=False)

# Display top 10 most relevant interaction terms
print("Top 10 Strongest Interaction Terms:")
print(strong_interactions.head(10))


Top 10 Strongest Interaction Terms:
storey_range town_CHOA CHU KANG              0.011254
town_BISHAN flat_model_Adjoined flat         0.009512
remaining_lease nearest_bus_distance         0.008969
nearest_bus_distance building_age_2025       0.008896
education_score healthcare_score             0.008872
total_unemployment_rate town_TAMPINES        0.008850
resident_unemployment_rate town_TAMPINES     0.008792
nearest_bus_distance town_PASIR RIS          0.008695
floor_area_sqm fx_rate                       0.008174
town_QUEENSTOWN flat_model_New Generation    0.007437
dtype: float64
