In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Read the source CSV and immediately discard the columns that are not used further on.
# NOTE(review): the path is absolute and machine-specific.
df = pd.read_csv(
    '/Users/erica/Desktop/Y3S2/DSE3101/Local/reslae_price_normalized_for_ML.csv'
).drop(
    columns=[
        'flat_model',
        'building_age_2025',
        'total_unemployment_rate',
        'Chinese',
        'Malays',
        'Indians',
        'Others',
        'fx_rate',
        'floor_area_sqm',
    ]
)

# Independent working copy; `df` itself is left as loaded-and-trimmed.
df_normalized_clean = df.copy()
def normalize(col):
    """Min-max rescale ``col`` so its smallest value maps to 0 and its largest to 1.

    A column whose minimum equals its maximum is returned unchanged, which
    avoids dividing by a zero range.
    """
    lowest, highest = col.min(), col.max()
    if highest == lowest:
        return col
    return (col - lowest) / (highest - lowest)

# Parse 'month' as a datetime, derive float 'year' and 'month_num' columns from it,
# then drop the parsed 'month' column together with the CPI column.
df_normalized_clean = (
    df_normalized_clean
    .assign(
        month=lambda frame: pd.to_datetime(frame['month']),
        year=lambda frame: frame['month'].dt.year.astype(float),
        month_num=lambda frame: frame['month'].dt.month.astype(float),
    )
    .drop(columns=['month', 'CPI (base 2024-12)'])
)

# Columns that are min-max rescaled with `normalize` (one column at a time).
columns_to_normalize = [
    'inflation_rate (x100)',
    'interest_rate',
    'priv_prop',
    'resident_unemployment_rate',
    'month_num',
    'year',
]

df_normalized_clean[columns_to_normalize] = (
    df_normalized_clean[columns_to_normalize].apply(normalize)
)

# Column groups fed to the ColumnTransformer. The order of `numerical_features`
# determines the order of the scaled columns in the transformed matrix.
categorical_features = ['town']
demographic_features = [
    'NoReligion', 'Buddhism', 'Taoism1', 'Islam', 'Hinduism', 'Sikhism',
    'Christianity_Catholic', 'Christianity_OtherChristians', 'OtherReligions',
]
numerical_features = [
    'storey_range', 'remaining_lease',
    'lat', 'lon', 'nearest_mrt_distance', 'nearest_bus_distance',
    'education_score', 'shopping_score', 'food_score', 'recreation_score',
    'healthcare_score', 'inflation_rate (x100)',
    'resident_unemployment_rate',
    'interest_rate', 'avg_household_income', 'priv_prop', 'flat_type',
    # calendar features derived from the sale month
    'year', 'month_num',
    # demographic shares go through the numerical pipeline as well
    *demographic_features,
]

# Standardise the numeric columns and one-hot encode 'town'; categories not seen
# during fit are ignored at transform time (handle_unknown='ignore').
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Baseline pipeline: preprocessing followed by a 100-tree random forest (seed 42).
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])

In [2]:
# Reproducible 20% random sample of the prepared frame (fixed seed 42); this is the
# data the grid-search cell splits and fits on — presumably to keep the search tractable.
df_sample = df_normalized_clean.sample(frac=0.2, random_state=42)

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Separate the target from the predictors and hold out 20% of the sampled rows.
X = df_sample.drop(columns=['resale_price'])
y = df_sample['resale_price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyper-parameter grid for the 'regressor' step: 2 x 3 x 3 = 18 candidates.
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [10, 20, 30],
    'regressor__min_samples_leaf': [1, 2, 4],
}

# 3-fold search tracking three metrics; the refit model is the one with the best MAE.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring={
        'MAE': 'neg_mean_absolute_error',
        'RMSE': 'neg_root_mean_squared_error',
        'R2': 'r2'
    },
    refit='MAE',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# MAE and RMSE are stored negated by the 'neg_*' scorers, so flip the sign for display.
print("Best parameters:", grid_search.best_params_)
print("Best CV MAE:", -grid_search.cv_results_['mean_test_MAE'][grid_search.best_index_])
print("Best CV RMSE:", -grid_search.cv_results_['mean_test_RMSE'][grid_search.best_index_])
print("Best CV R2:", grid_search.cv_results_['mean_test_R2'][grid_search.best_index_])


Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=100; total time=  15.0s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=100; total time=  15.0s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=100; total time=  15.1s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=2, regressor__n_estimators=100; total time=  15.1s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=2, regressor__n_estimators=100; total time=  15.2s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=2, regressor__n_estimators=100; total time=  15.0s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=200; total time=  30.1s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=200; total time=  30.2s
[CV] END regressor__max_depth=10, r

In [2]:
# Pipeline with fixed hyper-parameters: 200 trees, depth capped at 30, leaves of at
# least one sample, all cores, seed 42. It reuses the same `preprocessor` object.
best_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'regressor',
            RandomForestRegressor(
                n_estimators=200,
                max_depth=30,
                min_samples_leaf=1,
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]
)

In [3]:
import joblib
# Target and predictors taken from the complete prepared frame (not the 20% sample).
y_full = df_normalized_clean['resale_price']
X_full = df_normalized_clean.drop('resale_price', axis=1)

In [4]:
# Fit the pipeline on every row of the prepared frame (no hold-out at this point).
best_model.fit(X_full, y_full)
# Saving the fitted pipeline is currently disabled; uncomment the line below to write it
# to 'best_resale_price_model.pkl' in the working directory.
#joblib.dump(best_model, 'best_resale_price_model.pkl')

KeyboardInterrupt: 

In [13]:
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

# Define custom RMSE scorer
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def safe_mape(y_true, y_pred):
    """Mean absolute percentage error that skips observations whose true value is 0.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values; both are converted with ``np.array``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the entries where
        ``y_true != 0``. NaN when no entry qualifies (empty input or every
        true value equal to zero).
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mask = y_true != 0
    if not mask.any():
        # Nothing to average: return NaN explicitly instead of letting np.mean
        # emit a RuntimeWarning on an empty slice.
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

# Metrics collected during cross-validation: built-in R² plus three wrapped callables
scoring = dict(
    r2='r2',
    mae=make_scorer(mean_absolute_error),
    rmse=make_scorer(rmse),
    mape=make_scorer(safe_mape),
)

# Shuffled 5-fold splitter with a fixed seed
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Hold out 20% of the full dataset; this rebinds X_train / X_test / y_train / y_test
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42
)


In [14]:
# Cross-validate the tuned pipeline on the training split only
cv_results = cross_validate(
    estimator=best_model,
    X=X_train,
    y=y_train,
    cv=cv,
    scoring=scoring,
)

# Report each metric averaged over the folds
print(f"Average R²:   {cv_results['test_r2'].mean():.4f}")
print(f"Average MAE:  {cv_results['test_mae'].mean():.4f}")
print(f"Average RMSE: {cv_results['test_rmse'].mean():.4f}")
print(f"Average MAPE: {cv_results['test_mape'].mean():.4f}")


Average R²:   0.9670
Average MAE:  0.0164
Average RMSE: 0.0235
Average MAPE: 6.7128


In [15]:
# Refit on the training split, then predict the held-out test split
y_pred_test = best_model.fit(X_train, y_train).predict(X_test)

print("\n🧪 Final Evaluation on Held-Out Test Set:")
print(f"  R²:   {r2_score(y_true=y_test, y_pred=y_pred_test):.4f}")
print(f"  MAE:  {mean_absolute_error(y_true=y_test, y_pred=y_pred_test):.4f}")
print(f"  RMSE: {rmse(y_true=y_test, y_pred=y_pred_test):.4f}")
print(f"  MAPE: {safe_mape(y_true=y_test, y_pred=y_pred_test):.4f}")


🧪 Final Evaluation on Held-Out Test Set:
  R²:   0.9681
  MAE:  0.0161
  RMSE: 0.0230
  MAPE: 6.4150


In [5]:
# Load a previously saved pipeline from disk into `bestmodel` (a separate name from the
# `best_model` pipeline built in this notebook).
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted
# source; the absolute path is also machine-specific.
bestmodel = joblib.load('/Users/erica/Desktop/Y3S2/DSE3101/Local/best_resale_price_model.pkl')

In [10]:
# Inspect the loaded pipeline: prints its steps keyed by step name.
print(bestmodel.named_steps)

{'preprocessor': ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['storey_range', 'remaining_lease', 'lat',
                                  'lon', 'nearest_mrt_distance',
                                  'nearest_bus_distance', 'education_score',
                                  'shopping_score', 'food_score',
                                  'recreation_score', 'healthcare_score',
                                  'inflation_rate (x100)',
                                  'resident_unemployment_rate', 'interest_rate',
                                  'avg_household_income', 'priv_prop',
                                  'flat_type', 'year', 'month_num',
                                  'NoReligion', 'Buddhism', 'Taoism1', 'Islam',
                                  'Hinduism', 'Sikhism',
                                  'Christianity_Catholic',
                                  'Christianity_OtherChristians',
                          

In [11]:
import pandas as pd

# Output column names of the fitted preprocessor and the forest's importance scores;
# the two are paired positionally below.
feature_names = bestmodel.named_steps['preprocessor'].get_feature_names_out()
importances = bestmodel.named_steps['regressor'].feature_importances_

# One row per transformed feature, ranked from most to least important
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values('importance', ascending=False)

print(importance_df)


                              feature  importance
16                     num__flat_type    0.455028
1                num__remaining_lease    0.137592
8                     num__food_score    0.104805
19                    num__NoReligion    0.059625
17                          num__year    0.037338
26  num__Christianity_OtherChristians    0.032084
2                            num__lat    0.030482
0                   num__storey_range    0.025742
20                      num__Buddhism    0.016226
4           num__nearest_mrt_distance    0.015881
3                            num__lon    0.014628
9               num__recreation_score    0.010955
10              num__healthcare_score    0.009691
7                 num__shopping_score    0.007427
6                num__education_score    0.005813
13                 num__interest_rate    0.005541
25         num__Christianity_Catholic    0.003776
35             cat__town_CENTRAL AREA    0.003488
5           num__nearest_bus_distance    0.003128
