In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Input table for the model (described as "normalized for ML" by its file name).
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where that exact folder exists; the file name is also spelled "reslae" -
# confirm it matches the file on disk before touching it.
RESALE_ML_CSV_PATH = '/Users/erica/Desktop/Y3S2/DSE3101/Local/reslae_price_normalized_for_ML.csv'
df = pd.read_csv(RESALE_ML_CSV_PATH)

In [62]:
# Columns that are removed before any feature engineering happens.
unused_columns = [
    'flat_model', 'building_age_2025', 'total_unemployment_rate',
    'Chinese', 'Malays', 'Indians', 'Others', 'fx_rate', 'floor_area_sqm',
]
df = df.drop(columns=unused_columns)

# Independent copy: the following steps modify this frame, not `df`.
df_normalized_clean = df.copy()

def normalize(col):
    """Min-max scale ``col`` so its values span 0 to 1.

    Parameters
    ----------
    col
        Column-like object exposing ``min()``, ``max()`` and element-wise
        arithmetic (in this notebook it is called per column through
        ``DataFrame.apply``).

    Returns
    -------
    ``(col - min) / (max - min)``; when the maximum equals the minimum the
    input object itself is returned unchanged, which avoids a zero division.
    """
    lowest = col.min()
    highest = col.max()
    if highest == lowest:
        # Constant column: there is no range to scale by, hand it back as-is.
        return col
    return (col - lowest) / (highest - lowest)

# Split the 'month' column into separate numeric year / month-of-year features.
month_parsed = pd.to_datetime(df_normalized_clean['month'])
df_normalized_clean['year'] = month_parsed.dt.year.astype(float)
df_normalized_clean['month_num'] = month_parsed.dt.month.astype(float)

# The raw date column and the CPI column are not kept in the modelling table.
df_normalized_clean = df_normalized_clean.drop(columns=['month', 'CPI (base 2024-12)'])

# Columns that get min-max scaled with `normalize` (each column independently).
columns_to_normalize = [
    'inflation_rate (x100)',
    'interest_rate',
    'priv_prop',
    'resident_unemployment_rate',
    'month_num',
    'year',
]

scaled_columns = df_normalized_clean[columns_to_normalize].apply(normalize)
df_normalized_clean[columns_to_normalize] = scaled_columns

# Rows without a target value cannot be used; they are removed after scaling,
# so the min/max used above were computed with those rows still present.
df_normalized_clean.dropna(subset=['resale_price'], inplace=True)
# Columns that are one-hot encoded.
categorical_features = ['town']

# Religion columns; they are appended to the numeric feature list below.
demographic_features = [
    'NoReligion', 'Buddhism', 'Taoism1', 'Islam', 'Hinduism', 'Sikhism',
    'Christianity_Catholic', 'Christianity_OtherChristians', 'OtherReligions',
]

# Everything routed through the numeric pipeline, in a fixed order:
# flat/location/amenity/macro columns, then the time columns, then demographics.
numerical_features = [
    'storey_range', 'remaining_lease',
    'lat', 'lon', 'nearest_mrt_distance', 'nearest_bus_distance',
    'education_score', 'shopping_score', 'food_score', 'recreation_score',
    'healthcare_score', 'inflation_rate (x100)',
    'resident_unemployment_rate',
    'interest_rate', 'avg_household_income', 'priv_prop', 'flat_type',
    'year', 'month_num',
    *demographic_features,
]

# Numeric columns are standardized; 'town' is one-hot encoded.
# handle_unknown='ignore' lets transform() accept categories not seen in fit().
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [63]:
# Full estimator: preprocessing followed by a random forest.
# random_state is fixed so repeated fits on the same data are reproducible.
forest_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest_regressor),
])

In [64]:
# Separate scenario table: later cells overwrite columns in `df_now`, and the
# deep copy keeps `df_normalized_clean` (the training data) untouched.
df_now = df_normalized_clean.copy(deep=True)

In [65]:
# Pick the reference period: rows whose min-max scaled 'year' and 'month_num'
# both equal 1.0, i.e. the largest year combined with the largest month number
# present in the data (the variable name suggests December 2024 - confirm).
latest_period_mask = (df_now['year'] == 1.0) & (df_now['month_num'] == 1.0)
if not latest_period_mask.any():
    # Without this check `.iloc[0]` fails with an opaque "out-of-bounds" error,
    # e.g. when the latest year in the data has no row in the latest month.
    raise IndexError(
        "No row with year == 1.0 and month_num == 1.0 found in df_now; "
        "cannot determine the reference-period values."
    )
dec_2024_values = df_now.loc[latest_period_mask].iloc[0]

# Overwrite the macro-economic and time columns of every row with the values
# of that reference row, so all flats are scored under the same conditions.
columns_pinned_to_latest = [
    'inflation_rate (x100)',
    'resident_unemployment_rate',
    'interest_rate',
    'avg_household_income',
    'year',
    'month_num',
]
for pinned_column in columns_pinned_to_latest:
    df_now[pinned_column] = dec_2024_values[pinned_column]

In [66]:
# Target and feature table. Columns of X that are not listed in the
# preprocessor are simply not used by the model (ColumnTransformer default).
# NOTE(review): the model is fitted on every row and no hold-out split is made
# in this cell, so predictions on these same rows are in-sample.
y = df_normalized_clean['resale_price']
X = df_normalized_clean.drop(columns='resale_price')
model.fit(X, y)

In [67]:
# Score the scenario table; the prediction is on the same scale as the
# 'resale_price' column the model was trained on.
X_now = df_now.drop(columns='resale_price')
predicted_now = model.predict(X_now)
df_now['predicted_resale_price'] = predicted_now

In [68]:
# Second price table, read from a path relative to the notebook's working
# directory; rows without a resale price are discarded. Its 'resale_price'
# column supplies the min/max used later to revert the scaling.
df_original = (
    pd.read_csv('../data/cleaned/resale_price_cleaned.csv')
    .dropna(subset=['resale_price'])
)

In [70]:
# Invert the min-max scaling: value = min + scaled * (max - min).
# NOTE(review): assumes the scaled 'resale_price' used for training was produced
# with exactly this min and max of df_original['resale_price'] - confirm.
original_min = df_original['resale_price'].min()
original_max = df_original['resale_price'].max()
df_now['prediction_reverted'] = (
    original_min + df_now['predicted_resale_price'] * (original_max - original_min)
)

In [71]:
# Display the scenario table, including the 'predicted_resale_price' and
# 'prediction_reverted' columns added in the previous cells.
df_now

Unnamed: 0,town,flat_type,storey_range,remaining_lease,resale_price,lat,lon,nearest_mrt_distance,nearest_bus_distance,education_score,...,Hinduism,Sikhism,Christianity_Catholic,Christianity_OtherChristians,OtherReligions,priv_prop,year,month_num,predicted_resale_price,prediction_reverted
0,BUKIT MERAH,0.333333,0.2500,0.312121,0.131278,0.000000,0.456109,0.159033,0.226134,0.197110,...,0.038491,0.006792,0.062642,0.123019,0.003774,0.583962,1.0,1.0,0.218387,469600.733240
1,BUKIT MERAH,0.333333,0.1250,0.309346,0.131197,0.000000,0.456109,0.159033,0.226134,0.197110,...,0.038491,0.006792,0.062642,0.123019,0.003774,0.615417,1.0,1.0,0.215410,465276.784001
2,BUKIT MERAH,0.333333,0.0625,0.312121,0.127206,0.002888,0.454289,0.170487,0.124639,0.214251,...,0.038491,0.006792,0.062642,0.123019,0.003774,0.583962,1.0,1.0,0.198436,440624.412133
3,BUKIT MERAH,0.333333,0.1875,0.299116,0.171393,0.002888,0.454289,0.170487,0.124639,0.214251,...,0.038491,0.006792,0.062642,0.123019,0.003774,0.592284,1.0,1.0,0.219693,471497.528461
4,BUKIT MERAH,0.333333,0.1250,0.310733,0.190853,0.005513,0.415298,0.035276,0.242965,0.144639,...,0.038491,0.006792,0.062642,0.123019,0.003774,0.583962,1.0,1.0,0.203913,448578.548316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196981,SEMBAWANG,0.666667,0.0625,0.663256,0.263095,1.000000,0.429908,0.288495,0.270891,0.382806,...,0.053553,0.002490,0.054746,0.094928,0.002661,0.353128,1.0,1.0,0.270559,545377.700730
196982,SEMBAWANG,0.666667,0.1250,0.658921,0.277557,1.000000,0.429908,0.288495,0.270891,0.382806,...,0.053553,0.002490,0.054746,0.094928,0.002661,0.686015,1.0,1.0,0.290812,574792.199009
196983,SEMBAWANG,0.500000,0.2500,0.657534,0.240401,1.000000,0.429908,0.288495,0.270891,0.382806,...,0.053553,0.002490,0.054746,0.094928,0.002661,0.686015,1.0,1.0,0.255783,523915.624242
196984,SEMBAWANG,0.666667,0.2500,0.654586,0.329079,1.000000,0.429908,0.288495,0.270891,0.382806,...,0.053553,0.002490,0.054746,0.094928,0.002661,0.519712,1.0,1.0,0.308353,600269.439104
