In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the resale-price dataset used for modelling (already normalized, judging by the file name).
# NOTE(review): absolute, machine-specific path — the notebook cannot run on another machine as is.
# NOTE(review): the file name is spelled 'reslae'; presumably this matches the file on disk — verify.
df = pd.read_csv('/Users/erica/Desktop/Y3S2/DSE3101/Local/reslae_price_normalized_for_ML.csv')

In [2]:
# Discard the columns that are not used downstream, then continue on an independent
# copy so that the feature engineering below never touches `df` itself.
df = df.drop(
    [
        'flat_model',
        'building_age_2025',
        'total_unemployment_rate',
        'Chinese',
        'Malays',
        'Indians',
        'Others',
        'fx_rate',
        'floor_area_sqm',
    ],
    axis='columns',
)
df_normalized_clean = df.copy()

def normalize(col):
    """Min-max scale ``col`` so its smallest value maps to 0 and its largest to 1.

    A column whose minimum equals its maximum is returned unchanged, which avoids
    a division by zero.
    """
    lowest = col.min()
    highest = col.max()
    if highest == lowest:
        return col
    return (col - lowest) / (highest - lowest)

# Parse the 'month' column as a timestamp, derive float 'year' and 'month_num'
# features from it, then drop the raw timestamp together with the CPI column.
df_normalized_clean = (
    df_normalized_clean
    .assign(
        month=lambda frame: pd.to_datetime(frame['month']),
        year=lambda frame: frame['month'].dt.year.astype(float),
        month_num=lambda frame: frame['month'].dt.month.astype(float),
    )
    .drop(columns=['month', 'CPI (base 2024-12)'])
)

# Columns passed through `normalize` (min-max scaling) below.
columns_to_normalize = [
    'inflation_rate (x100)',
    'interest_rate',
    'priv_prop',
    'resident_unemployment_rate',
    'month_num',
    'year',
]

df_normalized_clean[columns_to_normalize] = df_normalized_clean[columns_to_normalize].apply(normalize)


# Column groups consumed by the ColumnTransformer defined at the end of this cell.
categorical_features = ['town']

# Demographic share columns; they go through the numeric pipeline like every other number.
demographic_features = [
    'NoReligion', 'Buddhism', 'Taoism1', 'Islam', 'Hinduism', 'Sikhism',
    'Christianity_Catholic', 'Christianity_OtherChristians', 'OtherReligions'
]

# Order: base features, then the calendar features, then the demographic columns.
numerical_features = [
    'storey_range', 'remaining_lease',
    'lat', 'lon', 'nearest_mrt_distance', 'nearest_bus_distance',
    'education_score', 'shopping_score', 'food_score', 'recreation_score',
    'healthcare_score', 'inflation_rate (x100)',
    'resident_unemployment_rate',
    'interest_rate', 'avg_household_income', 'priv_prop', 'flat_type',
    'year', 'month_num',
    *demographic_features,
]

# Standardize the numeric columns and one-hot encode the town; categories that were
# not seen during fitting are ignored at transform time (handle_unknown='ignore').
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [3]:
# End-to-end estimator: preprocessing followed by a random forest with a fixed seed
# so that repeated fits on the same data build the same trees.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators=100)),
])

In [4]:
# Independent copy of the engineered frame; later cells overwrite some of its columns
# without affecting `df_normalized_clean`, which is used for training.
df_now = df_normalized_clean.copy()

In [5]:
# Reference row: the first record whose scaled 'year' and 'month_num' are both 1.0,
# i.e. the maximum year and maximum month present after min-max scaling.
# NOTE(review): the variable name implies this is December 2024 — verify against the data.
dec_2024_values = df_now.loc[df_now['year'].eq(1.0) & df_now['month_num'].eq(1.0)].iloc[0]

# Freeze the macro-economic and calendar columns of every row at the reference values.
df_now = df_now.assign(**{
    column: dec_2024_values[column]
    for column in (
        'inflation_rate (x100)',
        'resident_unemployment_rate',
        'interest_rate',
        'avg_household_income',
        'year',
        'month_num',
    )
})

In [6]:
# Split the target from the features and fit the pipeline on every available row.
# NOTE(review): no hold-out split is made here, so this cell gives no out-of-sample error estimate.
y = df_normalized_clean['resale_price']
X = df_normalized_clean.drop('resale_price', axis='columns')
model.fit(X, y)

In [7]:
# Predict the (still scaled) resale price for every row of `df_now` and store it
# as a new 'predicted_resale_price' column.
X_now = df_now.drop('resale_price', axis='columns')
df_now = df_now.assign(predicted_resale_price=model.predict(X_now))

In [8]:
# Load the cleaned, un-normalized data and keep only rows that have a resale price;
# its column minima/maxima are used below to undo the min-max scaling.
df_original = (
    pd.read_csv('../data/cleaned/resale_price_cleaned.csv')
    .dropna(subset=['resale_price'])
)

In [9]:
# Bring the scaled predictions back to the original price scale by inverting
# min-max scaling: value = scaled * (max - min) + min.
# NOTE(review): assumes the scaling was done with the min/max of this cleaned file — confirm.
original_min = df_original['resale_price'].min()
original_max = df_original['resale_price'].max()
df_now['prediction_reverted'] = (
    df_now['predicted_resale_price']
    .mul(original_max - original_min)
    .add(original_min)
)

In [10]:
# Invert the min-max scaling of 'flat_type': value = scaled * (max - min) + min.
# NOTE(review): assumes the scaling was done with the min/max of `df_original` — confirm.
original_flat_type_min = df_original['flat_type'].min()
original_flat_type_max = df_original['flat_type'].max()
df_now['flat_type_reverted'] = (
    df_now['flat_type']
    .mul(original_flat_type_max - original_flat_type_min)
    .add(original_flat_type_min)
)

In [11]:
# Invert the min-max scaling of 'storey_range': value = scaled * (max - min) + min.
# NOTE(review): assumes the scaling was done with the min/max of `df_original` — confirm.
original_storey_range_min = df_original['storey_range'].min()
original_storey_range_max = df_original['storey_range'].max()
df_now['storey_range_reverted'] = (
    df_now['storey_range']
    .mul(original_storey_range_max - original_storey_range_min)
    .add(original_storey_range_min)
)

In [12]:
# Invert the min-max scaling of 'remaining_lease': value = scaled * (max - min) + min.
# NOTE(review): assumes the scaling was done with the min/max of `df_original` — confirm.
original_remaining_lease_min = df_original['remaining_lease'].min()
original_remaining_lease_max = df_original['remaining_lease'].max()
df_now['remaining_lease_reverted'] = (
    df_now['remaining_lease']
    .mul(original_remaining_lease_max - original_remaining_lease_min)
    .add(original_remaining_lease_min)
)

In [14]:
# Keep only the columns needed for the final output.
# Later cells add new columns to this frame, so take an explicit copy: assigning into
# a column-subset of another frame is what triggers pandas' SettingWithCopyWarning.
df_now = df_now[['town', 'lat', 'lon', 'prediction_reverted', 'flat_type_reverted', 'storey_range_reverted',
                 'remaining_lease_reverted']].copy()

In [15]:
# Display the trimmed frame (rendered as the cell output).
df_now

Unnamed: 0,town,lat,lon,prediction_reverted,flat_type_reverted,storey_range_reverted,remaining_lease_reverted
0,BUKIT MERAH,0.000000,0.456109,469600.733240,3.0,13.0,58.657819
1,BUKIT MERAH,0.000000,0.456109,465276.784001,3.0,7.0,58.500149
2,BUKIT MERAH,0.002888,0.454289,440624.412133,3.0,4.0,58.657819
3,BUKIT MERAH,0.002888,0.454289,471497.528461,3.0,10.0,57.918743
4,BUKIT MERAH,0.005513,0.415298,448578.548316,3.0,7.0,58.578984
...,...,...,...,...,...,...,...
196981,SEMBAWANG,1.000000,0.429908,545377.700730,5.0,4.0,78.612865
196982,SEMBAWANG,1.000000,0.429908,574792.199009,5.0,7.0,78.366506
196983,SEMBAWANG,1.000000,0.429908,523915.624242,4.0,13.0,78.287671
196984,SEMBAWANG,1.000000,0.429908,600269.439104,5.0,13.0,78.120147


In [16]:
# Numeric code for each flat-type label.
# NOTE(review): presumably the same encoding that produced the numeric 'flat_type' column — verify.
flat_mapping = {
    '1 ROOM': 1.0,
    '2 ROOM': 2.0,
    '3 ROOM': 3.0,
    '4 ROOM': 4.0,
    '5 ROOM': 5.0,
    'MULTI-GENERATION': 6.0,
    'EXECUTIVE': 7.0
}

# Inverse lookup: numeric code -> label.
reverse_flat_mapping = {v: k for k, v in flat_mapping.items()}

# Translate the reverted numeric codes in 'flat_type_reverted' back into labels.
# Those codes come out of floating-point arithmetic, so a code such as 3.0 can surface
# as 2.9999999999999996; an exact dictionary lookup would then produce NaN.
# Rounding to 6 decimals removes that noise while still leaving genuinely
# off-grid values unmapped (NaN) instead of guessing a label for them.
df_now['flat_type'] = df_now['flat_type_reverted'].round(6).map(reverse_flat_mapping)

In [18]:
def convert_storey_range(val):
    """Format a storey-range start value as a ``"<start> To <start + 2>"`` label.

    The value is rounded to 6 decimals before truncation so that floating-point
    noise (e.g. 12.999999999999998 instead of 13.0) does not shift the label
    down by one storey.

    Args:
        val: Number (or numeric string) giving the first storey of the range.

    Returns:
        The label string, or None when ``val`` is missing, NaN, infinite or
        otherwise not convertible to a number.
    """
    try:
        start = int(round(float(val), 6))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no label"; anything else propagates.
        return None
    return f"{start} To {start + 2}"

# Build the human-readable storey-range label for every row from its reverted numeric value.
df_now['storey_range'] = df_now['storey_range_reverted'].apply(convert_storey_range)


In [28]:
# Invert the min-max scaling of 'lat': value = scaled * (max - min) + min.
# The operand order is kept as-is because the result is later used as an exact join key.
original_lat_min = df_original['lat'].min()
original_lat_max = df_original['lat'].max()
df_now['lat_reverted'] = (
    df_now['lat']
    .mul(original_lat_max - original_lat_min)
    .add(original_lat_min)
)

In [29]:
# Invert the min-max scaling of 'lon': value = scaled * (max - min) + min.
# The operand order is kept as-is because the result is later used as an exact join key.
original_lon_min = df_original['lon'].min()
original_lon_max = df_original['lon'].max()
df_now['lon_reverted'] = (
    df_now['lon']
    .mul(original_lon_max - original_lon_min)
    .add(original_lon_min)
)

In [36]:
# Display the frame with the label and reverted-coordinate columns added above.
df_now

Unnamed: 0,town,lat,lon,prediction_reverted,flat_type_reverted,storey_range_reverted,remaining_lease_reverted,flat_type,storey_range,lat_reverted,lon_reverted
0,BUKIT MERAH,0.000000,0.456109,469600.733240,3.0,13.0,58.657819,3 ROOM,13 To 15,1.270380,103.823236
1,BUKIT MERAH,0.000000,0.456109,465276.784001,3.0,7.0,58.500149,3 ROOM,7 To 9,1.270380,103.823236
2,BUKIT MERAH,0.002888,0.454289,440624.412133,3.0,4.0,58.657819,3 ROOM,4 To 6,1.270919,103.822685
3,BUKIT MERAH,0.002888,0.454289,471497.528461,3.0,10.0,57.918743,3 ROOM,10 To 12,1.270919,103.822685
4,BUKIT MERAH,0.005513,0.415298,448578.548316,3.0,7.0,58.578984,3 ROOM,7 To 9,1.271409,103.810888
...,...,...,...,...,...,...,...,...,...,...,...
196981,SEMBAWANG,1.000000,0.429908,545377.700730,5.0,4.0,78.612865,5 ROOM,4 To 6,1.457071,103.815308
196982,SEMBAWANG,1.000000,0.429908,574792.199009,5.0,7.0,78.366506,5 ROOM,7 To 9,1.457071,103.815308
196983,SEMBAWANG,1.000000,0.429908,523915.624242,4.0,13.0,78.287671,4 ROOM,13 To 15,1.457071,103.815308
196984,SEMBAWANG,1.000000,0.429908,600269.439104,5.0,13.0,78.120147,5 ROOM,13 To 15,1.457071,103.815308


In [37]:
# Load the per-block geospatial lookup table.
# NOTE(review): this relative path depends on the working directory, while other cells use absolute paths — confirm.
df_geo = pd.read_csv('../../Local/hdb_geospatial.csv')

In [38]:
# Display the geospatial lookup table (rendered as the cell output).
df_geo

Unnamed: 0.1,Unnamed: 0,latitude,longitude,nearest_mrt_distance,nearest_bus_distance,address,postal_code,nearest_mrt_name,nearest_bus_name
0,0,1.270380,103.823236,580.884909,100.653218,43 TELOK BLANGAH RISE,090043,HARBOURFRONT MRT STATION,BLK 41
1,1,1.270919,103.822685,621.123943,62.405683,42 TELOK BLANGAH RISE,090042,HARBOURFRONT MRT STATION,BLK 41
2,2,1.271409,103.810888,146.093830,106.993404,44 TELOK BLANGAH DR,100044,TELOK BLANGAH MRT STATION,TELOK BLANGAH STN
3,3,1.271463,103.825683,815.989388,105.693373,101 BT PURMEI RD,090101,HARBOURFRONT MRT STATION,THE PEARL /@ MT FABER
4,4,1.271691,103.809852,104.869176,69.173326,46 TELOK BLANGAH DR,100046,TELOK BLANGAH MRT STATION,OPP TELOK BLANGAH STN
...,...,...,...,...,...,...,...,...,...
9619,9619,1.456235,103.814292,1023.357587,39.265131,463C SEMBAWANG DR,753463,SEMBAWANG MRT STATION,OPP BLK 484A CP
9620,9620,1.456425,103.815858,942.929864,117.987780,485 ADMIRALTY LINK,750485,SEMBAWANG MRT STATION,Blk 483
9621,9621,1.456474,103.817181,884.749514,111.206094,492 ADMIRALTY LINK,750492,SEMBAWANG MRT STATION,Blk 491
9622,9622,1.456546,103.816764,909.789781,157.794168,493 ADMIRALTY LINK,750493,SEMBAWANG MRT STATION,Blk 491


In [39]:
# Attach the columns of `df_geo` (address, postal code, nearest MRT / bus stop, ...)
# by matching the reverted coordinates exactly against its latitude / longitude.
# NOTE(review): the join keys are floats rebuilt through arithmetic; a row whose
# coordinates differ from `df_geo` in the last bits is silently dropped by the
# inner join — compare row counts before and after to confirm nothing is lost.
df_merged = df_now.merge(
    df_geo,
    how='inner',
    left_on=['lat_reverted', 'lon_reverted'],
    right_on=['latitude', 'longitude'],
)

In [None]:
# Remove the scaled / intermediate helper columns; the merged frame keeps
# 'latitude' / 'longitude' and the human-readable 'flat_type' / 'storey_range' labels.
# NOTE(review): re-running this cell raises a KeyError because the columns are already gone.
df_merged = df_merged.drop(columns=['lat', 'lon', 'lat_reverted', 'lon_reverted', 'flat_type_reverted', 'storey_range_reverted'])

In [42]:
# Drop every row that has a missing value in any column (this includes rows whose
# flat type or storey range could not be turned back into a label).
# Reassign rather than use inplace=True: the result is the same, but the frame is
# not mutated behind any other reference to it.
df_merged = df_merged.dropna()

In [45]:
# Drop the 'Unnamed: 0' column that came in with `df_geo` (presumably a saved index column — verify).
df_merged = df_merged.drop(columns = ['Unnamed: 0'])

In [46]:
# Display the final merged frame (rendered as the cell output).
df_merged

Unnamed: 0,town,prediction_reverted,remaining_lease_reverted,flat_type,storey_range,latitude,longitude,nearest_mrt_distance,nearest_bus_distance,address,postal_code,nearest_mrt_name,nearest_bus_name
0,BUKIT MERAH,469600.733240,58.657819,3 ROOM,13 To 15,1.270380,103.823236,580.884909,100.653218,43 TELOK BLANGAH RISE,090043,HARBOURFRONT MRT STATION,BLK 41
1,BUKIT MERAH,465276.784001,58.500149,3 ROOM,7 To 9,1.270380,103.823236,580.884909,100.653218,43 TELOK BLANGAH RISE,090043,HARBOURFRONT MRT STATION,BLK 41
2,BUKIT MERAH,470350.236403,57.839908,3 ROOM,10 To 12,1.270380,103.823236,580.884909,100.653218,43 TELOK BLANGAH RISE,090043,HARBOURFRONT MRT STATION,BLK 41
3,BUKIT MERAH,426223.917098,57.514715,3 ROOM,1 To 3,1.270380,103.823236,580.884909,100.653218,43 TELOK BLANGAH RISE,090043,HARBOURFRONT MRT STATION,BLK 41
4,BUKIT MERAH,441590.377206,57.514715,3 ROOM,4 To 6,1.270380,103.823236,580.884909,100.653218,43 TELOK BLANGAH RISE,090043,HARBOURFRONT MRT STATION,BLK 41
...,...,...,...,...,...,...,...,...,...,...,...,...,...
197023,SEMBAWANG,559855.280263,89.945360,4 ROOM,1 To 3,1.456235,103.814292,1023.357587,39.265131,463C SEMBAWANG DR,753463,SEMBAWANG MRT STATION,OPP BLK 484A CP
197024,SEMBAWANG,559398.316363,89.866525,4 ROOM,1 To 3,1.456235,103.814292,1023.357587,39.265131,463C SEMBAWANG DR,753463,SEMBAWANG MRT STATION,OPP BLK 484A CP
197025,SEMBAWANG,563182.081838,89.866525,4 ROOM,4 To 6,1.456235,103.814292,1023.357587,39.265131,463C SEMBAWANG DR,753463,SEMBAWANG MRT STATION,OPP BLK 484A CP
197026,SEMBAWANG,585408.426727,89.866525,4 ROOM,10 To 12,1.456235,103.814292,1023.357587,39.265131,463C SEMBAWANG DR,753463,SEMBAWANG MRT STATION,OPP BLK 484A CP


In [47]:
# Write the final table to CSV without the index column.
# NOTE(review): absolute, machine-specific output path — the notebook cannot run on another machine as is.
df_merged.to_csv('/Users/erica/Desktop/Y3S2/DSE3101/Local/find_your_ideal_home.csv', index=False)