In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

df = pd.read_csv('/Users/erica/Desktop/Y3S2/DSE3101/Local/reslae_price_normalized_for_ML.csv')

In [2]:
df = df.drop(columns=['flat_model', 'building_age_2025', 'total_unemployment_rate', 
                      'Chinese', 'Malays', 'Indians', 'Others', 'fx_rate', 'floor_area_sqm'])
df_normalized_clean = df.copy()

def normalize(col):
    return (col - col.min()) / (col.max() - col.min()) if col.max() != col.min() else col

df_normalized_clean['month'] = pd.to_datetime(df_normalized_clean['month'])
df_normalized_clean['year'] = df_normalized_clean['month'].dt.year.astype(float)
df_normalized_clean['month_num'] = df_normalized_clean['month'].dt.month.astype(float)
df_normalized_clean = df_normalized_clean.drop(columns=['month'])
df_normalized_clean = df_normalized_clean.drop(columns = ['CPI (base 2024-12)'])

columns_to_normalize = ['inflation_rate (x100)', 'interest_rate', 'priv_prop',
                        'resident_unemployment_rate', 'month_num', 'year']

df_normalized_clean[columns_to_normalize] = df_normalized_clean[columns_to_normalize].apply(normalize)


categorical_features = ['town']
numerical_features = [
    'storey_range', 'remaining_lease',
    'lat', 'lon', 'nearest_mrt_distance', 'nearest_bus_distance',
    'education_score', 'shopping_score', 'food_score', 'recreation_score',
    'healthcare_score', 'inflation_rate (x100)',
    'resident_unemployment_rate',
    'interest_rate', 'avg_household_income', 'priv_prop', 'flat_type'
]
numerical_features.extend(['year', 'month_num'])
demographic_features = [
    'NoReligion', 'Buddhism', 'Taoism1', 'Islam', 'Hinduism', 'Sikhism',
    'Christianity_Catholic', 'Christianity_OtherChristians', 'OtherReligions'
]
numerical_features.extend(demographic_features)  # Add to numerical pipeline

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

df_now = df_normalized_clean.copy()

In [3]:
dec_2024_values = df_now[(df_now['year'] == 1.0) & (df_now['month_num'] == 1.0)].iloc[0]
df_now['inflation_rate (x100)'] = dec_2024_values['inflation_rate (x100)']
df_now['resident_unemployment_rate'] = dec_2024_values['resident_unemployment_rate']
df_now['interest_rate'] = dec_2024_values['interest_rate']
df_now['avg_household_income'] = dec_2024_values['avg_household_income']
df_now['year'] = dec_2024_values['year']
df_now['month_num'] = dec_2024_values['month_num']

In [5]:
import joblib
X_now = df_now.drop(columns=['resale_price'])
loaded_model = joblib.load('/Users/erica/Desktop/Y3S2/DSE3101/Local/best_resale_price_model.pkl')
df_now['predicted_resale_price'] = loaded_model.predict(X_now)

In [7]:
df_original = pd.read_csv('../data/cleaned/resale_price_cleaned.csv')
df_original = df_original.dropna(subset=['resale_price'])

In [8]:
original_max = df_original['resale_price'].max()
original_min = df_original['resale_price'].min()
df_now['prediction_reverted'] = df_now['predicted_resale_price'] * (original_max - original_min) + original_min

In [9]:
original_flat_type_max = df_original['flat_type'].max()
original_flat_type_min = df_original['flat_type'].min()
df_now['flat_type_reverted'] = df_now['flat_type'] * (original_flat_type_max - original_flat_type_min) + original_flat_type_min

In [10]:
original_storey_range_max = df_original['storey_range'].max()
original_storey_range_min = df_original['storey_range'].min()
df_now['storey_range_reverted'] = df_now['storey_range'] * (original_storey_range_max - original_storey_range_min) + original_storey_range_min

In [11]:
original_remaining_lease_max = df_original['remaining_lease'].max()
original_remaining_lease_min = df_original['remaining_lease'].min()
df_now['remaining_lease_reverted'] = df_now['remaining_lease'] * (original_remaining_lease_max - original_remaining_lease_min) + original_remaining_lease_min

In [12]:
df_now = df_now[['town', 'lat', 'lon', 'prediction_reverted', 'flat_type_reverted', 'storey_range_reverted',
                 'remaining_lease_reverted']]

In [13]:
df_now['flat_type_reverted'] = df_now['flat_type_reverted'].round(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_now['flat_type_reverted'] = df_now['flat_type_reverted'].round(1)


In [14]:
flat_mapping = {
    '1 ROOM': 1.0,
    '2 ROOM': 2.0,
    '3 ROOM': 3.0,
    '4 ROOM': 4.0,
    '5 ROOM': 5.0,
    'MULTI-GENERATION': 6.0,
    'EXECUTIVE': 7.0
}

# Create reverse mapping
reverse_flat_mapping = {v: k for k, v in flat_mapping.items()}

# Example: Apply to a column named "flat_type"
# (Assuming "flat_type" currently contains values like 1, 2, 3...)
df_now['flat_type'] = df_now['flat_type_reverted'].map(reverse_flat_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_now['flat_type'] = df_now['flat_type_reverted'].map(reverse_flat_mapping)


In [15]:
def convert_storey_range(val):
    try:
        val = int(val)
        return f"{val} To {val + 2}"
    except:
        return None

df_now['storey_range'] = df_now['storey_range_reverted'].apply(convert_storey_range)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_now['storey_range'] = df_now['storey_range_reverted'].apply(convert_storey_range)


In [16]:
original_lat_max = df_original['lat'].max()
original_lat_min = df_original['lat'].min()
df_now['lat_reverted'] = df_now['lat'] * (original_lat_max - original_lat_min) + original_lat_min

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_now['lat_reverted'] = df_now['lat'] * (original_lat_max - original_lat_min) + original_lat_min


In [17]:
original_lon_max = df_original['lon'].max()
original_lon_min = df_original['lon'].min()
df_now['lon_reverted'] = df_now['lon'] * (original_lon_max - original_lon_min) + original_lon_min

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_now['lon_reverted'] = df_now['lon'] * (original_lon_max - original_lon_min) + original_lon_min


In [18]:
df_geo = pd.read_csv('../../Local/hdb_geospatial.csv')

In [19]:
df_merged = pd.merge(df_now, df_geo, left_on=['lat_reverted', 'lon_reverted'], right_on=['latitude', 'longitude'], how='inner')

In [20]:
df_merged = df_merged.drop(columns=['lat', 'lon', 'lat_reverted', 'lon_reverted', 'flat_type_reverted', 'storey_range_reverted'])

In [21]:
df_merged.dropna(inplace = True)

In [22]:
df_merged = df_merged.drop(columns = ['Unnamed: 0'])

In [26]:
df_merged.to_csv('/Users/erica/Desktop/Y3S2/DSE3101/Local/find_your_ideal_home_corrected.csv', index=False)

In [23]:
df_merged

Unnamed: 0,town,prediction_reverted,remaining_lease_reverted,flat_type,storey_range,latitude,longitude,nearest_mrt_distance,nearest_bus_distance,address,postal_code,nearest_mrt_name,nearest_bus_name
0,BUKIT MERAH,472037.224855,58.657819,3 ROOM,13 To 15,1.270380,103.823236,580.884909,100.653218,43 TELOK BLANGAH RISE,090043,HARBOURFRONT MRT STATION,BLK 41
1,BUKIT MERAH,468413.124490,58.500149,3 ROOM,7 To 9,1.270380,103.823236,580.884909,100.653218,43 TELOK BLANGAH RISE,090043,HARBOURFRONT MRT STATION,BLK 41
2,BUKIT MERAH,465087.112993,57.839908,3 ROOM,10 To 12,1.270380,103.823236,580.884909,100.653218,43 TELOK BLANGAH RISE,090043,HARBOURFRONT MRT STATION,BLK 41
3,BUKIT MERAH,420753.452627,57.514715,3 ROOM,1 To 3,1.270380,103.823236,580.884909,100.653218,43 TELOK BLANGAH RISE,090043,HARBOURFRONT MRT STATION,BLK 41
4,BUKIT MERAH,435095.460445,57.514715,3 ROOM,4 To 6,1.270380,103.823236,580.884909,100.653218,43 TELOK BLANGAH RISE,090043,HARBOURFRONT MRT STATION,BLK 41
...,...,...,...,...,...,...,...,...,...,...,...,...,...
197023,SEMBAWANG,558705.191472,89.945360,4 ROOM,1 To 3,1.456235,103.814292,1023.357587,39.265131,463C SEMBAWANG DR,753463,SEMBAWANG MRT STATION,OPP BLK 484A CP
197024,SEMBAWANG,558128.166497,89.866525,4 ROOM,1 To 3,1.456235,103.814292,1023.357587,39.265131,463C SEMBAWANG DR,753463,SEMBAWANG MRT STATION,OPP BLK 484A CP
197025,SEMBAWANG,563762.454834,89.866525,4 ROOM,4 To 6,1.456235,103.814292,1023.357587,39.265131,463C SEMBAWANG DR,753463,SEMBAWANG MRT STATION,OPP BLK 484A CP
197026,SEMBAWANG,589163.203063,89.866525,4 ROOM,10 To 12,1.456235,103.814292,1023.357587,39.265131,463C SEMBAWANG DR,753463,SEMBAWANG MRT STATION,OPP BLK 484A CP


In [24]:
df_merged['postal_code'].nunique()

9604

In [5]:
import geopandas as gpd
from shapely.geometry import shape, mapping
from bs4 import BeautifulSoup



In [None]:
gdf = gpd.read_file("/Users/erica/Desktop/Y3S2/DSE3101/Local/HDBExistingBuilding.geojson")
gdf['geometry'] = gdf['geometry'].apply(lambda geom: shape(mapping(geom)))


Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.8017 1.44919 0, 103.80169 1.44..."
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.75023 1.35897 0, 103.75033 1.3..."
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.95811 1.35344 0, 103.95812 1.3..."
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.74972 1.39435 0, 103.74967 1.3..."
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.81731 1.27887 0, 103.81735 1.2..."
...,...,...,...
12842,kml_12843,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.92823 1.33162 0, 103.92825 1.3..."
12843,kml_12844,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.95901 1.35865 0, 103.95901 1.3..."
12844,kml_12845,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.68964 1.34374 0, 103.68964 1.3..."
12845,kml_12846,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.74927 1.39407 0, 103.74932 1.3..."


In [None]:
gdf['Description_clean'] = gdf['Description'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())


Unnamed: 0,Name,Description,geometry,Description_clean
0,kml_1,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.8017 1.44919 0, 103.80169 1.44...",Attributes BLK_NO 780C ST_COD WOC05L ENTITYI...
1,kml_2,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.75023 1.35897 0, 103.75033 1.3...",Attributes BLK_NO 373 ST_COD BUS09S ENTITYID...
2,kml_3,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.95811 1.35344 0, 103.95812 1.3...",Attributes BLK_NO 328 ST_COD TAS39U ENTITYID...
3,kml_4,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.74972 1.39435 0, 103.74967 1.3...",Attributes BLK_NO 771A ST_COD CHS27B ENTITYI...
4,kml_5,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.81731 1.27887 0, 103.81735 1.2...",Attributes BLK_NO 3A ST_COD TEC01D ENTITYID ...
...,...,...,...,...
12842,kml_12843,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.92823 1.33162 0, 103.92825 1.3...",Attributes BLK_NO 553 ST_COD BEA05R ENTITYID...
12843,kml_12844,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.95901 1.35865 0, 103.95901 1.3...",Attributes BLK_NO 497D ST_COD TAS23F ENTITYI...
12844,kml_12845,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.68964 1.34374 0, 103.68964 1.3...",Attributes BLK_NO 943 ST_COD JUS13F ENTITYID...
12845,kml_12846,<center><table><tr><th colspan='2' align='cent...,"POLYGON Z ((103.74927 1.39407 0, 103.74932 1.3...",Attributes BLK_NO 767 ST_COD CHS27B ENTITYID...


In [8]:
gdf['postal_code'] = gdf['Description_clean'].str.extract(r'POSTAL_COD\s+(\d{6})')
gdf = gdf [['postal_code', 'Description_clean']]

In [10]:
gdf.to_csv('/Users/erica/Desktop/Y3S2/DSE3101/Local/HDBExistingBuilding_cleaned.csv', index=False)