In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Input table for the whole notebook (already normalised, judging by the file name).
# NOTE(review): absolute, machine-specific path — this only runs where the file exists.
normalized_csv_path = '/Users/erica/Desktop/Y3S2/DSE3101/Local/reslae_price_normalized_for_ML.csv'
df = pd.read_csv(normalized_csv_path)

In [6]:
# Columns removed up front, before any feature engineering happens.
unused_columns = [
    'flat_model', 'building_age_2025', 'total_unemployment_rate',
    'Chinese', 'Malays', 'Indians', 'Others', 'fx_rate', 'floor_area_sqm',
]
df = df.drop(columns=unused_columns)

# Later transformations act on this independent copy, leaving `df` as it is here.
df_normalized_clean = df.copy()

def normalize(col):
    """Min-max scale ``col`` so its smallest value maps to 0 and its largest to 1.

    A column whose minimum equals its maximum cannot be scaled (the range is
    zero) and is returned unchanged.
    """
    lowest, highest = col.min(), col.max()
    if highest == lowest:
        return col
    return (col - lowest) / (highest - lowest)

# Split the 'month' column into numeric year / month-of-year features.
month_as_dates = pd.to_datetime(df_normalized_clean['month'])
df_normalized_clean['year'] = month_as_dates.dt.year.astype(float)
df_normalized_clean['month_num'] = month_as_dates.dt.month.astype(float)

# The raw date and the CPI column are not carried forward.
df_normalized_clean = df_normalized_clean.drop(columns=['month', 'CPI (base 2024-12)'])

# Columns min-max scaled in this notebook via `normalize`.
columns_to_normalize = [
    'inflation_rate (x100)', 'interest_rate', 'priv_prop',
    'resident_unemployment_rate', 'month_num', 'year',
]
scaled_columns = df_normalized_clean[columns_to_normalize].apply(normalize)
df_normalized_clean[columns_to_normalize] = scaled_columns


# One-hot encoded input.
categorical_features = ['town']

# Columns named after religions; they are scaled together with the numeric inputs.
demographic_features = [
    'NoReligion', 'Buddhism', 'Taoism1', 'Islam', 'Hinduism', 'Sikhism',
    'Christianity_Catholic', 'Christianity_OtherChristians', 'OtherReligions',
]

# Everything routed through StandardScaler: flat attributes, location and
# amenity scores, macro indicators, the date parts, then the demographic columns.
numerical_features = [
    'storey_range', 'remaining_lease',
    'lat', 'lon', 'nearest_mrt_distance', 'nearest_bus_distance',
    'education_score', 'shopping_score', 'food_score', 'recreation_score',
    'healthcare_score', 'inflation_rate (x100)',
    'resident_unemployment_rate',
    'interest_rate', 'avg_household_income', 'priv_prop', 'flat_type',
    'year', 'month_num',
    *demographic_features,
]

# NOTE(review): no later cell shown here uses `preprocessor`; predictions come
# from a model loaded from disk — confirm whether this is still needed.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Independent working copy for the cells below.
df_now = df_normalized_clean.copy(deep=True)

In [7]:
# 'year' and 'month_num' were min-max scaled earlier, so 1.0 picks the latest
# year and the highest month number. The first matching row is the reference.
# NOTE(review): .iloc[0] raises IndexError if no row matches — confirm the data
# contains that period (December 2024, going by the variable name).
is_latest_period = df_now['year'].eq(1.0) & df_now['month_num'].eq(1.0)
dec_2024_values = df_now.loc[is_latest_period].iloc[0]

# Give every row the reference period's time-dependent values.
frozen_columns = [
    'inflation_rate (x100)',
    'resident_unemployment_rate',
    'interest_rate',
    'avg_household_income',
    'year',
    'month_num',
]
for frozen_column in frozen_columns:
    df_now[frozen_column] = dec_2024_values[frozen_column]

In [8]:
import joblib

# NOTE(review): absolute, machine-specific path; joblib files are pickles, so
# only load artefacts from a trusted source.
model_path = '/Users/erica/Desktop/Y3S2/DSE3101/Local/best_resale_price_model.pkl'
loaded_model = joblib.load(model_path)

# All columns except the target are passed to the model.
X_now = df_now.drop(columns=['resale_price'])
predicted_prices = loaded_model.predict(X_now)
df_now['predicted_resale_price'] = predicted_prices

In [9]:
# Original-scale data, used below only for each column's min and max.
# Rows without a resale price are discarded first.
df_original = (
    pd.read_csv('../data/cleaned/resale_price_cleaned.csv')
    .dropna(subset=['resale_price'])
)

In [10]:
# Map predictions back to the price scale: x * (max - min) + min.
# NOTE(review): assumes the target was min-max scaled with this same range — confirm.
resale_prices = df_original['resale_price']
original_min = resale_prices.min()
original_max = resale_prices.max()
price_span = original_max - original_min
df_now['prediction_reverted'] = df_now['predicted_resale_price'] * price_span + original_min

In [11]:
# Map flat_type back to its original scale: x * (max - min) + min.
flat_types_original = df_original['flat_type']
original_flat_type_min = flat_types_original.min()
original_flat_type_max = flat_types_original.max()
flat_type_span = original_flat_type_max - original_flat_type_min
df_now['flat_type_reverted'] = df_now['flat_type'] * flat_type_span + original_flat_type_min

In [12]:
# Map storey_range back to its original scale: x * (max - min) + min.
storey_ranges_original = df_original['storey_range']
original_storey_range_min = storey_ranges_original.min()
original_storey_range_max = storey_ranges_original.max()
storey_range_span = original_storey_range_max - original_storey_range_min
df_now['storey_range_reverted'] = df_now['storey_range'] * storey_range_span + original_storey_range_min

In [13]:
# Map remaining_lease back to its original scale: x * (max - min) + min.
remaining_leases_original = df_original['remaining_lease']
original_remaining_lease_min = remaining_leases_original.min()
original_remaining_lease_max = remaining_leases_original.max()
remaining_lease_span = original_remaining_lease_max - original_remaining_lease_min
df_now['remaining_lease_reverted'] = df_now['remaining_lease'] * remaining_lease_span + original_remaining_lease_min

In [14]:
# Keep only the location columns and the reverted (original-scale) values.
output_columns = [
    'town', 'lat', 'lon',
    'prediction_reverted', 'flat_type_reverted',
    'storey_range_reverted', 'remaining_lease_reverted',
]
df_now = df_now[output_columns]

In [15]:
# Round to one decimal place (presumably to strip floating-point noise left by the revert).
df_now['flat_type_reverted'] = df_now['flat_type_reverted'].round(decimals=1)

In [17]:
# Flat-type labels in code order: the first label has code 1.0, the last 7.0.
flat_type_labels = [
    '1 ROOM',
    '2 ROOM',
    '3 ROOM',
    '4 ROOM',
    '5 ROOM',
    'MULTI-GENERATION',
    'EXECUTIVE',
]
flat_mapping = {
    label: float(code) for code, label in enumerate(flat_type_labels, start=1)
}

# Inverse lookup: numeric code -> label.
reverse_flat_mapping = dict(zip(flat_mapping.values(), flat_mapping.keys()))

# Translate the reverted numeric codes into labels; codes that are not a key
# of the lookup become NaN.
df_now['flat_type'] = df_now['flat_type_reverted'].map(reverse_flat_mapping)

In [18]:
def convert_storey_range(val):
    """Turn a numeric storey-range start into a label such as ``"4 To 6"``.

    The value is rounded to the nearest integer before formatting. Reverting
    a scaled value can leave floating-point noise (e.g. 3.9999999999), and
    plain ``int()`` would truncate that to 3 and produce the wrong range.

    Args:
        val: Lowest storey of the range, as a number or numeric string.

    Returns:
        ``"<start> To <start + 2>"``, or ``None`` when ``val`` cannot be
        interpreted as a finite number (``None``, NaN, infinity, free text).
    """
    try:
        start = int(round(float(val)))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; unlike a bare ``except``,
        # KeyboardInterrupt and SystemExit still propagate.
        return None
    return f"{start} To {start + 2}"

# Build the storey-range labels from the reverted numeric values.
storey_range_codes = df_now['storey_range_reverted']
df_now['storey_range'] = storey_range_codes.apply(convert_storey_range)


In [19]:
# Map lat back to its original scale: x * (max - min) + min.
latitudes_original = df_original['lat']
original_lat_min = latitudes_original.min()
original_lat_max = latitudes_original.max()
lat_span = original_lat_max - original_lat_min
df_now['lat_reverted'] = df_now['lat'] * lat_span + original_lat_min

In [20]:
# Map lon back to its original scale: x * (max - min) + min.
longitudes_original = df_original['lon']
original_lon_min = longitudes_original.min()
original_lon_max = longitudes_original.max()
lon_span = original_lon_max - original_lon_min
df_now['lon_reverted'] = df_now['lon'] * lon_span + original_lon_min

In [21]:
# Geospatial table, joined to the predictions by coordinates in a later cell.
geospatial_csv_path = '../../Local/hdb_geospatial.csv'
df_geo = pd.read_csv(geospatial_csv_path)

In [22]:
# Join the predictions to the geospatial table on coordinates.
#
# lat_reverted / lon_reverted come from floating-point arithmetic, so they can
# differ from the stored latitude / longitude in the last few bits. An inner
# join on exact float equality would silently drop those rows, so both sides
# are matched on coordinates rounded to COORD_DECIMALS places instead.
COORD_DECIMALS = 6  # about 0.1 m if the coordinates are in degrees

predictions_keyed = df_now.assign(
    _lat_key=df_now['lat_reverted'].round(COORD_DECIMALS),
    _lon_key=df_now['lon_reverted'].round(COORD_DECIMALS),
)
geo_keyed = df_geo.assign(
    _lat_key=df_geo['latitude'].round(COORD_DECIMALS),
    _lon_key=df_geo['longitude'].round(COORD_DECIMALS),
)

# The temporary keys are dropped again, so the result has the same columns as
# a direct left_on/right_on merge would produce.
df_merged = (
    pd.merge(predictions_keyed, geo_keyed, on=['_lat_key', '_lon_key'], how='inner')
    .drop(columns=['_lat_key', '_lon_key'])
)

In [23]:
# Scaled and intermediate columns that are no longer needed after the join.
intermediate_columns = [
    'lat', 'lon',
    'lat_reverted', 'lon_reverted',
    'flat_type_reverted', 'storey_range_reverted',
]
df_merged = df_merged.drop(columns=intermediate_columns)

In [24]:
# Discard every row that still has a missing value in any column.
df_merged = df_merged.dropna()

In [25]:
# Remove the 'Unnamed: 0' column (raises KeyError if it is absent).
df_merged = df_merged.drop(columns='Unnamed: 0')

In [26]:
# Write the final table, without the index.
# NOTE(review): absolute, machine-specific output path.
output_csv_path = '/Users/erica/Desktop/Y3S2/DSE3101/Local/find_your_ideal_home_corrected.csv'
df_merged.to_csv(output_csv_path, index=False)