In [1]:
import json
import os
from math import radians, cos, sin, sqrt, atan2

import geopandas as gpd
import pandas as pd
import requests
from shapely.geometry import Point, shape

In [11]:
# Load the cleaned resale-price table that the normalisation bounds are computed from.
df_resale = pd.read_csv("../data/cleaned/resale_price_cleaned.csv")

In [12]:
# Drop the "town" label column. errors="ignore" keeps this cell idempotent: re-running it
# after the column has already been removed no longer raises a KeyError.
df_resale = df_resale.drop(columns=["town"], errors="ignore")

### Min-max normalization

In [13]:
# Per-column bounds of the resale table, kept so inputs can be min-max scaled
# and predictions can be mapped back to the original price range later.
min_max_dict = dict(
    (col, {"min": df_resale[col].min(), "max": df_resale[col].max()})
    for col in df_resale.columns
)


In [14]:
def normalize_column(value, col):
    """Min-max scale ``value`` with the bounds stored for column ``col``.

    Args:
        value: Raw value to scale.
        col: Key into ``min_max_dict`` naming the column whose bounds apply.

    Returns:
        ``(value - min) / (max - min)``, or ``0`` when the column is constant
        (min equals max), which avoids a division by zero.
    """
    bounds = min_max_dict[col]
    low, high = bounds["min"], bounds["max"]
    return 0 if high == low else (value - low) / (high - low)


In [15]:
# OneMap API access token, sent as the Authorization header of OneMap requests.
# It is read from the ONEMAP_TOKEN environment variable so that the credential never
# lives in the notebook; an empty string is used when the variable is not set.
# NOTE(review): the token that was previously hardcoded here has been committed in
# plain text and should be treated as leaked (rotate/revoke it).
ONEMAP_TOKEN = os.environ.get("ONEMAP_TOKEN", "")

### Get town, latitude/longitude, and nearest MRT and bus-stop distances

In [16]:
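# Planning-area polygons: one-off download from the OneMap API. Kept commented out because
# the response is written to ../data/raw/planning_area.json, which the next cell loads.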
# def load_planning_area_polygons():
#     url = "https://www.onemap.gov.sg/api/public/popapi/getAllPlanningarea?year=2024"
#     headers = {"Authorization": ONEMAP_TOKEN}
#     response = requests.get(url,headers=headers)
#     return response.json()

# planning_area = load_planning_area_polygons()
# with open('../data/raw/planning_area.json', 'w') as f:
#     json.dump(planning_area, f, indent=2)


In [17]:
# Load the saved planning-area payload and keep only its 'SearchResults' entry
# (the records that are iterated when mapping a point to a planning area).
with open('../data/raw/planning_area.json') as f:
    planning_areas = json.load(f)['SearchResults']

In [18]:
import re

def is_valid_postal_code(postal_code):
    """Return True when ``postal_code`` is exactly six ASCII digits.

    The value is converted with ``str()`` first, so integers are accepted as long as
    their decimal form has six digits.

    ``re.fullmatch`` with an explicit ``[0-9]`` class is used instead of
    ``re.match(r"^\\d{6}$")``: ``$`` also matches just before a trailing newline
    (so ``"123456\\n"`` used to pass) and ``\\d`` also matches non-ASCII digits.
    """
    return re.fullmatch(r"[0-9]{6}", str(postal_code)) is not None

In [19]:
def get_coordinates_from_postal(postal_code, timeout=10):
    """Look up the latitude/longitude of a postal code via the OneMap search API.

    Args:
        postal_code: Postal code as a string or integer; must be six digits.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``(latitude, longitude)`` as floats, or ``(None, None)`` when the postal code is
        malformed or no returned result carries exactly that postal code.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx response.
    """
    if not is_valid_postal_code(postal_code):
        return None, None

    postal_code = str(postal_code)
    response = requests.get(
        "https://www.onemap.gov.sg/api/common/elastic/search",
        # Let requests build (and URL-encode) the query string.
        params={
            "searchVal": postal_code,
            "returnGeom": "Y",
            "getAddrDetails": "Y",
            "pageNum": 1,
        },
        headers={"Authorization": ONEMAP_TOKEN},
        timeout=timeout,  # without a timeout a stalled connection blocks the kernel forever
    )
    # Surface HTTP errors (e.g. an expired token) instead of failing later on a missing key.
    response.raise_for_status()
    data = response.json()

    # Accept only a result whose POSTAL field equals the requested code; error payloads
    # without a "results" list simply yield no match.
    for result in data.get("results") or []:
        if result.get("POSTAL") == postal_code:
            return float(result["LATITUDE"]), float(result["LONGITUDE"])

    return None, None


def get_planning_area_from_point(lat, lon, planning_areas):
    """Return the name of the first planning area whose polygon contains the point.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.
        planning_areas: Iterable of records, each with a JSON-encoded geometry under
            'geojson' and the area name under 'pln_area_n'.

    Returns:
        The 'pln_area_n' value of the first containing area, or None if none contains it.
    """
    # Shapely points are (x, y), i.e. (longitude, latitude).
    location = Point(lon, lat)
    containing_names = (
        area['pln_area_n']
        for area in planning_areas
        if shape(json.loads(area['geojson'])).contains(location)
    )
    return next(containing_names, None)

def get_planning_area_from_postal(postal_code):
    """Resolve a postal code to its planning-area name.

    Returns:
        The planning-area name (or None when the point lies in no known area), or the
        literal string "Invalid postal code" when the postal code cannot be geocoded.
    """
    lat, lon = get_coordinates_from_postal(postal_code)
    has_coordinates = lat is not None and lon is not None
    if has_coordinates:
        return get_planning_area_from_point(lat, lon, planning_areas)
    return "Invalid postal code"

In [20]:
# Sanity check: an input that is not six digits is rejected and yields (None, None).
get_coordinates_from_postal(1)

(None, None)

In [21]:
# Display the resale table (after the "town" column was dropped).
df_resale

Unnamed: 0,month,flat_type,storey_range,floor_area_sqm,flat_model,remaining_lease,resale_price,lat,lon,nearest_mrt_distance,...,recreation_score,healthcare_score,CPI (base 2024-12),inflation_rate (x100),building_age_2025,total_unemployment_rate,resident_unemployment_rate,interest_rate,fx_rate,avg_household_income
0,2017-03,3,13,59.0,Improved,58.08,343082.552693,1.270380,103.823236,580.883827,...,29.626696,55.974066,84.527761,0.041405,49,3.1,2.2,0.5687,1.4057,4834.854015
1,2017-05,3,7,59.0,Improved,57.92,342964.842883,1.270380,103.823236,580.883827,...,29.626696,55.974066,84.556772,0.340212,49,3.1,2.2,0.4503,1.3946,4834.854015
2,2018-01,3,10,59.0,Improved,57.25,325432.170445,1.270380,103.823236,580.883827,...,29.626696,55.974066,84.503016,-0.191484,49,2.9,2.1,0.8294,1.3220,4834.854015
3,2018-06,3,1,59.0,Improved,56.92,294450.027637,1.270380,103.823236,580.883827,...,29.626696,55.974066,84.904050,0.056310,49,2.9,2.1,1.3426,1.3474,4834.854015
4,2018-06,3,4,59.0,Improved,56.92,300339.028190,1.270380,103.823236,580.883827,...,29.626696,55.974066,84.904050,0.056310,49,2.9,2.1,1.3426,1.3474,4834.854015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201055,2024-07,5,7,110.0,Improved,78.08,555541.334850,1.457071,103.815308,1035.713889,...,25.048367,8.851718,99.002534,-0.253604,22,2.7,2.0,3.4336,1.3469,2348.203924
201056,2024-08,4,13,91.0,Model A,78.00,501574.951425,1.457071,103.815308,1035.713889,...,25.048367,8.851718,99.685999,0.690351,22,2.7,2.0,3.5355,1.3159,2348.203924
201057,2024-10,5,13,111.0,Improved,77.83,630372.023708,1.457071,103.815308,1035.713889,...,25.048367,8.851718,99.623711,-0.313346,22,2.7,2.0,3.1559,1.3090,2348.203924
201058,2024-10,5,7,110.0,Improved,77.83,552077.409298,1.457071,103.815308,1035.713889,...,25.048367,8.851718,99.623711,-0.313346,22,2.7,2.0,3.1559,1.3090,2348.203924


In [22]:
# Bus-stop locations come from a shapefile; MRT stations from a CSV that already
# carries Latitude/Longitude columns.
bus_stops = gpd.read_file("../data/raw/BusStopLocation_Nov2024/BusStop.shp")
MRT_stops = pd.read_csv("../data/raw/MRT Stations.csv")
# Show the MRT station table.
MRT_stops

Unnamed: 0.1,Unnamed: 0,OBJECTID,STN_NAME,STN_NO,geometry,Latitude,Longitude
0,0,1,EUNOS MRT STATION,EW7,POINT (103.9032524667383 1.319778951553637),1.319779,103.903252
1,1,2,CHINESE GARDEN MRT STATION,EW25,POINT (103.7325967380734 1.342352820874744),1.342353,103.732597
2,2,3,KHATIB MRT STATION,NS14,POINT (103.8329799077383 1.417383370153547),1.417383,103.832980
3,3,4,KRANJI MRT STATION,NS7,POINT (103.7621654109002 1.425177698770448),1.425178,103.762165
4,4,5,REDHILL MRT STATION,EW18,POINT (103.816816670149 1.289562726402453),1.289563,103.816817
...,...,...,...,...,...,...,...
166,166,198,SPRINGLEAF MRT STATION,TE4,POINT (103.8180818498627 1.398160861025955),1.398161,103.818082
167,167,197,LENTOR MRT STATION,TE5,POINT (103.8364694869142 1.385061946926286),1.385062,103.836469
168,168,196,MAYFLOWER MRT STATION,TE6,POINT (103.8368239320149 1.372086638674201),1.372087,103.836824
169,169,195,BRIGHT HILL MRT STATION,TE7,POINT (103.8329359578363 1.363308098095808),1.363308,103.832936


In [23]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in meters between two points given in decimal degrees.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        Distance in meters on a sphere of radius 6,371,000 m.
    """
    R = 6371000  # radius of Earth in meters
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi = radians(lat2 - lat1)
    dlambda = radians(lon2 - lon1)
    a = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlambda / 2) ** 2
    # Floating-point rounding can leave `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError; clamp it first.
    a = min(1.0, max(0.0, a))
    return 2 * R * atan2(sqrt(a), sqrt(1 - a))

from pyproj import Transformer

# Reusable coordinate transformer from EPSG:3414 to EPSG:4326. With always_xy=True it works
# in x/y order: (easting, northing) in, (longitude, latitude) out.
transformer = Transformer.from_crs("EPSG:3414", "EPSG:4326", always_xy=True)

def svy21_to_wgs84(easting, northing):
    """Convert projected (easting, northing) coordinates to (latitude, longitude).

    The module-level ``transformer`` returns its result in (longitude, latitude)
    order; the pair is swapped so callers receive latitude first.
    """
    lon_lat = transformer.transform(easting, northing)
    longitude, latitude = lon_lat
    return latitude, longitude


# Projected coordinates of every bus stop, read straight from the point geometries.
bus_stops['x_coord'] = bus_stops.geometry.x
bus_stops['y_coord'] = bus_stops.geometry.y
# Convert all stops in a single vectorised call (the transformer accepts arrays) instead of
# one Python-level call per row; svy21_to_wgs84 returns (latitude, longitude).
bus_stops['Latitude'], bus_stops['Longitude'] = svy21_to_wgs84(
    bus_stops['x_coord'].to_numpy(), bus_stops['y_coord'].to_numpy()
)

# NOTE: a second, identical definition of haversine() used to follow here and silently
# shadowed the one defined at the top of this cell; it was removed so the function is
# defined exactly once.

def get_nearest_distances(lat, lon):
    """Distance in meters from a point to the nearest MRT station and nearest bus stop.

    Args:
        lat: Latitude of the query point, in degrees.
        lon: Longitude of the query point, in degrees.

    Returns:
        Tuple ``(nearest_mrt_distance, nearest_bus_distance)``. An entry is NaN when the
        corresponding stop table is empty.

    The distances are computed into a temporary Series; the shared ``MRT_stops`` and
    ``bus_stops`` frames are only read, so calling this function no longer adds or
    overwrites a 'distance' column on them.
    """
    def _nearest(stops):
        # Iterating the two coordinate columns directly avoids the per-row Series
        # construction of DataFrame.apply(axis=1).
        distances = pd.Series(
            [
                haversine(lat, lon, stop_lat, stop_lon)
                for stop_lat, stop_lon in zip(stops['Latitude'], stops['Longitude'])
            ],
            dtype=float,
        )
        return distances.min()

    return _nearest(MRT_stops), _nearest(bus_stops)



In [24]:
# Example coordinates for Singapore; the result is (nearest_mrt_distance, nearest_bus_distance).
get_nearest_distances(1.3521, 103.8198)  # Example coordinates for Singapore

(1482.9413042287379, 1151.2215422154368)

### Get amenity scores

In [25]:
def get_amenity_score(hdb_lat, hdb_lon, amenity_type, radius_m=3000, scale=1000, distance_offset_m=50):
    """Distance-weighted count of amenities of one type around a location.

    Every amenity within ``radius_m`` meters contributes
    ``scale / (distance + distance_offset_m)``; the contributions are summed.

    Args:
        hdb_lat: Latitude of the flat, in degrees.
        hdb_lon: Longitude of the flat, in degrees.
        amenity_type: One of "education", "healthcare", "shopping", "food", "recreation".
        radius_m: Amenities farther away than this (meters) are ignored.
        scale: Numerator of each amenity's contribution.
        distance_offset_m: Added to the distance so a very close amenity cannot produce an
            unbounded contribution.

    Returns:
        The total score (0 when no amenity lies within the radius).

    Raises:
        ValueError: If ``amenity_type`` is not supported.
        FileNotFoundError: If the CSV for the requested amenity type is missing.
    """
    amenity_files = {
        "education": "../data/raw/Amenities_school.csv",
        "healthcare": "../data/raw/Amenities_healthcare.csv",
        "shopping": "../data/raw/Amenities_shopping.csv",
        "food": "../data/raw/Amenities_food.csv",
        "recreation": "../data/raw/Amenities_recreation.csv",
    }
    try:
        path = amenity_files[amenity_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs, which are just as unsupported.
        raise ValueError(f"Unsupported amenity type: {amenity_type}") from None

    df = pd.read_csv(path)

    # Walk the two coordinate columns directly rather than materialising a row per amenity.
    distances = (
        haversine(hdb_lat, hdb_lon, amenity_lat, amenity_lon)
        for amenity_lat, amenity_lon in zip(df['lat'], df['lon'])
    )
    return sum(
        scale / (distance + distance_offset_m)
        for distance in distances
        if distance <= radius_m
    )


In [26]:
# Example: healthcare score for one coordinate. Requires ../data/raw/Amenities_healthcare.csv;
# the recorded run of this cell failed with FileNotFoundError because that file was missing.
print(get_amenity_score(1.28064950442563,103.810750913525,"healthcare"))

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/Amenities_healthcare.csv'

### Get religion shares and average household income

In [27]:
def get_religion(town):
    """Return the religion shares of ``town`` as a one-row DataFrame.

    The 2020 religion table is loaded, 'Kallang' is relabelled 'Kallang/Whampoa', the rows
    for Outram, Downtown Core, River Valley and Novena are summed into a single
    'CENTRAL AREA' row, the 'Total' and 'Others' rows are discarded, town names are
    upper-cased, and every religion count is divided by the row's 'Total'.

    Args:
        town: Upper-case town name to look up.

    Returns:
        A DataFrame with one row and one column per religion share, or an empty
        DataFrame (no columns) when ``town`` is not present in the table.
    """
    central_towns = ['Outram', 'Downtown Core', 'River Valley', 'Novena']
    share_columns = [
        'NoReligion', 'Buddhism', 'Taoism1', 'Islam', 'Hinduism', 'Sikhism',
        'Christianity_Catholic', 'Christianity_OtherChristians', 'OtherReligions',
    ]

    religion = pd.read_csv("../data/raw/religion_2020.csv").rename(columns={"Number": "town"})
    religion['town'] = religion['town'].replace('Kallang', 'Kallang/Whampoa')

    # Fold the four central towns into one aggregated record.
    central_record = religion[religion['town'].isin(central_towns)].sum(numeric_only=True)
    central_record['town'] = 'CENTRAL AREA'
    keep_mask = ~religion['town'].isin(central_towns + ['Total', 'Others'])
    religion = pd.concat(
        [religion[keep_mask], pd.DataFrame([central_record])],
        ignore_index=True,
    )
    religion['town'] = religion['town'].str.upper()

    # First make every value column numeric (including 'Total'), then turn the
    # counts into shares of 'Total'. The two passes must stay separate so that
    # 'Total' is already numeric when it is used as the divisor.
    value_columns = [name for name in religion.columns if name != "town"]
    for name in value_columns:
        religion[name] = pd.to_numeric(religion[name], errors='coerce').astype(float)
    for name in value_columns:
        if name != "Total":
            religion[name] = religion[name] / religion["Total"]

    matching_rows = religion[religion['town'] == town]
    if matching_rows.empty:
        return pd.DataFrame()
    first_match = matching_rows.iloc[0]
    return pd.DataFrame({name: [first_match[name]] for name in share_columns})


In [28]:
def get_income(town, month='2024-12'):
    """Return the average household income recorded for ``town`` in ``month``.

    Args:
        town: Town name as it appears in the 'town' column of the cleaned resale CSV.
        month: Value of the 'month' column to take the income from.

    Returns:
        A one-row DataFrame with a single 'avg_household_income' column holding the first
        matching record, or an empty DataFrame (no columns) when there is no match.
    """
    # Only three of the file's columns are needed, so skip parsing the rest.
    income = pd.read_csv(
        "../data/cleaned/resale_price_cleaned.csv",
        usecols=['month', 'town', 'avg_household_income'],
    )
    is_match = (income['month'] == month) & (income['town'] == town)
    matches = income.loc[is_match, 'avg_household_income']
    if matches.empty:
        return pd.DataFrame()
    return pd.DataFrame({'avg_household_income': [matches.iloc[0]]})

In [29]:
# Re-load the cleaned resale CSV under the name df_income; unlike df_resale, this copy
# still has its "town" column.
df_income = pd.read_csv("../data/cleaned/resale_price_cleaned.csv")

In [30]:
def get_flat_type(flat_type):
    """Map a flat-type label to its ordinal code.

    Args:
        flat_type: One of "1 Room" ... "5 Room", "Executive" or "Multi-Generation".

    Returns:
        The integer code 1-7 for the label.

    Raises:
        ValueError: If the label is not one of the supported flat types.
    """
    ordinal_codes = (
        ("1 Room", 1),
        ("2 Room", 2),
        ("3 Room", 3),
        ("4 Room", 4),
        ("5 Room", 5),
        ("Executive", 6),
        ("Multi-Generation", 7),
    )
    for label, code in ordinal_codes:
        if flat_type == label:
            return code
    raise ValueError(f"Unsupported flat type: {flat_type}")

In [31]:
# Column names, non-null counts and dtypes of the full resale table.
df_income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201060 entries, 0 to 201059
Data columns (total 25 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   month                       201060 non-null  object 
 1   town                        201060 non-null  object 
 2   flat_type                   201060 non-null  int64  
 3   storey_range                201060 non-null  int64  
 4   floor_area_sqm              201060 non-null  float64
 5   flat_model                  201060 non-null  object 
 6   remaining_lease             201060 non-null  float64
 7   resale_price                196986 non-null  float64
 8   lat                         201060 non-null  float64
 9   lon                         201060 non-null  float64
 10  nearest_mrt_distance        201060 non-null  float64
 11  nearest_bus_distance        201060 non-null  float64
 12  education_score             201060 non-null  float64
 13  shopping_score

In [32]:
def price_predict(storey_range, flat_type, remaining_lease, postal_code, model):
    """Predict the resale price of a flat and report its location-based features.

    Args:
        storey_range: Storey value of the flat, in the same encoding as the training data.
        flat_type: Flat-type label accepted by ``get_flat_type`` (e.g. "4 Room").
        remaining_lease: Remaining lease of the flat.
        postal_code: Six-digit postal code (string or integer) used to locate the flat.
        model: Fitted estimator exposing ``predict``; it is given a one-row DataFrame and
            is expected to return the min-max scaled resale price.

    Returns:
        ``[price, education_score, shopping_score, food_score, recreation_score,
        healthcare_score, nearest_mrt_distance, nearest_bus_distance]`` where ``price`` is
        mapped back to the original price range.

    Raises:
        ValueError: If the flat type is unsupported, the postal code cannot be geocoded,
            or no religion/income record exists for the resolved town.
    """
    # Fail fast on a bad flat type, before any network or file access.
    flat_type_code = get_flat_type(flat_type)

    # Geocode once and derive the town from the same coordinates (previously the postal
    # code was sent to the API twice, and a failed lookup surfaced as an obscure TypeError).
    lat, lon = get_coordinates_from_postal(postal_code)
    if lat is None or lon is None:
        raise ValueError(f"Invalid or unknown postal code: {postal_code!r}")
    town = get_planning_area_from_point(lat, lon, planning_areas)

    # One pass over the stop tables yields both distances.
    mrt, bus = get_nearest_distances(lat, lon)

    # amenity score
    education_score = get_amenity_score(lat, lon, "education")
    shopping_score = get_amenity_score(lat, lon, "shopping")
    food_score = get_amenity_score(lat, lon, "food")
    recreation_score = get_amenity_score(lat, lon, "recreation")
    healthcare_score = get_amenity_score(lat, lon, "healthcare")

    # demographic
    religion_df = get_religion(town)
    income_df = get_income(town)
    if religion_df.empty or income_df.empty:
        # NOTE(review): presumably happens when the planning-area name does not match the
        # town labels used by the religion/income tables - confirm the naming of both.
        raise ValueError(f"No religion/income data available for town {town!r}")
    religion = religion_df.iloc[0]

    # Column order is kept exactly as before, since the model receives the frame as-is.
    X_input = pd.DataFrame([{
        'town': town,
        'storey_range': storey_range,
        'flat_type': flat_type_code,
        'remaining_lease': remaining_lease,
        'lat': lat,
        'lon': lon,
        'nearest_mrt_distance': mrt,
        'nearest_bus_distance': bus,
        'education_score': education_score,
        'shopping_score': shopping_score,
        'food_score': food_score,
        'recreation_score': recreation_score,
        'healthcare_score': healthcare_score,

        # Based on 2024-12 data
        'inflation_rate (x100)': 0.3468,
        'resident_unemployment_rate': 2.0,
        'interest_rate': 2.1123,

        'avg_household_income': income_df['avg_household_income'].iloc[0],
        'NoReligion': religion['NoReligion'],
        'Buddhism': religion['Buddhism'],
        'Taoism1': religion['Taoism1'],
        'Islam': religion['Islam'],
        'Hinduism': religion['Hinduism'],
        'Sikhism': religion['Sikhism'],
        'Christianity_Catholic': religion['Christianity_Catholic'],
        'Christianity_OtherChristians': religion['Christianity_OtherChristians'],
        'OtherReligions': religion['OtherReligions'],
        'priv_prop': 27531.0,

        'year': 2025,
        'month_num': 4
    }])

    # Min-max scale every input column for which bounds were recorded; columns without an
    # entry in min_max_dict are passed through unchanged.
    for col in X_input.columns:
        if col in min_max_dict:
            X_input[col] = X_input[col].apply(lambda x, col=col: normalize_column(x, col))

    y = model.predict(X_input)
    # Undo the min-max scaling of the target to get a price in original units.
    price_bounds = min_max_dict['resale_price']
    y = y * (price_bounds['max'] - price_bounds['min']) + price_bounds['min']
    return [y[0], education_score, shopping_score, food_score, recreation_score, healthcare_score, mrt, bus]




In [33]:
# Inspect the recorded per-column bounds used for scaling.
min_max_dict

{'month': {'min': '2017-01', 'max': '2025-02'},
 'flat_type': {'min': 1, 'max': 7},
 'storey_range': {'min': 1, 'max': 49},
 'floor_area_sqm': {'min': 31.0, 'max': 366.7},
 'flat_model': {'min': '2-room', 'max': 'Type S2'},
 'remaining_lease': {'min': 40.08, 'max': 97.75},
 'resale_price': {'min': 152411.06959243302, 'max': 1604833.3531403835},
 'lat': {'min': 1.270379512, 'max': 1.457071216},
 'lon': {'min': 103.6852284, 'max': 103.9878045},
 'nearest_mrt_distance': {'min': 22.1639018, 'max': 3535.397706},
 'nearest_bus_distance': {'min': 15.43904898, 'max': 392.2674966},
 'education_score': {'min': 0.0, 'max': 50.15530334701583},
 'shopping_score': {'min': 0.8820520610078136, 'max': 135.63259326647125},
 'food_score': {'min': 0.3497729027784391, 'max': 48.37232598769496},
 'recreation_score': {'min': 11.83005040709938, 'max': 170.35992294508145},
 'healthcare_score': {'min': 0.0, 'max': 330.67760475375417},
 'CPI (base 2024-12)': {'min': 84.2393576627388, 'max': 100.0},
 'inflation_r

In [34]:
import joblib
# Load the fitted model from the working directory.
# NOTE: joblib.load unpickles the file and can execute arbitrary code, so only load model
# files from a trusted source.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so 'model.pkl'
# must be present here before the prediction cell below can run on a fresh kernel.
model = joblib.load('model.pkl')

FileNotFoundError: [Errno 2] No such file or directory: 'model.pkl'

In [None]:
# Smoke test: print the raw result list for one flat.
print (price_predict(4, "4 Room", 40, 597539, model))

# Example usage
# price_predict returns eight values in this order: price, the five amenity scores,
# then the nearest MRT and bus-stop distances.
predicted_price, education_score, shopping_score, food_score, recreation_score, healthcare_score, mrt, bus = price_predict(1, "4 Room", 50, 597539, model)
print(f"Predicted Price: {predicted_price}")
print(f"Education Score: {education_score}")
print(f"Shopping Score: {shopping_score}")
print(f"Food Score: {food_score}")
print(f"Recreation Score: {recreation_score}")
print(f"Healthcare Score: {healthcare_score}")
print(f"Nearest MRT Distance: {mrt}")
print(f"Nearest Bus Distance: {bus}")

[646170.4860165251, 12.267949250553432, 32.73928057477365, 4.506878852764179, 42.73739867560851, 37.038618803164645, 1001.7169791140378, 131.3132051203298]
Predicted Price: 629472.0823508442
Education Score: 12.267949250553432
Shopping Score: 32.73928057477365
Food Score: 4.506878852764179
Recreation Score: 42.73739867560851
Healthcare Score: 37.038618803164645
Nearest MRT Distance: 1001.7169791140378
Nearest Bus Distance: 131.3132051203298
