In [1]:
import pandas as pd  # Pandas for data manipulation
import numpy as np   # NumPy for numerical operations
import os

### Get OneMap API access token

In [2]:
import getpass

import requests

# Credentials are read from the environment (ONEMAP_EMAIL / ONEMAP_PASSWORD) and
# only prompted for when missing, so no secret is ever stored in the notebook.
email = os.environ.get("ONEMAP_EMAIL") or input("OneMap email: ")
password = os.environ.get("ONEMAP_PASSWORD") or getpass.getpass("OneMap password: ")

token_url = "https://www.onemap.gov.sg/api/auth/post/getToken"
data = {"email": email, "password": password}

# A timeout keeps the cell from hanging forever if the service is unreachable.
response = requests.post(token_url, json=data, timeout=30)

if response.status_code != 200:
    # Fail here rather than letting later cells hit a NameError on access_token.
    raise RuntimeError(f"Failed to get token: {response.status_code} {response.text}")

access_token = response.json().get("access_token")
if not access_token:
    raise RuntimeError("Token endpoint returned 200 but no 'access_token' field.")

# Deliberately do not print the token: cell outputs are saved with the notebook.
print("Access token retrieved.")


Access Token: <redacted>


## Get coordinates of Amenities
* Healthcare: Clinics, hospitals
* Food: Restaurants, food courts
* Shopping: Supermarkets, malls
* Education: Schools
* Recreation: Parks, gyms, libraries

### Define function to get coordinates from OneMap API

In [4]:
onemap_search_url = "https://www.onemap.gov.sg/api/common/elastic/search"

def get_coordinates(df_orig, token):
    """Geocode the ``address`` column of ``df_orig`` in place via the OneMap search API.

    Each distinct address is looked up once. The frame then gains three columns:
    ``lat`` and ``lon`` (float, NaN when the lookup failed or found nothing) and
    ``LatLng`` (the two values joined by a comma as text).

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Must contain an ``address`` column. It is modified in place.
    token : str
        Value sent as the ``Authorization`` header.

    Returns
    -------
    None
    """
    coords_by_address = {}

    def get_coord(address):
        """Return ``(lat, lon)`` as floats, or ``(None, None)`` if unavailable."""
        if address in coords_by_address:  # already fetched
            return coords_by_address[address]

        params = {
            "searchVal": address,
            "returnGeom": "Y",
            "getAddrDetails": "Y",
            "pageNum": 1
        }
        headers = {'Authorization': token}

        coord = (None, None)
        try:
            response = requests.get(onemap_search_url, headers=headers, params=params, timeout=30)

            if response.status_code == 200:
                data = response.json()
                results = data.get("results") or []
                if data.get("found", 0) > 0 and results:
                    best = results[0]  # first hit is taken as the match
                    coord = (float(best["LATITUDE"]), float(best["LONGITUDE"]))

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {address}: {e}")
        except (KeyError, TypeError, ValueError) as e:
            # Malformed or non-JSON payload: treat as "not found" and keep going.
            print(f"Unexpected response for {address}: {e}")

        # Always record an entry, including on failure, so the column mapping
        # below can never hit a missing key.
        coords_by_address[address] = coord
        return coord

    # Populate the cache, one request per distinct address
    for address in df_orig["address"].unique():
        get_coord(address)

    # Assign coordinates back to every row
    not_found = (None, None)
    df_orig["lat"] = df_orig["address"].map(lambda a: coords_by_address.get(a, not_found)[0])
    df_orig["lon"] = df_orig["address"].map(lambda a: coords_by_address.get(a, not_found)[1])

    df_orig["LatLng"] = df_orig["lat"].astype(str) + "," + df_orig["lon"].astype(str)
    df_orig["lat"] = df_orig["lat"].astype(float)
    df_orig["lon"] = df_orig["lon"].astype(float)

    return None

### ✅ Healthcare
from OpenStreetMap contributors

In [5]:
import geopandas as gpd

# Load the health-facility layer from KML. The next cell reads `.geometry.x` / `.geometry.y`,
# so the layer is assumed to hold Point geometries only — TODO confirm for this file.
df_health = gpd.read_file("../data/raw/health_facilities_points.kml", driver='KML')

In [6]:
# Flatten the point geometry into lon/lat columns, drop the KML-only fields and
# reshape to the shared amenity schema: name, amenity_type, LatLng, lat, lon.
df_health = (
    df_health
    .assign(lon=df_health.geometry.x, lat=df_health.geometry.y)
    .drop(columns=["geometry", "Description"])
    .assign(LatLng=lambda frame: frame["lat"].astype(str) + "," + frame["lon"].astype(str))
    .astype({"lat": float, "lon": float})
    .rename(columns={"Name": "name"})
    .assign(amenity_type="healthcare")
    [["name", "amenity_type", "LatLng", "lat", "lon"]]
)

In [7]:
# Export to CSV
# df_health.to_csv(os.path.join("../data", "Amenities_healthcare.csv"), index=False)

### ✅ Food - hawker centers
* Source: Data.gov.sg (only hawker centre data is available for this category)

In [8]:
# Read the hawker-centre KML layer (its .x/.y are used below, so Point geometries are expected)
df_hawkerc = gpd.read_file("../data/raw/HawkerCentresKML.kml", driver='KML')

# Flatten geometry to lon/lat, drop the KML-only fields and reshape to the
# shared amenity schema: name, amenity_type, LatLng, lat, lon.
df_hawkerc = (
    df_hawkerc
    .assign(lon=df_hawkerc.geometry.x, lat=df_hawkerc.geometry.y)
    .drop(columns=["geometry", "Description"])
    .assign(LatLng=lambda frame: frame["lat"].astype(str) + "," + frame["lon"].astype(str))
    .astype({"lat": float, "lon": float})
    .rename(columns={"Name": "name"})
    .assign(amenity_type="food")
    [["name", "amenity_type", "LatLng", "lat", "lon"]]
)

In [9]:
# Export to CSV
# df_hawkerc.to_csv(os.path.join("../data", "Amenities_food.csv"), index=False)

In [10]:
# Sanity check: column dtypes and non-null counts of the cleaned hawker-centre table.
df_hawkerc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          125 non-null    object 
 1   amenity_type  125 non-null    object 
 2   LatLng        125 non-null    object 
 3   lat           125 non-null    float64
 4   lon           125 non-null    float64
dtypes: float64(2), object(3)
memory usage: 5.0+ KB


### ✅ Shopping
* malls & supermarkets
* from Kaggle https://www.kaggle.com/datasets/karthikgangula/shopping-mall-coordinates?resource=download

In [11]:
# Load the mall coordinates and reshape to the shared amenity schema.
df_mall = (
    pd.read_csv("../data/raw/shopping_mall_coordinates.csv")
    .rename(columns={"Mall Name": "name", "LATITUDE": "lat", "LONGITUDE": "lon"})
    .assign(LatLng=lambda frame: frame["lat"].astype(str) + "," + frame["lon"].astype(str))
    .astype({"lat": float, "lon": float})
    # Has an effect only if the CSV also carries a "Name" column; otherwise a no-op.
    .rename(columns={"Name": "name"})
    .assign(amenity_type="shopping")
    [["name", "amenity_type", "LatLng", "lat", "lon"]]
)


Supermarkets

In [12]:
df_superm = pd.read_csv("../data/raw/ListingofSupermarkets.csv")

# Keep only the leading part of the address: cut at the first " #", then at the
# first comma, before the address is used as a geocoding search term.
df_superm["premise_address"] = (
    df_superm["premise_address"]
    .str.split(" #").str[0]
    .str.split(",").str[0]
)
df_superm = df_superm.rename(columns={"business_name": "name", "premise_address": "address"})

In [13]:
# Geocode each distinct supermarket address; adds lat / lon / LatLng to df_superm in place
# (the function itself returns None).
get_coordinates(df_superm, access_token)

In [14]:
# Tag supermarkets, trim to the shared amenity schema, then stack them under the malls.
df_superm = df_superm.assign(amenity_type="shopping")[["name", "amenity_type", "LatLng", "lat", "lon"]]
df_shopping = pd.concat([df_mall, df_superm], ignore_index=True)

In [15]:
# Export to CSV
# df_shopping.to_csv(os.path.join("../data", "Amenities_shopping.csv"), index=False)

### ✅ Education 
* primary, secondary, junior college, centralised institutes, mixed levels
* from Data.gov.sg

In [16]:
# Load the school listing; get_coordinates reads its "address" column and adds
# lat / lon / LatLng to df_school in place.
df_school = pd.read_csv('../data/raw/Schools_data.csv')
get_coordinates(df_school, access_token)

In [17]:
# Tag as education and reshape to the shared amenity schema.
df_school = (
    df_school
    .assign(amenity_type="education")
    .rename(columns={"school_name": "name"})
    [["name", "amenity_type", "LatLng", "lat", "lon"]]
)

In [18]:
# Export to CSV
# df_school.to_csv(os.path.join("../data", "Amenities_school.csv"), index=False)

### ✅ Recreation 
* museums, libraries, theatres, parks, nature reserves
* from OneMap theme

In [19]:
theme_info_url = "https://www.onemap.gov.sg/api/public/themesvc/getAllThemesInfo?moreInfo=Y"
headers = {"Authorization": access_token}

# Investigate themes available.
# HTTP and network failures are handled explicitly: `.get("Theme_Names", [])` never
# raises KeyError, so the previous `except KeyError` could not catch anything.
try:
    response = requests.get(theme_info_url, headers=headers, timeout=30)
    response.raise_for_status()
    amenity_type = response.json().get("Theme_Names", [])
except (requests.exceptions.RequestException, ValueError) as exc:
    # ValueError covers a non-JSON body on older versions of requests.
    print(f"Error fetching themes: {exc}")
    amenity_type = []

df_amenity_type_unique = pd.DataFrame(
    amenity_type, columns=["THEMENAME", "QUERYNAME", "CATEGORY"]
).drop_duplicates()

if df_amenity_type_unique.empty:
    # Do not overwrite a previously saved catalogue with an empty table.
    print("No themes returned; Unique_themes.csv was not rewritten.")
else:
    df_amenity_type_unique.to_csv(os.path.join("../data/raw", "Unique_themes.csv"), index=False)

In [20]:
# Select the themes that count as recreation:
#   - every theme in CATEGORY "Sports"
#   - CATEGORY "Culture": Libraries, Museums, Theatre
#   - CATEGORY "Recreation": Parks, Nature Reserves Gazette 2005
df_amenity_type = df_amenity_type_unique[
    (df_amenity_type_unique["CATEGORY"] == "Sports") |
    ((df_amenity_type_unique["CATEGORY"] == "Culture") & (df_amenity_type_unique["THEMENAME"].isin(["Libraries", "Museums", "Theatre"]))) |
    ((df_amenity_type_unique["CATEGORY"] == "Recreation") & (df_amenity_type_unique["THEMENAME"].isin(["Parks", "Nature Reserves Gazette 2005"])))
].reset_index(drop=True)  # the reset result was previously discarded, so the index was never reset

retrieve_theme_url = "https://www.onemap.gov.sg/api/public/themesvc/retrieveTheme"
headers = {"Authorization": access_token}
query_names = df_amenity_type["QUERYNAME"]

In [21]:
# Collect one frame per theme and concatenate once at the end, instead of growing
# df_recre inside the loop.
recre_frames = []

for query_name in query_names:
    params = {"queryName": query_name}

    try:
        response = requests.get(retrieve_theme_url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        amenity_coor = response.json().get("SrchResults", [])
    except (requests.exceptions.RequestException, ValueError) as exc:
        print(f"Error retrieving latlng for {query_name}: {exc}")
        continue

    # The first record supplies the theme-level "Category"; the remaining records
    # are the individual features. A payload without both parts is skipped.
    if len(amenity_coor) < 2 or "Category" not in amenity_coor[0]:
        print(f"Error retrieving latlng: {response.status_code}, {response.text}")
        continue

    theme_features = pd.DataFrame(amenity_coor[1:])
    if not {"NAME", "LatLng"}.issubset(theme_features.columns):
        print(f"Error retrieving latlng: {response.status_code}, {response.text}")
        continue

    theme_features["Category"] = amenity_coor[0]["Category"]
    recre_frames.append(theme_features[["NAME", "Category", "LatLng"]])

if recre_frames:
    df_recre = pd.concat(recre_frames, ignore_index=True)
else:
    df_recre = pd.DataFrame(columns=["NAME", "Category", "LatLng"])

# Add amenity_type: the three leisure categories map to "recreation", anything else to "education"
df_recre["amenity_type"] = (
    df_recre["Category"]
    .isin(["Culture", "Sports", "Recreation"])
    .map({True: "recreation", False: "education"})
)
# Rename column
df_recre = df_recre.rename(columns={"NAME": "name"})



In [22]:
# Clean df_recre: normalise "LatLng" to one "lat,lon" pair per row and derive float lat / lon.
# The statements below depend on running in exactly this order.

## Some "LatLng" values are list-like text (starting with "[["); keep only their first coordinate pair
import ast
# Parse the "[[...". strings into real Python lists; all other values stay as plain strings
df_recre["LatLng"] = df_recre["LatLng"].astype(str)
df_recre["LatLng"] = df_recre["LatLng"].apply(lambda x: ast.literal_eval(x) if x.startswith("[[") else x)
# Take the first element of each parsed list (non-list values are left unchanged)
df_recre["LatLng"] = df_recre["LatLng"].apply(lambda x: x[0] if isinstance(x, list) else x)

# Back to text: strip square brackets and the space after each comma
df_recre["LatLng"] = df_recre["LatLng"].astype(str)
df_recre["LatLng"] = df_recre["LatLng"].str.replace(r"[\[\]]", "", regex=True)
df_recre["LatLng"] = df_recre["LatLng"].astype(str).str.replace(", ", ",", regex=False)

# Split on the comma: the FIRST field is stored as "lon" and the SECOND as "lat".
# NOTE(review): the assignment needs exactly two fields per row; deeper-nested
# lists would yield more than two — confirm none occur in the selected themes.
df_recre[["lon", "lat"]] = df_recre["LatLng"].str.split(",", expand=True)
df_recre["lat"] = df_recre["lat"].astype(float)
df_recre["lon"] = df_recre["lon"].astype(float)

## Rows where "lat" > 2 and "lon" < 100 are treated as having the two values reversed
## (presumably because plain-string entries arrive as "lat,lon" — TODO confirm)
flag = (df_recre["lat"] > 2) & (df_recre["lon"] < 100)
# Swap lat and lon for those rows (.values avoids label alignment undoing the swap)
df_recre.loc[flag, ["lat", "lon"]] = df_recre.loc[flag, ["lon", "lat"]].values

# Rebuild "LatLng" from the corrected values as "lat,lon"
df_recre["LatLng"] = df_recre["lat"].astype(str) + "," + df_recre["lon"].astype(str)

df_recre = df_recre[["name", "amenity_type", "LatLng", "lat", "lon"]]


In [23]:
# Export to CSV
# df_recre.to_csv(os.path.join("../data", "Amenities_recreation.csv"), index=False)

## Calculating scores for amenities
Haversine Vectorized Method

#### Import data

In [24]:
# --- Scoring configuration ---
route_url = "https://www.onemap.gov.sg/api/public/routingsvc/route"
min_distance = 50  # metres; floor applied to distances so very close amenities cannot produce extreme scores
lat_range = 0.025  # degrees of latitude, roughly 3 km
lon_range = 0.025  # degrees of longitude, roughly 3 km

# --- Unique HDB coordinates, with column names aligned to the amenity tables ---
df_unique_hdb_coor = (
    pd.read_csv('../data/raw/hdb_geospatial.csv')
    .rename(columns={"latitude": "lat", "longitude": "lon"})
)


#### Calculate distance and score

In [25]:
from scipy.spatial.distance import cdist

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in metres between two points given in decimal degrees.

    Only NumPy ufuncs are used, so scalars and broadcast-compatible arrays both work.
    The Earth is modelled as a sphere of radius 6,371,000 m.
    """
    earth_radius_m = 6371000

    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine term: squared half-chord length between the two points
    half_chord_sq = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam / 2)**2
    # Central angle between the points, in radians
    central_angle = 2 * np.arctan2(np.sqrt(half_chord_sq), np.sqrt(1 - half_chord_sq))

    return earth_radius_m * central_angle


def calculate_amenity_score_vectorized(df_unique_hdb_coor, df_amenities):
    """Add a proximity score column for one amenity category to the HDB frame.

    For every HDB location, each amenity within 3000 m contributes
    ``1000 / max(distance_m, min_distance)``; amenities further away (or with
    missing coordinates) contribute 0. The contributions are summed per HDB row.

    Relies on the notebook-level ``haversine`` function and ``min_distance`` constant.

    Parameters
    ----------
    df_unique_hdb_coor : pandas.DataFrame
        Needs ``lat`` and ``lon`` columns. Modified in place: the coordinates are
        cast to float and a ``"<amenity_type>_score"`` column is added.
    df_amenities : pandas.DataFrame
        Needs ``lat``, ``lon`` and ``amenity_type`` columns; the first row's
        ``amenity_type`` names the score column. Not modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df_unique_hdb_coor`` object, so repeated calls accumulate columns.
    """
    amenity_type = df_amenities["amenity_type"].iloc[0]

    df_unique_hdb_coor["lat"] = df_unique_hdb_coor["lat"].astype(float)
    df_unique_hdb_coor["lon"] = df_unique_hdb_coor["lon"].astype(float)

    # Column vectors for HDB points and row vectors for amenities, so that
    # broadcasting yields the full (n_hdb, n_amenity) matrix in one call.
    hdb_lat = df_unique_hdb_coor["lat"].to_numpy()[:, None]
    hdb_lon = df_unique_hdb_coor["lon"].to_numpy()[:, None]
    # Cast on a temporary: writing back into the caller's frame is unnecessary
    # and risks SettingWithCopyWarning when the frame is a slice.
    amenity_lat = df_amenities["lat"].astype(float).to_numpy()[None, :]
    amenity_lon = df_amenities["lon"].astype(float).to_numpy()[None, :]

    # Pairwise distances in metres (rows: HDB, columns: amenities). haversine is
    # built from NumPy ufuncs, so this replaces one Python call per pair.
    distance_matrix = haversine(hdb_lat, hdb_lon, amenity_lat, amenity_lon)

    # Only amenities within 3000 m (3 km) count; NaN distances compare False
    flag_matrix = distance_matrix <= 3000

    # score = 1000 / max(distance, min_distance); the floor prevents extreme values
    scores_matrix = np.where(flag_matrix, 1 * 1000 / np.maximum(distance_matrix, min_distance), 0)

    # Sum across amenities for each HDB location
    df_unique_hdb_coor[f"{amenity_type}_score"] = scores_matrix.sum(axis=1)

    return df_unique_hdb_coor

# Disabled helper kept inside a string literal, so it is never defined or executed.
# As the last expression of the cell, the string itself is echoed as the cell output.
""""
def populate_amenity_score(df_hdb_amenity_unique, df_hdb_flats_amenity, amenity_type):
    df_hdb_flats_amenity[f"{amenity_type}_score"] = df_hdb_flats_amenity["LatLng"].map(df_hdb_amenity_unique.set_index("LatLng")[f"{amenity_type}_score"])
    
    return df_hdb_flats_amenity
"""


'"\ndef populate_amenity_score(df_hdb_amenity_unique, df_hdb_flats_amenity, amenity_type):\n    df_hdb_flats_amenity[f"{amenity_type}_score"] = df_hdb_flats_amenity["LatLng"].map(df_hdb_amenity_unique.set_index("LatLng")[f"{amenity_type}_score"])\n    \n    return df_hdb_flats_amenity\n'

#### Apply the scoring method to each amenity category

In [26]:
# Placeholder only: the name is re-bound by the first scoring call in the next cell.
df_amenity_score = pd.DataFrame()

In [27]:
# Adds "education_score" to df_unique_hdb_coor (mutated in place and returned).
df_amenity_score = calculate_amenity_score_vectorized(df_unique_hdb_coor, df_school) # 337
# df_hdb_flats_amenity = populate_amenity_score(df_temp, df_hdb_flats_amenity, "education")

In [28]:
# Rebuild the combined malls + supermarkets table (same concat as in the Shopping section),
# then add "shopping_score" to df_unique_hdb_coor.
df_shopping = pd.concat([df_mall, df_superm], ignore_index=True)
df_amenity_score = calculate_amenity_score_vectorized(df_unique_hdb_coor, df_shopping) # 155+607
# df_hdb_flats_amenity = populate_amenity_score(df_temp, df_hdb_flats_amenity, "shopping")

In [29]:
# Adds "food_score" (hawker centres) to df_unique_hdb_coor.
df_amenity_score = calculate_amenity_score_vectorized(df_unique_hdb_coor, df_hawkerc) # 125
# df_hdb_flats_amenity = populate_amenity_score(df_temp, df_hdb_flats_amenity, "food")

In [30]:
# Adds "healthcare_score" to df_unique_hdb_coor.
df_amenity_score = calculate_amenity_score_vectorized(df_unique_hdb_coor, df_health) # 1028
# df_hdb_flats_amenity = populate_amenity_score(df_temp, df_hdb_flats_amenity, "healthcare")

In [31]:
# Adds the recreation score column; its name comes from the first row's amenity_type in df_recre.
df_amenity_score = calculate_amenity_score_vectorized(df_unique_hdb_coor, df_recre) # 956
# df_hdb_flats_amenity = populate_amenity_score(df_temp, df_hdb_flats_amenity, "recreation")

In [32]:
# Save output
# df_amenity_score.to_csv(os.path.join("../data/raw", "geospatial+amenity_score.csv"), index=False)

In [33]:
# Inspect the accumulated frame, then keep only the merge keys and the five score columns.
df_amenity_score.info()
df_amenity_score = df_amenity_score[
    [
        "lat",
        "lon",
        "education_score",
        "shopping_score",
        "food_score",
        "healthcare_score",
        "recreation_score",
    ]
]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9623 entries, 0 to 9622
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   street_name           9623 non-null   object 
 1   lat                   9623 non-null   float64
 2   lon                   9623 non-null   float64
 3   nearest_mrt_distance  9623 non-null   float64
 4   nearest_bus_distance  9623 non-null   float64
 5   education_score       9623 non-null   float64
 6   shopping_score        9623 non-null   float64
 7   food_score            9623 non-null   float64
 8   healthcare_score      9623 non-null   float64
 9   recreation_score      9623 non-null   float64
dtypes: float64(9), object(1)
memory usage: 751.9+ KB


In [None]:
df_resale = pd.read_csv('../data/cleaned/resale_price_cleaned_1.csv')

# Attach the amenity scores by exact coordinate match.
# The score table is de-duplicated on the keys first, so a repeated coordinate can
# never multiply resale rows; `validate` makes that many-to-one contract explicit.
df_resale_new = df_resale.merge(
    df_amenity_score.drop_duplicates(subset=['lat', 'lon']),
    on=['lat', 'lon'],
    how='left',
    validate='many_to_one',
)

# The join is on float equality, so report rows that found no match instead of
# letting them pass through silently as NaN scores.
n_unmatched = int(df_resale_new["education_score"].isna().sum())
if n_unmatched:
    print(f"Warning: {n_unmatched} resale rows have no matching amenity scores.")

df_resale_new.to_csv("../data/cleaned/resale_price_cleaned_2.csv", index=False)

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/resale_price_cleaned_1.csv'