In [2]:
import pandas as pd
import os

In [3]:
# Read the five HDB extracts, discarding the columns that are not used below.
# Extracts 4 and 5 additionally shed their `remaining_lease` column.
_common_drop = ["flat_model", "storey_range"]
_with_lease_drop = [*_common_drop, "remaining_lease"]

one = pd.read_csv("datasets/HDB/1.csv").drop(columns=_common_drop)
two = pd.read_csv("datasets/HDB/2.csv").drop(columns=_common_drop)
three = pd.read_csv("datasets/HDB/3.csv").drop(columns=_common_drop)
four = pd.read_csv("datasets/HDB/4.csv").drop(columns=_with_lease_drop)
five = pd.read_csv("datasets/HDB/5.csv").drop(columns=_with_lease_drop)

# Stack the extracts row-wise; each frame keeps its own index labels, so the
# combined index is not unique.
hdb = pd.concat([one, two, three, four, five])

# hdb['street_name'] = hdb['street_name'].replace({' RD': ' ROAD',
#                                                  ' DR': ' DRIVE',
#                                                  ' AVE': ' AVENUE',
#                                                  ' CRES': ' CRESCENT',
#                                                  ' ST': ' STREET',
#                                                  ' CL': ' CLOSE',
#                                                  'JLN': 'JALAN',
#                                                  'BT ': 'BUKIT ',
#                                                  'UPP ': 'UPPER ',
#                                                  'NTH': 'NORTH',
#                                                  'STH': 'SOUTH',
#                                                  "C'WEALTH": 'COMMONWEALTH'}, regex=True)

# hdb.loc[(hdb.block == '309') & (hdb.street_name == 'ANG MO KIO AVE 1'), 'block'] = '309A'
# Normalise the hyphenated flat-type label to the space-separated spelling.
hdb["flat_type"] = hdb["flat_type"].replace({"MULTI-GENERATION": "MULTI GENERATION"})

# Years of lease left as of 2024 (presumably a 99-year lease term - TODO confirm).
_lease_age = 2024 - hdb["lease_commence_date"]
hdb["remaining lease"] = 99 - _lease_age

# "<block> <street_name>" is the key used for the address lookup and the merge.
hdb["address"] = hdb["block"] + (" " + hdb["street_name"])

In [16]:
# Distinct address strings, in order of first appearance, as a plain list.
addresses = pd.unique(hdb["address"]).tolist()

In [17]:
import asyncio
import aiohttp
from tqdm.asyncio import tqdm

url = "https://www.onemap.gov.sg/api/common/elastic/"


async def get_hdb_data(address, session, progress_bar=None):
    """Look up a single address through the search endpoint at ``url``.

    Args:
        address: Search string, sent as the ``searchVal`` query parameter.
        session: Object whose ``get(url, params=...)`` returns an async
            context manager yielding a response with an awaitable ``json()``
            (e.g. an ``aiohttp.ClientSession``).
        progress_bar: Optional object with an ``update(n)`` method. It is
            advanced by one only when the lookup reaches a final outcome
            (found or not found); failed requests do not advance it, so
            retrying them cannot push the bar past its total.

    Returns:
        dict: one of
            * the first entry of ``results`` plus ``"QUERY": address`` when
              the response reports ``found != 0``;
            * ``{"QUERY": address, "found": 0}`` when nothing was found;
            * ``{"QUERY": address, "ERROR": "REQUEST FAILED"}`` when the
              request or the decoding of its payload raised an exception.
    """
    params = {"searchVal": address, "returnGeom": "Y", "getAddrDetails": "Y"}
    try:
        async with session.get(url, params=params) as response:
            data = await response.json()
        if data["found"] != 0:
            # Keys of the result win over "QUERY" on a clash, as before.
            outcome = {"QUERY": address, **data["results"][0]}
        else:
            outcome = {"QUERY": address, "found": 0}
    except Exception:
        # Deliberately best-effort: any failure (network, non-JSON body,
        # unexpected payload shape) is reported as a retryable error marker.
        return {"QUERY": address, "ERROR": "REQUEST FAILED"}

    # Compare against None instead of relying on truthiness: tqdm bars define
    # __bool__ in terms of their total, which is not what is being asked here.
    if progress_bar is not None:
        progress_bar.update(1)
    return outcome


async def main(addresses, max_retries=10, retry_delay=1.0):
    """Fetch lookup data for every address and return it as a DataFrame.

    All pending addresses are requested concurrently through
    ``get_hdb_data``. Addresses whose request failed (marked with an
    ``"ERROR"`` key) are retried in further rounds, with an exponentially
    growing pause between rounds, until they succeed or the retry budget is
    exhausted.

    Args:
        addresses: Iterable of address strings to look up.
        max_retries: Maximum number of retry rounds after the first attempt.
            Pass ``None`` to retry without limit.
        retry_delay: Pause in seconds before the first retry round; doubled
            for each later round and capped at 30 seconds.

    Returns:
        pandas.DataFrame with one row per address that reached a final
        outcome (found or not found), without the ``found`` column; or
        ``None`` (after printing a message) when no address produced a
        result.
    """
    results = []
    pending = list(addresses)  # private copy; the caller's list is never touched
    retries_done = 0

    async with aiohttp.ClientSession() as session:
        # Context manager guarantees the bar is closed even if a round raises.
        with tqdm(total=len(pending), desc="Fetching data") as progress_bar:
            while pending:
                batch_results = await asyncio.gather(
                    *(
                        get_hdb_data(address, session, progress_bar)
                        for address in pending
                    )
                )

                # Final outcomes (found or not found) are kept; failures are
                # queued for the next round.
                pending = []
                for res in batch_results:
                    if res is None:
                        continue
                    if "ERROR" in res:
                        pending.append(res["QUERY"])
                    else:
                        results.append(res)

                if not pending:
                    break
                if max_retries is not None and retries_done >= max_retries:
                    break  # stop instead of spinning forever on permanent failures
                retries_done += 1
                # Back off before retrying so a struggling endpoint is not
                # immediately hit again with the same burst.
                await asyncio.sleep(min(retry_delay * 2 ** (retries_done - 1), 30))

    if pending:
        print(
            f"Giving up on {len(pending)} address(es) after {retries_done} retries"
        )

    if not results:
        print("No results found")
        return None

    # "found" only exists on not-found rows; when every address resolved the
    # column is absent, so a missing column must not raise.
    return pd.DataFrame(results).drop(columns="found", errors="ignore")

# To run this in a notebook or async environment:
df = await main(addresses)

Fetching data: 9830it [01:22, 119.83it/s]                          


In [23]:
# Persist the lookup table; the frame's index is written as the first column.
df.to_csv("datasets/HDB/addresses.csv", index=True)

In [32]:
# Attach the lookup columns to every row of `hdb`; rows whose address has no
# match in `df` are kept, with the lookup columns left empty (left join).
hdb_complete = pd.merge(
    hdb,
    df,
    how="left",
    left_on="address",
    right_on="QUERY",
)
# The merged frame's index is written as the first column.
hdb_complete.to_csv("datasets/HDB/hdb_complete.csv", index=True)