# Geocoding HDB Property Info Dataset using OneMap API

## Step 1: Install Required Libraries
Install the necessary Python packages. Skip this cell if they are already installed.

In [3]:
!pip install pandas requests tqdm

Defaulting to user installation because normal site-packages is not writeable
[0m

## Step 2: Import Libraries

In [13]:
import pandas as pd
import requests
from tqdm import tqdm
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

## Step 3: Define Geocoding Function

In [12]:
# Address-search endpoint queried by get_geocode.
# NOTE(review): a later cell in this notebook queries
# https://www.onemap.gov.sg/api/common/elastic/search instead -- confirm which
# endpoint is the current one before relying on this URL.
ONEMAP_API_URL = "https://developers.onemap.sg/commonapi/search"

def requests_retry_session(retries=3, backoff_factor=0.5, status_forcelist=(500, 502, 504)):
    """Build a ``requests.Session`` whose HTTP and HTTPS requests are retried.

    Args:
        retries: Limit used for the total, read and connect retry counters.
        backoff_factor: Backoff factor handed to urllib3's ``Retry``.
        status_forcelist: HTTP status codes that should trigger a retry.

    Returns:
        A new session with one retrying adapter mounted for both schemes.
    """
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
        raise_on_status=False,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    # The same adapter instance serves both URL schemes.
    for scheme in ('http://', 'https://'):
        http_session.mount(scheme, retrying_adapter)
    return http_session


def get_geocode(block, street_name):
    """Look up an address on OneMap (with retries) and return its geocode.

    The search string sent to the API is "<block> <street_name> Singapore".
    Only the first result on the first page is used.

    Args:
        block: Block number of the address.
        street_name: Street name of the address.

    Returns:
        A ``(postal, latitude, longitude)`` tuple read from the first result's
        ``POSTAL``, ``LATITUDE`` and ``LONGITUDE`` fields. All three values are
        empty strings when the response status is not 200, when the API
        returns no results, or when the lookup raises.
    """
    address = f"{block} {street_name} Singapore"
    params = {
        'searchVal': address,
        'returnGeom': 'Y',
        'getAddrDetails': 'Y',
        'pageNum': 1
    }
    try:
        # Use the session as a context manager so its pooled connections are
        # released after every lookup instead of accumulating across calls.
        with requests_retry_session() as session:
            response = session.get(ONEMAP_API_URL, params=params, timeout=5)
            if response.status_code == 200:
                results = response.json().get('results', [])
                if results:
                    result = results[0]
                    return (
                        result.get('POSTAL', ''),
                        result.get('LATITUDE', ''),
                        result.get('LONGITUDE', ''),
                    )
        return '', '', ''
    except Exception as e:
        # Deliberately broad: any failure for one address is reported and
        # turned into empty values rather than propagated to the caller.
        print(f"Error fetching geocode for {address}: {e}")
        return '', '', ''


In [None]:
# Same search endpoint URL as the constant defined in the earlier cell,
# repeated here next to the non-retrying get_geocode variant.
ONEMAP_API_URL = "https://developers.onemap.sg/commonapi/search"

def get_geocode(block, street_name):
    """Look up an address on OneMap with a single request and return its geocode.

    The search string sent to the API is "<block> <street_name> Singapore".
    Only the first result on the first page is used.

    Args:
        block: Block number of the address.
        street_name: Street name of the address.

    Returns:
        A ``(postal, latitude, longitude)`` tuple read from the first result's
        ``POSTAL``, ``LATITUDE`` and ``LONGITUDE`` fields. All three values are
        empty strings when the response status is not 200, when the API
        returns no results, or when the lookup raises.
    """
    address = f"{block} {street_name} Singapore"
    params = {
        'searchVal': address,
        'returnGeom': 'Y',
        'getAddrDetails': 'Y',
        'pageNum': 1
    }
    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # connection, which would freeze a whole batch on a single address.
        response = requests.get(ONEMAP_API_URL, params=params, timeout=5)
        if response.status_code == 200:
            results = response.json().get('results', [])
            if results:
                result = results[0]
                return (
                    result.get('POSTAL', ''),
                    result.get('LATITUDE', ''),
                    result.get('LONGITUDE', ''),
                )
        return '', '', ''
    except Exception as e:
        # Deliberately broad: any failure for one address (including the
        # timeout above) is reported and turned into empty values.
        print(f"Error fetching geocode for {address}: {e}")
        return '', '', ''


## Step 4: Main Function to Process CSV

In [14]:
# def process_csv(input_csv, output_csv):
#     """Reads the input CSV, appends geocode data, and writes to output CSV."""
#     df = pd.read_csv(input_csv)
    
#     # Create empty columns for postal code, latitude, longitude
#     df['postal_code'] = ''
#     df['latitude'] = ''
#     df['longitude'] = ''

#     # Process each row with progress bar
#     for idx, row in tqdm(df.iterrows(), total=len(df), desc="Fetching Geocodes"):
#         postal, lat, lon = get_geocode(row['block'], row['street_name'])
#         df.at[idx, 'postal_code'] = postal
#         df.at[idx, 'latitude'] = lat
#         df.at[idx, 'longitude'] = lon
#         time.sleep(0.5)  # Prevent rate-limiting

#     # Save updated CSV
#     df.to_csv(output_csv, index=False)
#     print(f"Updated CSV saved as {output_csv}")

# Geocoding HDB Property Info Dataset using OneMap API (with Retry Mechanism)

def process_csv(input_csv, output_csv, delay=0.5):
    """Geocode every row of a CSV and write the enriched table to a new CSV.

    Each row's ``block`` and ``street_name`` values are passed to
    ``get_geocode``; the returned values are stored in new ``postal_code``,
    ``latitude`` and ``longitude`` columns. Rows for which no postal code came
    back are listed on stdout after the file has been saved.

    Args:
        input_csv: Path of the CSV to read; it must contain ``block`` and
            ``street_name`` columns.
        output_csv: Path the enriched CSV is written to (without the index).
        delay: Seconds to pause after each lookup to avoid rate-limiting.
            A value of 0 (or less) disables the pause.
    """
    df = pd.read_csv(input_csv)

    # Start with empty string columns so rows without a result stay blank.
    df['postal_code'] = ''
    df['latitude'] = ''
    df['longitude'] = ''

    failed_addresses = []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Fetching Geocodes"):
        block, street_name = row['block'], row['street_name']
        postal, lat, lon = get_geocode(block, street_name)
        df.at[idx, 'postal_code'] = postal
        df.at[idx, 'latitude'] = lat
        df.at[idx, 'longitude'] = lon

        # An empty postal code is what get_geocode returns on any failure.
        if not postal:
            failed_addresses.append(f"{block} {street_name}")

        # time.sleep rejects negative values, so only pause for positive delays.
        if delay and delay > 0:
            time.sleep(delay)

    df.to_csv(output_csv, index=False)
    print(f"Updated CSV saved as {output_csv}")

    if failed_addresses:
        print("The following addresses failed to fetch:")
        for addr in failed_addresses:
            print(addr)


In [20]:
#Import necessary libraries
import requests
import time
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
#Define functions for API calling and geocoding
def call_loop(url_given, attempts=3, delay=5, timeout=10):
    """Request a URL, retrying until it answers with HTTP 200.

    Args:
        url_given: Full URL to fetch with a GET request.
        attempts: Maximum number of requests to make.
        delay: Seconds to wait between consecutive requests.
        timeout: Seconds to wait for each individual request before it is
            counted as a failed attempt.

    Returns:
        The first response whose status code is 200, or ``None`` when every
        attempt failed (the URL and the last status code are printed then).
    """
    response_status_code = None
    for attempt in range(attempts):
        # Pause before every retry, but not before the first request.
        if attempt:
            time.sleep(delay)
        try:
            response = requests.get(url_given, timeout=timeout)
        except requests.RequestException as exc:
            # A network error or timeout counts as a failed attempt and is
            # retried, rather than aborting the caller's whole batch.
            print(f"Request error for URL: {url_given}: {exc}")
            response_status_code = None
            continue
        response_status_code = response.status_code
        # Return as soon as a request succeeds, so a success on the final
        # attempt is never thrown away.
        if response_status_code == 200:
            return response
    print(f"For URL: {url_given}")
    print(f"Response Status Code: {response_status_code}")
    return None

def collect_results(full_add, attempts=3, delay=5):
    """Gather every OneMap search result for an address, across all pages.

    Args:
        full_add: Address text already prepared for use in a URL (spaces
            replaced by '+'); it is spliced into the query string as-is.
        attempts: Forwarded to ``call_loop`` for every request.
        delay: Forwarded to ``call_loop`` for every request.

    Returns:
        A list of result dictionaries. The list is empty when the first
        request gets no response or when the API reports zero matches.
        Follow-up pages that get no response are skipped.
    """
    base_url = (
        "https://www.onemap.gov.sg/api/common/elastic/search?searchVal=+"
        + full_add
        + "+&returnGeom=Y&getAddrDetails=Y"
    )

    first_response = call_loop(base_url, attempts, delay)
    if first_response is None:
        print(f"Exited collect_results function as there is no response for address: {full_add}")
        return []

    payload = first_response.json()
    match_count = payload.get('found', 0)
    page_count = payload.get('totalNumPages', 1)
    if match_count == 0:
        return []

    collected = payload.get('results', [])
    if match_count > 1 and page_count > 1:
        # The first page is already in hand; fetch pages 2..page_count.
        for page_num in range(2, page_count + 1):
            page_response = call_loop(f"{base_url}&pageNum={page_num}", attempts, delay)
            if page_response is None:
                continue
            collected.extend(page_response.json().get('results', []))
    return collected

# Read the input CSV into the module-level DataFrame that is geocoded below.
# The rows are expected to provide 'block' and 'street_name' values, which
# get_geocode_info reads.
df = pd.read_csv('raw.csv')

def get_geocode_info(row):
    """Geocode one row via the OneMap API.

    The address searched for is "<block> <street_name>", built from the row's
    ``block`` and ``street_name`` values. Only the first result returned by
    ``collect_results`` is used (assumed to be the best match).

    Args:
        row: A mapping-like row (e.g. a DataFrame row) with ``block`` and
            ``street_name`` entries.

    Returns:
        A ``pd.Series`` of ``[postal_code, longitude, latitude]``; all three
        are empty strings when no result is found.
    """
    from urllib.parse import quote_plus

    # (Modify this if town or other parts need to be included.)
    full_address = f"{row['block']} {row['street_name']}"
    # quote_plus turns spaces into '+' and also escapes characters such as
    # '&' or '#' that would otherwise corrupt the query string.
    full_address_encoded = quote_plus(full_address)

    results = collect_results(full_address_encoded)
    if not results:
        # No geocoding result found; return empty values.
        return pd.Series(["", "", ""])

    res = results[0]
    postal_code = res.get("POSTAL", "")
    # Read the LONGITUDE/LATITUDE fields (the same ones get_geocode uses)
    # rather than X/Y: OneMap documents X/Y as projected SVY21 coordinates,
    # which are not degrees and would mislabel the longitude/latitude values.
    longitude = res.get("LONGITUDE", "")
    latitude = res.get("LATITUDE", "")
    return pd.Series([postal_code, longitude, latitude])

In [21]:
# Run get_geocode_info on every row (progress_apply shows a tqdm progress bar)
# and store the three returned values in the postal_code, longitude and
# latitude columns, in that order. The CSV is only written after all rows
# have been processed.
df[['postal_code', 'longitude', 'latitude']] = df.progress_apply(get_geocode_info, axis=1)
df.to_csv('geocoded.csv', index=False)
print("Geocoding complete. Results saved to geocoded.csv")


100%|██████████| 199327/199327 [6:54:31<00:00,  8.01it/s]   


Geocoding complete. Results saved to geocoded.csv
