In [1]:
# Getting latest HDB resale prices from the API version of this dataset: https://data.gov.sg/collections/189/view

import requests

dataset_id = "d_8b84c4ee58e3cfc0ece0d773c8ca6abc"  # make sure this is really a CKAN resource_id
base_url = "https://data.gov.sg/api/action/datastore_search"

all_records = []
offset = 0
limit = 1000  

while True:
    params = {
        "resource_id": dataset_id,
        "limit": limit,
        "offset": offset,
    }
    print("Requesting", base_url, params)
    resp = requests.get(base_url, params=params)
    data = resp.json()

    # grab this page of rows
    records = data["result"]["records"]

    if not records:
        break  # no more rows

    all_records.extend(records)

    # move to next page
    offset += limit

print("Total rows:", len(all_records))

Requesting https://data.gov.sg/api/action/datastore_search {'resource_id': 'd_8b84c4ee58e3cfc0ece0d773c8ca6abc', 'limit': 1000, 'offset': 0}
Requesting https://data.gov.sg/api/action/datastore_search {'resource_id': 'd_8b84c4ee58e3cfc0ece0d773c8ca6abc', 'limit': 1000, 'offset': 1000}
Requesting https://data.gov.sg/api/action/datastore_search {'resource_id': 'd_8b84c4ee58e3cfc0ece0d773c8ca6abc', 'limit': 1000, 'offset': 2000}
Requesting https://data.gov.sg/api/action/datastore_search {'resource_id': 'd_8b84c4ee58e3cfc0ece0d773c8ca6abc', 'limit': 1000, 'offset': 3000}
Requesting https://data.gov.sg/api/action/datastore_search {'resource_id': 'd_8b84c4ee58e3cfc0ece0d773c8ca6abc', 'limit': 1000, 'offset': 4000}
Requesting https://data.gov.sg/api/action/datastore_search {'resource_id': 'd_8b84c4ee58e3cfc0ece0d773c8ca6abc', 'limit': 1000, 'offset': 5000}
Requesting https://data.gov.sg/api/action/datastore_search {'resource_id': 'd_8b84c4ee58e3cfc0ece0d773c8ca6abc', 'limit': 1000, 'offset': 6

In [None]:
import pandas as pd
import numpy as np

# -------------------------------------------------------------------
# 1. Turn API records into a DataFrame and clean basic columns
#    (assumes `all_records` is already populated from the API)
# -------------------------------------------------------------------
df = pd.DataFrame(all_records)

# Make sure key numeric fields are numeric
for col in ["resale_price", "floor_area_sqm", "lease_commence_date"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Parse month into a proper datetime
df["month"] = pd.to_datetime(df["month"], errors="coerce")

# Drop rows with missing essential fields
df = df.dropna(subset=["resale_price", "floor_area_sqm", "lease_commence_date", "month"])

# -------------------------------------------------------------------
# 2. Helper: z-score within a group
# -------------------------------------------------------------------
def add_group_zscore(frame, group_cols, value_col, new_col):
    """
    Adds a z-score column based on mean and std within each group.
    """
    grouped = frame.groupby(group_cols, observed=False)[value_col]
    mean = grouped.transform("mean")
    std = grouped.transform("std")

    # avoid division by zero
    frame[new_col] = (frame[value_col] - mean) / std.replace(0, np.nan)
    return frame

# -------------------------------------------------------------------
# 3. PRICE SHOCK: flat vs similar flats nearby
#    (town + flat_type as the "similar flats" universe)
# -------------------------------------------------------------------
df = add_group_zscore(
    df,
    group_cols=["town", "flat_type"],
    value_col="resale_price",
    new_col="z_town_flat"
)

# -------------------------------------------------------------------
# 4. OUTLIER JUMP: flat vs its own block history
#    (town + block + street_name + flat_type)
# -------------------------------------------------------------------
df = add_group_zscore(
    df,
    group_cols=["town", "block", "street_name", "flat_type"],
    value_col="resale_price",
    new_col="z_block"
)

# -------------------------------------------------------------------
# 5. MARKET DEFIER: high-priced sale in a cooling month
#    - Get month-on-month change in overall median price
# -------------------------------------------------------------------
monthly_median = (
    df.groupby("month", observed=False)["resale_price"]
      .median()
      .sort_index()
)

monthly_change = monthly_median.pct_change()

# Map month-level % change back to each row
df = df.sort_values("month")
df["mo_change"] = df["month"].map(monthly_change)

# Cooling strength: only when market is going down
df["cooling_strength"] = np.where(df["mo_change"] < 0, -df["mo_change"], 0)

# -------------------------------------------------------------------
# 6. UNEXPLAINABLE SPIKE: residual after controlling for
#    floor area, age, town, and flat type (very rough model)
# -------------------------------------------------------------------
# Age of flat at point of resale (approx year-based)
df["year"] = df["month"].dt.year
df["age_years"] = df["year"] - df["lease_commence_date"]

# Bin size and age to get coarse "expected price" groups
df["size_bin"] = pd.cut(
    df["floor_area_sqm"],
    bins=[0, 40, 60, 80, 100, 130, 200],
    include_lowest=True
)

df["age_bin"] = pd.cut(
    df["age_years"],
    bins=[0, 10, 20, 30, 40, 50, 60, 80, 120],
    include_lowest=True
)

# Expected price = median within (town, flat_type, size_bin, age_bin)
group_cols_model = ["town", "flat_type", "size_bin", "age_bin"]
grouped_model = df.groupby(group_cols_model, observed=False)["resale_price"]

df["expected_price"] = grouped_model.transform("median")

# Residual = actual - expected
df["price_residual"] = df["resale_price"] - df["expected_price"]

# z-score of residual within the same model group
resid_group = df.groupby(group_cols_model, observed=False)["price_residual"]
resid_std = resid_group.transform("std").replace(0, np.nan)
df["z_residual"] = df["price_residual"] / resid_std

# -------------------------------------------------------------------
# 7. Turn each editorial dimension into a 0–1 score
#    (so we can blend them into a 0–100 WTF meter)
# -------------------------------------------------------------------
def norm01(series, q=0.95, min_nonzero=20):
    """
    Normalise a series to 0–1 using the 95th percentile as "max".
    Negative values are clipped at 0.

    FIXED VERSION:
    - If the 95th percentile is zero (common when most values are 0),
      fall back to using the max.
    - Ensures the score has a usable distribution (not all 0s or 1s).
    """
    s = series.clip(lower=0)

    # If everything is zero → all zeros
    if (s > 0).sum() == 0:
        return s * 0

    # Try percentile only if enough non-zero values
    if (s > 0).sum() >= min_nonzero:
        qv = s.quantile(q)
    else:
        qv = 0  # force fallback

    # If percentile fails, fall back to max
    if qv <= 0 or np.isnan(qv):
        max_val = s.max()
        if max_val == 0:
            return s * 0
        return (s / max_val).clip(0, 1)

    # Normal case
    return (s / qv).clip(0, 1)

# 7a. PRICE SHOCK: how far above similar flats nearby
df["price_shock_raw"] = df["z_town_flat"].clip(lower=0)
df["price_shock_score"] = norm01(df["price_shock_raw"])

# 7b. OUTLIER JUMP: how far above other sales in the same block
df["outlier_jump_raw"] = df["z_block"].clip(lower=0)
df["outlier_jump_score"] = norm01(df["outlier_jump_raw"])

# 7c. MARKET DEFIER
df["market_defier_raw"] = df["cooling_strength"] * df["price_shock_raw"]
df["market_defier_score"] = norm01(df["market_defier_raw"])

# 7d. UNEXPLAINABLE SPIKE
df["unexplainable_raw"] = df["z_residual"].clip(lower=0)
df["unexplainable_score"] = norm01(df["unexplainable_raw"])

# -------------------------------------------------------------------
# 8. Blend into a single WTF score (0–100)
#    Editorial weights:
#    - Price Shock      → 35%
#    - Outlier Jump     → 25%
#    - Market Defier    → 15%
#    - Unexplainable    → 25%
# -------------------------------------------------------------------
df["wtf_score"] = (
    df["price_shock_score"] * 0.35 +
    df["outlier_jump_score"] * 0.25 +
    df["market_defier_score"] * 0.15 +
    df["unexplainable_score"] * 0.25
) * 100

df["wtf_score"] = df["wtf_score"].round(1)

# -------------------------------------------------------------------
# 9. Bucket into editorial WTF meter bands
# -------------------------------------------------------------------
def wtf_bucket(score):
    if score < 90:
        return "Mildly interesting"
    elif score < 95:
        return "Hmm, okay, someone wanted this"
    elif score < 99.9:
        return "Valuers stared at the ceiling for a bit"
    else:
        return "Full-blown unicorn sale ripe for public debate"

df["wtf_bucket"] = df["wtf_score"].apply(wtf_bucket)

# -------------------------------------------------------------------
# 10. Filter to WTF flats in the latest rolling month
# -------------------------------------------------------------------
WTF_THRESHOLD = 90  # tweak to taste

wtf_flats = df[df["wtf_score"] >= WTF_THRESHOLD].copy()

latest_month = df["month"].max()
cutoff_month = latest_month - pd.DateOffset(months=1)

recent_wtf_flats = wtf_flats[wtf_flats["month"] >= cutoff_month].copy()

In [37]:
# create a new address column 

recent_wtf_flats["address"] = (
    recent_wtf_flats["block"].astype(str).str.strip() + " " +
    recent_wtf_flats["street_name"].astype(str).str.strip()
)

In [38]:
# Geocode the addresses

import geocoder

# Create empty columns
recent_wtf_flats["lat"] = None
recent_wtf_flats["lng"] = None

for idx, addr in recent_wtf_flats["address"].items():
    query = f"{addr}, Singapore"
    print("Geocoding:", query)

    g = geocoder.arcgis(query)

    if g.ok and g.latlng:
        recent_wtf_flats.at[idx, "lat"] = g.latlng[0]
        recent_wtf_flats.at[idx, "lng"] = g.latlng[1]
    else:
        print("❗Geocoding failed for:", query)

Geocoding: 54 MARINE TER, Singapore
Geocoding: 107 ANG MO KIO AVE 4, Singapore
Geocoding: 14 MARINE TER, Singapore
Geocoding: 424B YISHUN AVE 11, Singapore
Geocoding: 66 MARINE DR, Singapore
Geocoding: 773 YISHUN AVE 3, Singapore
Geocoding: 438C BT BATOK WEST AVE 8, Singapore
Geocoding: 118 ANG MO KIO AVE 4, Singapore
Geocoding: 30 GHIM MOH LINK, Singapore
Geocoding: 359 YISHUN RING RD, Singapore
Geocoding: 459 YISHUN AVE 11, Singapore
Geocoding: 740 YISHUN AVE 5, Singapore
Geocoding: 30 HOLLAND CL, Singapore
Geocoding: 510 ANG MO KIO AVE 8, Singapore
Geocoding: 23 GHIM MOH LINK, Singapore
Geocoding: 32 GHIM MOH LINK, Singapore
Geocoding: 409 PASIR RIS DR 6, Singapore
Geocoding: 675B YISHUN AVE 4, Singapore
Geocoding: 634 PASIR RIS DR 1, Singapore
Geocoding: 89 DAWSON RD, Singapore
Geocoding: 530C PASIR RIS DR 1, Singapore
Geocoding: 273A BISHAN ST 24, Singapore
Geocoding: 700 PASIR RIS DR 10, Singapore
Geocoding: 530B PASIR RIS DR 1, Singapore
Geocoding: 23 GHIM MOH LINK, Singapore
Ge

In [26]:
recent_wtf_flats.to_csv('wtf_flats.csv', index=False)