# HDB Resale Prices — Data Exploration

Mirrors the ETL pipeline flow. Code here is prototyped first, then migrated to `data_operations/`.

**Flow:** Download → Merge → DQC → Separate valid/failed → Transform

In [None]:
import sys
sys.path.insert(0, '..')

import requests
import pandas as pd
import hashlib
from datetime import datetime
from dateutil.relativedelta import relativedelta

from config.CONFIG_hdb_resales_price import API_BASE_URL, DATASET_ID, RAW_DATA_DIR
from config.DQC_hdb_resales_price import DQ_CHECKS, DUPLICATE_CHECK, RESALE_PRICE_OUTLIER_CHECK

## 1. Download Data
Same approach as `fetch_and_save_from_api()` — one call per dataset.

In [None]:
def download(resource_id):
    """Fetch one dataset from the API and return its records as a DataFrame.

    Args:
        resource_id: Identifier of the dataset, sent as the ``resource_id``
            query parameter to ``API_BASE_URL``.

    Returns:
        DataFrame built from ``result.records`` of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or the 30 s timeout.
    """
    # Let requests build/encode the query string instead of formatting it by hand.
    response = requests.get(
        API_BASE_URL,
        params={"resource_id": resource_id},
        timeout=30,
    )
    # Fail loudly on HTTP errors rather than trying to parse an error page.
    response.raise_for_status()
    payload = response.json()
    # NOTE(review): a single call is assumed to return the full dataset;
    # confirm the endpoint does not paginate / cap the number of records.
    return pd.DataFrame(payload["result"]["records"])

# Download every configured dataset first, keyed by its name in DATASET_ID
raw_dfs = {}
for name, rid in DATASET_ID.items():
    raw_dfs[name] = download(rid)

# Then report the size and columns of each downloaded frame
for name, df in raw_dfs.items():
    row_count = len(df)
    column_names = list(df.columns)
    print(f"{name:15s}  {row_count:>7,} rows  cols={column_names}")

## 2. Merge
Same as `merge_raw_files()` — concat all datasets, keep all columns.

In [None]:
# Stack all datasets on top of each other; columns missing from a dataset
# are kept (union of columns) and the row index is renumbered from 0.
frames = list(raw_dfs.values())
df = pd.concat(frames, ignore_index=True, sort=False)

n_rows, n_cols = df.shape
print(f"Merged: {n_rows:,} rows, {n_cols} columns")
df.head()

In [None]:
# Quick overview of the merged frame (columns, non-null counts, dtypes)
df.info()

## 3. DQC (data quality checks) — same checks as the DAG

Each check returns a boolean mask (True = the row failed that check). The masks are summed into a per-row `fail_sum`, so `fail_sum == 0` means the row passed every check.
In production, each check runs as a separate Airflow task that writes its 0/1 result to its own file.

In [None]:
# Check functions (mirrors data_operations/validate.py)

def check_null(df, column):
    """Return a boolean mask that is True where ``column`` is missing."""
    values = df[column]
    return values.isna()

def check_categorical(df, column, allowed_values):
    """Return a boolean mask that is True where ``column`` is not an allowed value."""
    is_allowed = df[column].isin(allowed_values)
    return ~is_allowed

def check_string_format(df, column, pattern):
    """Return a boolean mask that is True where ``column`` does not match ``pattern``.

    Values are converted to text first; the regex is matched from the start
    of each value.
    """
    as_text = df[column].astype(str)
    matches = as_text.str.match(pattern, na=False)
    return ~matches

def check_date_format(df, column, fmt):
    """Return a boolean mask that is True where ``column`` cannot be parsed with ``fmt``."""
    as_text = df[column].astype(str)
    # errors="coerce" turns unparseable values into NaT, which isna() then flags.
    parsed = pd.to_datetime(as_text, format=fmt, errors="coerce")
    return parsed.isna()

def check_duplicates(df, key_columns=None):
    """Flag duplicate rows, keeping the highest-priced row of each group.

    Rows are duplicates of each other when they share the same values in
    ``key_columns``. Within each group the row with the highest
    ``resale_price`` is kept (mask False); all others are flagged (mask True).

    Args:
        df: Frame containing ``resale_price`` and the key columns.
        key_columns: Columns that identify a record. Defaults to every
            column except ``resale_price``.

    Returns:
        Boolean Series sorted by ``df``'s index; True = row failed the check.
    """
    if key_columns is None:
        key_columns = [c for c in df.columns if c != "resale_price"]

    # Rank by *numeric* price: prices may still be strings when this check
    # runs, and a string sort would rank "99000" above "100000".
    price = pd.to_numeric(df["resale_price"], errors="coerce")
    # mergesort is stable, so ties keep their original order and the result
    # is deterministic; unparseable prices (NaN) sort last and are never kept
    # in preference to a real price.
    order = (
        price.reset_index(drop=True)
        .sort_values(ascending=False, kind="mergesort")
        .index
    )
    is_duplicate = df.iloc[order].duplicated(subset=key_columns, keep="first")
    return is_duplicate.sort_index()

def check_resale_price_outlier(df, column, threshold_pct, group_by):
    """Return a boolean mask that is True where ``column`` is far from its group mean.

    A value passes when it lies within ``threshold_pct`` (a fraction, e.g.
    0.5 for ±50%) of the mean of its ``group_by`` group, bounds inclusive.
    """
    baseline = df.groupby(group_by)[column].transform("mean")
    lower_bound = baseline * (1 - threshold_pct)
    upper_bound = baseline * (1 + threshold_pct)
    within_band = df[column].between(lower_bound, upper_bound)
    return ~within_band

In [None]:
# DQC Group 1: run all DQ_CHECKS from config
# In the DAG each (check_type, column) pair becomes one task

# Dispatch table: check type -> function returning a failure mask.
# The null check takes no extra parameters, so any supplied are ignored.
CHECKS_BY_TYPE = {
    "null": lambda frame, column, **_unused: check_null(frame, column),
    "categorical": check_categorical,
    "string_format": check_string_format,
    "date_format": check_date_format,
}

# Validate the config before touching the data: an unknown check type would
# otherwise either crash mid-run or silently re-apply the previous mask.
unknown_types = [t for t in DQ_CHECKS if t not in CHECKS_BY_TYPE]
if unknown_types:
    raise ValueError(f"Unsupported DQ check type(s): {unknown_types}")

df["fail_sum"] = 0

for check_type, check_config in DQ_CHECKS.items():
    run_check = CHECKS_BY_TYPE[check_type]

    # A list means "these columns, no parameters"; a dict maps column -> kwargs.
    if isinstance(check_config, list):
        items = [(col, {}) for col in check_config]
    else:
        items = check_config.items()

    for column, params in items:
        mask = run_check(df, column, **params)

        fails = mask.sum()
        df["fail_sum"] += mask.astype(int)
        print(f"{check_type:15s} | {column:25s} | {fails:6,} fails")

In [None]:
# DQC Group 2: duplicates
# Hide the notebook-only `fail_sum` bookkeeping column from the check so it
# cannot become part of the default duplicate key (every column except
# resale_price). The returned mask still aligns with df by index.
mask = check_duplicates(df.drop(columns=["fail_sum"]), **DUPLICATE_CHECK)
dup_fails = mask.sum()
df["fail_sum"] += mask.astype(int)
print(f"Duplicates: {dup_fails:,} rows flagged")

In [None]:
# DQC Group 3: resale price outlier
# Group means need numeric prices; values that cannot be parsed become NaN.
numeric_price = pd.to_numeric(df["resale_price"], errors="coerce")
df["resale_price"] = numeric_price

mask = check_resale_price_outlier(df, **RESALE_PRICE_OUTLIER_CHECK)
outlier_fails = mask.sum()
df["fail_sum"] = df["fail_sum"] + mask.astype(int)
print(f"Price outliers: {outlier_fails:,} rows flagged")

## 4. Separate Valid / Non-valid
Same as `separate_valid_failed()` — rows with `fail_sum > 0` failed at least one check.

In [None]:
# Split on the accumulated failure count: zero failures = valid row.
passed_all = df["fail_sum"].eq(0)
failed_any = df["fail_sum"].gt(0)

# Valid rows no longer need the counter; non-valid rows keep it for inspection.
df_valid = df.loc[passed_all].drop(columns=["fail_sum"])
df_non_valid = df.loc[failed_any]

print(f"Valid:     {len(df_valid):,}")
print(f"Non-valid: {len(df_non_valid):,}")

df_valid.head()

In [None]:
# Inspect non-valid rows: show the first 20, including their `fail_sum`
# (how many checks each row failed)
df_non_valid.head(20)

## 5. Transform
Same three transformations as `data_operations/transform.py`.

In [None]:
# Transformation 1: Remaining lease
reference_date = datetime.now()


def _remaining_lease_text(commence_year):
    """Format the lease time left as of ``reference_date``.

    The lease is taken to start on 1 January of ``commence_year`` and to run
    for 99 years; an expired lease is reported as "0 years 0 months".
    """
    lease_end = datetime(int(commence_year), 1, 1) + relativedelta(years=99)
    time_left = relativedelta(lease_end, reference_date)
    return "{} years {} months".format(
        max(time_left.years, 0),
        max(time_left.months, 0),
    )


df_valid["remaining_lease"] = df_valid["lease_commence_date"].apply(_remaining_lease_text)
df_valid[["lease_commence_date", "remaining_lease"]].head()

In [None]:
# Transformation 2: Resale identifier
# Layout: "S" + 3 block digits + 2 leading digits of the group's average
# price + 2-digit month + first letter of the town.
df_valid["resale_price"] = pd.to_numeric(df_valid["resale_price"])

# Average price of each (month, town, flat_type) group, aligned row by row.
avg_price = df_valid.groupby(["month", "town", "flat_type"])["resale_price"].transform("mean")

# Digits of the block number only, first three, left-padded with zeros.
block_part = (
    df_valid["block"]
    .astype(str)
    .str.replace(r"\D", "", regex=True)
    .str[:3]
    .str.zfill(3)
)
price_part = avg_price.astype(int).astype(str).str[:2].str.zfill(2)
month_part = pd.to_datetime(df_valid["month"], format="%Y-%m").dt.strftime("%m")
town_part = df_valid["town"].str.strip().str[0].str.upper()

df_valid["resale_identifier"] = "S" + block_part + price_part + month_part + town_part

unique_ids = df_valid["resale_identifier"].nunique()
print(f"Unique identifiers: {unique_ids:,} / {len(df_valid):,} rows")
df_valid[["block", "month", "town", "resale_price", "resale_identifier"]].head()

In [None]:
# Transformation 3: Hash identifier
def _sha256_hex(text):
    """Return the hexadecimal SHA-256 digest of ``text``."""
    return hashlib.sha256(text.encode()).hexdigest()


df_valid["resale_identifier_hash"] = df_valid["resale_identifier"].apply(_sha256_hex)

unique_hashes = df_valid["resale_identifier_hash"].nunique()
print(f"Unique hashes: {unique_hashes:,}")
df_valid[["resale_identifier", "resale_identifier_hash"]].head()

In [None]:
# Final outputs: one frame with the plain identifier, one with only its hash
df_transformed = df_valid.drop(columns="resale_identifier_hash")
df_hashed = df_valid.drop(columns="resale_identifier")

for label, frame in (("transformed:", df_transformed), ("hashed:     ", df_hashed)):
    print(label, frame.shape)