# HDB Resale Flat Prices - Data Exploration

This notebook documents the exploratory data analysis process that informed the ETL pipeline design.

**Exploration Flow:**
1. Download data from API
2. Explore data structure and quality
3. Profile columns to understand distributions
4. Design and test validation rules
5. Develop and test transformations
6. Migrate working code to `data_operations/`

In [None]:
# Setup
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from datetime import datetime
from dateutil.relativedelta import relativedelta
import hashlib
import re

# Notebook-wide display defaults: show every column, cap printed rows at 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Shared matplotlib theme for all figures below.
plt.style.use('seaborn-v0_8-darkgrid')

print("Setup complete!")

## 1. Download Data from API

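Test the API connection and data retrieval using a small sample (the first 1,000 records of the first resource). All later sections explore this sample, not the full dataset.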

In [None]:
# Endpoint of the datastore search API and the resources to pull from it.
API_BASE_URL = "https://data.gov.sg/api/action/datastore_search"

# Resource id per covered period; insertion order fixes the order of RESOURCE_IDS.
_RESOURCE_ID_BY_PERIOD = {
    "2012-2014": "1b702208-44bf-4829-b620-4615ee19b57c",
    "2015-2016": "83b2fc37-ce8c-4df4-968b-370fd818138b",
}
RESOURCE_IDS = list(_RESOURCE_ID_BY_PERIOD.values())

print(f"Will fetch from {len(RESOURCE_IDS)} resources")

In [None]:
# Test fetching from first resource (limited records)
test_params = {
    "resource_id": RESOURCE_IDS[0],
    "limit": 1000,
    "offset": 0
}

response = requests.get(API_BASE_URL, params=test_params, timeout=30)
# Surface HTTP-level failures (4xx/5xx) before trying to parse the body as JSON.
response.raise_for_status()
data = response.json()

# Stop here with the API payload in the message. Merely printing the error
# would leave `df_test` undefined and make every later cell fail with a
# confusing NameError.
if not data.get("success"):
    raise RuntimeError(f"Error fetching data: {data}")

records = data["result"]["records"]
df_test = pd.DataFrame(records)

# NOTE(review): the datastore API appears to deliver numeric fields as JSON
# strings - confirm against a live response. Coercing is a no-op when the
# columns are already numeric; unparseable values become NaN and therefore
# show up in the null check below.
for numeric_col in ("resale_price", "lease_commence_date"):
    if numeric_col in df_test.columns:
        df_test[numeric_col] = pd.to_numeric(df_test[numeric_col], errors="coerce")

print(f"✓ Fetched {len(df_test)} records successfully")
print(f"\nColumns: {list(df_test.columns)}")

# Only the final top-level expression of a cell is rendered, so the preview
# has to live here rather than inside an `if` block.
df_test.head()

## 2. Explore Data Structure

Understand the schema, data types, and basic statistics

In [None]:
# Data types and info
# Prints, per column, the dtype and non-null count (plus overall memory usage).
# Useful for spotting fields that were loaded as generic `object` rather than
# as numbers.
df_test.info()

In [None]:
# Missing values per column, as an absolute count and as a share of all rows
null_counts = df_test.isna().sum()
null_pct = null_counts.div(len(df_test)).mul(100).round(2)
pd.DataFrame({'Null Count': null_counts, 'Null %': null_pct})

In [None]:
# Profile every column: dtype and cardinality first, then either the value
# counts (low-cardinality text) or summary statistics (numeric).
separator = '=' * 80
for col, series in df_test.items():
    n_unique = series.nunique()

    print(f"\n{separator}")
    print(f"Column: {col}")
    print(f"Type: {series.dtype}")
    print(f"Unique values: {n_unique}")

    is_small_text_column = series.dtype == 'object' and n_unique <= 20
    if is_small_text_column:
        print("\nValue counts:")
        print(series.value_counts())
    elif pd.api.types.is_numeric_dtype(series):
        print("\nStatistics:")
        print(series.describe())

## 3. Design Validation Rules

Based on exploration, determine what validation rules to apply

In [None]:
# Collect the distinct values of each categorical field; these become the
# allowed-value lists for validation.
categorical_fields = ['town', 'flat_type', 'flat_model', 'storey_range']

# Skip any field the sample does not actually contain.
available_fields = [name for name in categorical_fields if name in df_test.columns]

for field in available_fields:
    unique_vals = sorted(df_test[field].dropna().unique())
    print(f"\n{field}: {len(unique_vals)} unique values")
    print(unique_vals)

In [None]:
# Earliest and latest `month` value present in the sample
if 'month' in df_test.columns:
    month_values = df_test['month']
    print(f"Date range: {month_values.min()} to {month_values.max()}")

## 4. Test Duplicate Detection

Check if duplicates exist and test deduplication logic

In [None]:
# Define composite key: every descriptive column except the price.
#
# `_id` must be excluded as well. CKAN-style `datastore_search` responses carry
# an `_id` row number that differs on every record, so keeping it in the key
# would make every row unique and duplicate detection could never fire.
# NOTE(review): confirm `_id` is present in the live response; excluding it is
# harmless when it is absent.
NON_KEY_COLUMNS = {'resale_price', '_id'}

key_columns = [col for col in df_test.columns if col not in NON_KEY_COLUMNS]
print(f"Composite key columns ({len(key_columns)}): {key_columns}")

In [None]:
# Check for duplicates
# keep=False flags every member of a duplicate group, not only the repeats.
duplicates = df_test.duplicated(subset=key_columns, keep=False)
num_duplicates = duplicates.sum()
print(f"Found {num_duplicates} duplicate records")

if num_duplicates > 0:
    print("\nSample duplicates:")

# A DataFrame expression inside an `if` block is evaluated but never rendered.
# Only the last top-level expression of a cell is shown, so the sample is
# emitted here; `None` renders nothing when there are no duplicates.
df_test[duplicates].head(10) if num_duplicates > 0 else None

In [None]:
# Deduplicate on the composite key, keeping the highest-priced record.
# Sorting by price (descending) first means keep='first' retains the top price.
df_sorted = df_test.sort_values(by='resale_price', ascending=False)
df_deduped = df_sorted.drop_duplicates(subset=key_columns, keep='first')

rows_before = len(df_test)
rows_after = len(df_deduped)
print(f"Original: {rows_before} records")
print(f"After dedup: {rows_after} records")
print(f"Removed: {rows_before - rows_after} records")

## 5. Test Outlier Detection

Test different methods for detecting anomalous prices

In [None]:
# Visualize price distribution: histogram (left) and per-flat-type boxplot (right)
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(14, 5))

df_test['resale_price'].hist(bins=50, ax=ax_hist)
ax_hist.set_title('Resale Price Distribution')
ax_hist.set_xlabel('Price')
ax_hist.set_ylabel('Frequency')

# The target axes must be passed explicitly. Without `ax`, boxplot(by=...)
# opens a figure of its own, leaving the right-hand panel here empty and the
# boxplot detached from the histogram.
df_test.boxplot(column='resale_price', by='flat_type', ax=ax_box)
ax_box.set_title('Price by Flat Type')
# Remove the automatic "Boxplot grouped by ..." heading pandas adds to the figure.
fig.suptitle('')

fig.tight_layout()
plt.show()

In [None]:
# IQR rule: flag prices more than 1.5 * IQR outside the middle 50% of the data
resale_prices = df_test['resale_price']

Q1, Q3 = resale_prices.quantile(0.25), resale_prices.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

iqr_outliers = resale_prices.lt(lower_bound) | resale_prices.gt(upper_bound)

n_iqr_outliers = iqr_outliers.sum()
iqr_outlier_pct = n_iqr_outliers / len(df_test) * 100
print(f"IQR Method: {n_iqr_outliers} outliers ({iqr_outlier_pct:.2f}%)")
print(f"Bounds: {lower_bound:,.0f} - {upper_bound:,.0f}")

In [None]:
# Z-score rule: flag prices more than 3 standard deviations from the mean
price_series = df_test['resale_price']
z_scores = ((price_series - price_series.mean()) / price_series.std()).abs()
z_outliers = z_scores > 3

n_z_outliers = z_outliers.sum()
z_outlier_pct = n_z_outliers / len(df_test) * 100
print(f"Z-Score Method: {n_z_outliers} outliers ({z_outlier_pct:.2f}%)")

## 6. Test Transformations

### 6.1 Remaining Lease Calculation

In [None]:
# Test remaining lease calculation
def calculate_remaining_lease(lease_commence_date, reference_date=None):
    """Return the lease time left as a "<years> years <months> months" string.

    The lease is taken to start on 1 January of ``lease_commence_date`` and to
    run for 99 years.

    Args:
        lease_commence_date: Year the lease started; anything ``int()`` accepts.
        reference_date: ``datetime`` to measure from. Defaults to
            ``datetime.now()`` at call time.

    Returns:
        The formatted remaining lease, or "0 years 0 months" once the lease
        has ended.
    """
    if reference_date is None:
        reference_date = datetime.now()

    lease_start = datetime(year=int(lease_commence_date), month=1, day=1)
    lease_end = lease_start + relativedelta(years=99)

    # Compare the dates directly instead of testing `years < 0`: a lease that
    # expired less than a year ago has years == 0 with *negative* months, and
    # would otherwise be reported as e.g. "0 years -3 months".
    if reference_date >= lease_end:
        return "0 years 0 months"

    remaining = relativedelta(lease_end, reference_date)
    return f"{remaining.years} years {remaining.months} months"

# Smoke-test the calculation on a handful of commencement years
test_cases = [1980, 1990, 2000, 2010]
for year in test_cases:
    print(f"Lease started {year}: {calculate_remaining_lease(year)}")

In [None]:
# Derive `remaining_lease` for every row (adds the column to df_test in place)
has_lease_year = 'lease_commence_date' in df_test.columns
if has_lease_year:
    df_test['remaining_lease'] = df_test['lease_commence_date'].apply(calculate_remaining_lease)
    lease_preview = df_test[['lease_commence_date', 'remaining_lease']].head(10)
    print(lease_preview)

### 6.2 Resale Identifier Creation

In [None]:
# Helper functions for identifier creation
def extract_block_digits(block):
    """Return the first three digits found in ``block``, zero-padded on the left.

    Non-digit characters are discarded first; a value with no digits at all
    yields "000".
    """
    numeric_part = re.sub(r'\D', '', str(block))
    return numeric_part[:3].zfill(3) if numeric_part else "000"

def extract_price_digits(average_price):
    """Return the two leading digits of the integer part of ``average_price``.

    Values below 10 are zero-padded to two characters. Missing or
    non-numeric input (None, NaN, infinity, unparseable text) yields "00",
    in line with the sentinel values the other identifier helpers return
    instead of raising.
    """
    try:
        price_str = str(int(average_price))
    except (TypeError, ValueError, OverflowError):
        # int() raises TypeError for None, ValueError for NaN or bad text,
        # and OverflowError for infinity.
        return "00"

    if len(price_str) >= 2:
        return price_str[:2]
    return price_str.zfill(2)

def extract_month_digits(month_str):
    """Return the two-digit month from a "YYYY-MM" string.

    Returns "00" when ``month_str`` cannot be parsed, including non-string
    input such as None or NaN.
    """
    try:
        date_obj = datetime.strptime(month_str, "%Y-%m")
    except (TypeError, ValueError):
        # strptime raises ValueError for a malformed string but TypeError for
        # a non-string (None, NaN); both count as "no usable month".
        return "00"
    return date_obj.strftime("%m")

def get_first_char_town(town):
    """Return the upper-cased first character of ``town``.

    Leading and trailing whitespace is ignored. Returns "X" when there is no
    usable name: a falsy value, NaN, or a whitespace-only string.
    """
    # NaN is a truthy float, so it needs its own test (NaN != NaN).
    is_nan = isinstance(town, float) and town != town
    if not town or is_nan:
        return "X"

    cleaned = str(town).strip().upper()
    # A whitespace-only name strips down to "", which has no first character.
    return cleaned[0] if cleaned else "X"

# Exercise each identifier helper on a few representative inputs
print("Block tests:")
for sample_block in ('19', '123A', '5'):
    print(f"  '{sample_block}' → '{extract_block_digits(sample_block)}'")

print("\nPrice tests:")
for sample_price in (230000, 450000):
    print(f"  {sample_price} → '{extract_price_digits(sample_price)}'")

print("\nMonth tests:")
for sample_month in ('2012-01', '2016-12'):
    print(f"  '{sample_month}' → '{extract_month_digits(sample_month)}'")

print("\nTown tests:")
for sample_town in ('Ang Mo Kio', 'Bedok'):
    print(f"  '{sample_town}' → '{get_first_char_town(sample_town)}'")

In [None]:
# Mean resale price for every (month, town, flat_type) combination
avg_prices = (
    df_test
    .groupby(['month', 'town', 'flat_type'])['resale_price']
    .mean()
    .reset_index(name='avg_resale_price')
)

print(f"Calculated {len(avg_prices)} average price groups")
avg_prices.head()

In [None]:
# Attach each row's group average, then assemble the identifier from its parts
group_keys = ['month', 'town', 'flat_type']
df_with_avg = df_test.merge(avg_prices, how='left', on=group_keys)


def _build_resale_identifier(row):
    """Join the identifier parts for one row: "S", block, price, month, town."""
    parts = [
        "S",
        extract_block_digits(row['block']),
        extract_price_digits(row['avg_resale_price']),
        extract_month_digits(row['month']),
        get_first_char_town(row['town']),
    ]
    return "".join(parts)


df_with_avg['resale_identifier'] = df_with_avg.apply(_build_resale_identifier, axis=1)

print("Sample identifiers:")
df_with_avg[['block', 'month', 'town', 'avg_resale_price', 'resale_identifier']].head(10)

In [None]:
# Compare the number of distinct identifiers with the number of rows
unique_ids = df_with_avg['resale_identifier'].nunique()
total_records = len(df_with_avg)
print(f"Unique identifiers: {unique_ids}")
print(f"Total records: {total_records}")

repeated_count = total_records - unique_ids
if repeated_count == 0:
    print("✓ All identifiers are unique!")
else:
    print(f"⚠ {repeated_count} duplicate identifiers")

### 6.3 SHA-256 Hashing

In [None]:
# Test hashing
def hash_sha256(identifier):
    """Return the SHA-256 hex digest of ``identifier`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(identifier.encode('utf-8'))
    return hasher.hexdigest()

# Hash the same identifier twice; both digests must match
test_id = "S0192301A"
hash1, hash2 = hash_sha256(test_id), hash_sha256(test_id)
is_deterministic = hash1 == hash2

print(f"Identifier: {test_id}")
print(f"Hash 1: {hash1}")
print(f"Hash 2: {hash2}")
print(f"\nDeterministic: {is_deterministic}")
print(f"Length: {len(hash1)} characters")

In [None]:
# Hash every identifier into a new column (added to df_with_avg in place)
identifier_series = df_with_avg['resale_identifier']
df_with_avg['resale_identifier_hash'] = identifier_series.map(hash_sha256)

print("Sample hashes:")
df_with_avg[['resale_identifier', 'resale_identifier_hash']].head()

In [None]:
# Verify hash uniqueness
# Hashing is deterministic, so rows that share an identifier necessarily share
# a hash. That is not a collision. A collision means two *different*
# identifiers map to the same hash, so the distinct-hash count must be compared
# with the distinct-identifier count, not with the row count.
unique_hashes = df_with_avg['resale_identifier_hash'].nunique()
unique_identifiers = df_with_avg['resale_identifier'].nunique()
print(f"Unique hashes: {unique_hashes}")
print(f"Unique identifiers: {unique_identifiers}")
print(f"Total records: {len(df_with_avg)}")

if unique_hashes == unique_identifiers:
    print("✓ Hash uniqueness verified!")
else:
    print(f"⚠ Collision detected!")

## 7. Summary

Based on this exploration:

### Validation Rules to Implement:
1. ✅ Categorical validation for town, flat_type, flat_model, storey_range
2. ✅ Date range validation
3. ✅ Null checks for critical fields
4. ✅ Duplicate detection with composite key
5. ✅ Outlier detection using IQR and Z-score methods

### Transformations to Implement:
1. ✅ Remaining lease calculation (99-year assumption)
2. ✅ Resale identifier creation (multi-step logic)
3. ✅ SHA-256 hashing

**Next Step:** Migrate working code to `data_operations/` modules