# GCAP3226 Datasets Starter

This notebook loads and explores example datasets for three focus areas: Transport, Air Quality, and Housing. It prints progress messages and handles missing files gracefully so students can run it without edits.

## 1) Set Up Environment and Imports

We'll import core libraries, configure plotting, and set display options for pandas.

In [None]:
# Imports and basic setup
import os
import sys
from pathlib import Path
import logging

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Logging format, plot theme, and pandas display limits used by every section below
logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.INFO)
sns.set_theme(style='whitegrid')
for option_name, option_value in (('display.max_columns', 50), ('display.width', 120)):
    pd.set_option(option_name, option_value)

print("Environment ready. Pandas version:", pd.__version__)

## 2) Define Configuration and Constants

We centralize paths and simple toggles here.

In [None]:
from dataclasses import dataclass

# Project root. Set the GCAP3226_ROOT environment variable (before starting the
# kernel) to point at your own checkout; when it is unset or empty the original
# author's location is used, so existing setups keep working unchanged.
ROOT = Path(
    os.environ.get('GCAP3226_ROOT')
    or '/Users/simonwang/Documents/Usage/VibeCoding/DailyAssistant/projects/GCAP3226'
).expanduser()
# Folder that holds one subfolder of CSV files per focus area.
DATA_ROOT = ROOT / 'course_materials/resources/datasets/open_data'

@dataclass
class Config:
    """Paths and simple toggles shared by the dataset sections of this notebook.

    The directory defaults are computed from DATA_ROOT once, when the class is
    defined, so DATA_ROOT must be set before this cell runs.
    """
    # Folder searched for transport CSV files.
    transport_dir: Path = DATA_ROOT / 'transport'
    # Folder searched for air quality CSV files.
    air_quality_dir: Path = DATA_ROOT / 'air_quality'
    # Folder searched for housing CSV files.
    housing_dir: Path = DATA_ROOT / 'housing'
    # Row cap passed as `nrows` when a CSV is loaded in the dataset sections.
    sample_rows: int = 5000
    # Seed given to np.random.seed right after the config is instantiated.
    random_seed: int = 42

# Instantiate the shared config and seed NumPy's global random generator
CFG = Config()
np.random.seed(CFG.random_seed)

# Report where data is expected and whether each folder is present
print('Using data root:', DATA_ROOT)
dataset_dirs = {
    'Transport': CFG.transport_dir,
    'Air Quality': CFG.air_quality_dir,
    'Housing': CFG.housing_dir,
}
for label, directory in dataset_dirs.items():
    print(f' - {label}:', directory, '| exists =', directory.exists())

## 3) Load or Generate Data (Helpers)

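Functions to pick a CSV and read it. If no CSV is found, the helpers return `None` and the later sections skip their summaries and plots (this notebook does not yet include a synthetic data generator).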

In [None]:
from typing import Optional

def pick_csv(folder: Path, prefer_names: Optional[list[str]] = None) -> Optional[Path]:
    """Choose one CSV file from ``folder``.

    Args:
        folder: Directory to search (non-recursive, ``*.csv`` only).
        prefer_names: Optional file names to look for first, in priority
            order; matching ignores case.

    Returns:
        The first preferred file that is present, otherwise the
        alphabetically first CSV, or None when the folder is missing or
        contains no CSV files.
    """
    if not folder.exists():
        return None

    candidates = sorted(folder.glob('*.csv'))
    if not candidates:
        return None

    # Index by lowercase file name so preferred names match case-insensitively.
    by_lower_name = {}
    for csv_path in candidates:
        by_lower_name[csv_path.name.lower()] = csv_path

    for wanted in (prefer_names or []):
        match = by_lower_name.get(wanted.lower())
        if match is not None:
            return match

    return candidates[0]


def safe_load_csv(path: Optional[Path], nrows: Optional[int] = None) -> Optional[pd.DataFrame]:
    """Read a CSV into a DataFrame without ever raising.

    Args:
        path: CSV file to read; None is logged as a warning.
        nrows: Maximum number of rows to read (None reads the whole file).

    Returns:
        The loaded DataFrame, or None when no path was given or the read
        failed (the failure is logged at ERROR level).
    """
    if path is None:
        logging.warning('No CSV path provided.')
        return None

    try:
        print(f'Reading: {path}')
        frame = pd.read_csv(path, nrows=nrows, low_memory=False)
    except Exception as err:
        # Best-effort loader: report the problem and let the caller skip.
        logging.error('Failed to read %s: %s', path, err)
        return None
    else:
        print('Loaded shape:', frame.shape)
        return frame


def quick_summary(df: Optional[pd.DataFrame]):
    """Show the head, ``info()`` and ``describe()`` of a DataFrame.

    Args:
        df: Frame to summarize. None prints a short notice and returns.

    Returns:
        None. All output is displayed or printed.

    Notes:
        Infinite values are treated as missing in the describe table.
    """
    if df is None:
        print('No data to summarize.')
        return

    # `display` is only injected as a builtin inside IPython/Jupyter; import it
    # explicitly and fall back to print so the helper also works in plain Python.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    show(df.head(5))
    print('\nInfo:')
    # DataFrame.info() prints by itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    df.info()
    print('\nDescribe (all dtypes):')
    # Replaces the `mode.use_inf_as_na` option (deprecated in pandas 2.1):
    # convert +/-inf to NaN on a copy so describe() ignores them.
    finite_df = df.replace([np.inf, -np.inf], np.nan)
    try:
        # pandas 1.x: opt in to numeric-style datetime summaries.
        described = finite_df.describe(include='all', datetime_is_numeric=True)
    except TypeError:
        # pandas >= 2.0 removed the keyword; that behaviour is now the default.
        described = finite_df.describe(include='all')
    show(described.transpose().head(20))


def first_numeric_column(df: pd.DataFrame) -> Optional[str]:
    """Return the label of the left-most numeric column, or None if there is none."""
    numeric_labels = df.select_dtypes(include=[np.number]).columns
    if len(numeric_labels) == 0:
        return None
    return numeric_labels[0]

## 4) Implement Core Logic Functions

For this starter, core logic is simple: pick a dataset, load a sample, summarize, and make one quick plot.

In [None]:
def summarize_and_plot(df: Optional[pd.DataFrame], title: str = 'Dataset Summary'):
    """Print a summary of ``df`` and draw a histogram of its first numeric column.

    Args:
        df: Frame to summarize; None or an empty frame prints a notice and
            returns without plotting.
        title: Heading printed above the summary and used in the plot title.

    Returns:
        None. Output is printed and the figure is shown.
    """
    print(f'\n=== {title} ===')
    if df is None or df.empty:
        print('No data available for plotting.')
        return

    quick_summary(df)
    num_col = first_numeric_column(df)
    # Compare against None explicitly: a valid column label can be falsy
    # (e.g. 0 or ''), and a truthiness test would wrongly skip the plot.
    if num_col is None:
        print('No numeric columns found to plot.')
        return

    print(f'Plotting histogram for numeric column: {num_col}')
    # Explicit figure/axes keeps the plot independent of pyplot's global state.
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[num_col].dropna(), bins=30, kde=False, ax=ax)
    ax.set_title(f'{title}: {num_col} distribution')
    ax.set_xlabel(str(num_col))
    fig.tight_layout()
    plt.show()

## 5) Validate Inputs and Add Error Handling

We already log errors on failed reads and check for empty dataframes; below is a quick validator.

In [None]:
def validate_df(df: Optional[pd.DataFrame], min_rows: int = 1) -> bool:
    """Tell whether ``df`` exists and holds at least ``min_rows`` rows.

    A frame that is too short is reported with a logged warning; a missing
    frame (None) is rejected silently.
    """
    if df is None:
        return False

    row_count = len(df)
    if row_count >= min_rows:
        return True

    logging.warning('Dataframe has fewer than %s rows (%s).', min_rows, row_count)
    return False

## 6) Unit Tests (minimal, in-notebook)

A couple of quick assertions to sanity-check the helpers.

In [None]:
# Minimal tests: build the fixtures first, then check each helper once
three_row_frame = pd.DataFrame({'a': [1, 2, 3]})
mixed_frame = pd.DataFrame({'x': ['a', 'b'], 'y': [1, 2]})

assert validate_df(three_row_frame)
assert first_numeric_column(mixed_frame) == 'y'
print('Helper tests passed.')

## 7) Visualize Results: Transport

Pick a transport CSV, load a sample, summarize, and make a simple plot.

In [None]:
# Transport: choose a CSV, load a capped sample, then summarize or skip
print('Selecting a transport CSV...')
transport_csv = pick_csv(CFG.transport_dir)
print('Found:', transport_csv)
transport_df = safe_load_csv(transport_csv, nrows=CFG.sample_rows)
transport_ok = validate_df(transport_df)
if not transport_ok:
    print('No transport data available; skipping plot.')
else:
    summarize_and_plot(transport_df, title='Transport dataset')

## 8) Visualize Results: Air Quality

Prefer an AQHI or time-series CSV if one is present; otherwise, pick the first CSV available.

In [None]:
print('Selecting an air quality CSV...')
aq_prefer = ['AQHI', 'aqhi', 'city_dashboard', 'realtime', 'time', 'smart', 'o3']
# File names are lowercased before matching, so the keywords must be too;
# otherwise an uppercase entry such as 'AQHI' could never match.
aq_keywords = [keyword.lower() for keyword in aq_prefer]

# A missing folder simply yields no candidates.
csvs = sorted(CFG.air_quality_dir.glob('*.csv')) if CFG.air_quality_dir.exists() else []

# Take the alphabetically first file whose name contains any keyword;
# fall back to the first CSV, or None when the folder has no CSVs.
choice = next(
    (csv_path for csv_path in csvs
     if any(keyword in csv_path.name.lower() for keyword in aq_keywords)),
    None,
)
if choice is None and csvs:
    choice = csvs[0]
print('Found:', choice)

air_df = safe_load_csv(choice, nrows=CFG.sample_rows)
if validate_df(air_df):
    summarize_and_plot(air_df, title='Air quality dataset')
else:
    print('No air quality data available; skipping plot.')

## 9) Visualize Results: Housing

If no housing files exist yet, this section will skip gracefully.

In [None]:
# Housing: choose a CSV, load a capped sample, then summarize or skip
print('Selecting a housing CSV...')
housing_csv = pick_csv(CFG.housing_dir)
print('Found:', housing_csv)
housing_df = safe_load_csv(housing_csv, nrows=CFG.sample_rows)
housing_ok = validate_df(housing_df)
if not housing_ok:
    print('No housing data available; skipping plot.')
else:
    summarize_and_plot(housing_df, title='Housing dataset')

## 10) Save Artifacts and Clean Up

Save a small sample (the first 100 rows) of each dataset that was loaded, then close all figures.

In [None]:
# Save a small sample of every dataset that was loaded, then release figures.
out_dir = ROOT / '_open_data_inventory' / 'samples'
try:
    out_dir.mkdir(parents=True, exist_ok=True)
except OSError as e:
    # ROOT may not exist or may be read-only on this machine; skip saving
    # rather than crash the last cell of the notebook.
    logging.warning('Could not create output folder %s: %s', out_dir, e)
    out_dir = None

if out_dir is not None:
    # DataFrames are looked up by variable name through globals() so that a
    # section that was never run (variable undefined) is skipped quietly.
    for name, var_name in [('transport', 'transport_df'), ('air_quality', 'air_df'), ('housing', 'housing_df')]:
        try:
            obj = globals().get(var_name)
            if isinstance(obj, pd.DataFrame) and not obj.empty:
                sample = obj.head(100)
                fp = out_dir / f'{name}_sample.csv'
                sample.to_csv(fp, index=False)
                print(f'Saved sample: {fp}')
        except Exception as e:
            # Best effort: one failed write must not stop the remaining ones.
            logging.warning('Could not save sample for %s: %s', name, e)

plt.close('all')
print('Done. Figures closed.')