In [None]:
# Notebook path setup: change the kernel's working directory to the repo and
# extend sys.path so project packages can be imported.
import sys
from pathlib import Path
# set the notebook's CWD to your repo root
# NOTE(review): hard-coded absolute Windows path; this only works on a machine
# where the repo lives at D:/deepdemand.
%cd D:/deepdemand
# Path.cwd().parents[0] is the PARENT of the working directory set above, so
# the directory put on sys.path is one level above the repo, not the repo itself.
# NOTE(review): confirm that is intended.
# ROOT is a plain notebook global and is rebound by later cells (the OD-pool
# script cell assigns it a data directory).
ROOT = Path.cwd().parents[0]   # go up one level
sys.path.insert(0, str(ROOT))


In [None]:
#!/usr/bin/env python3
"""
Build a globally UNIQUE OD-pair pool (node-level) from all subgraph od_use.feather.

- Enforces integer node IDs for O/D
- Drops NaNs
- Dedupes within each file and within each buffered chunk; every chunk is
  written to its own parquet part (parts are NOT deduped against each other
  on disk)
- Removes cross-part duplicates in one final in-memory merge
- Writes final unique pool to a single CSV
- Reports counts and sanity checks

Only the scan phase is streaming: it buffers roughly FLUSH_ROWS rows at a time.
The final merge loads ALL parts into RAM at once to perform the global dedupe.
"""

import os
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm

# Input root; each immediate subdirectory may hold an od_use.feather file.
# NOTE: this rebinds the notebook-global ROOT that the path-setup cells also
# assign, so the value seen by main() depends on which cell ran last.
ROOT = Path("data/subgraphs/subgraphs")
# Output directory, created on cell execution (relative to the current CWD).
OUT_DIR = Path("interpret/OD_pairs_pool")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# intermediate unique parts
PART_DIR = OUT_DIR / "parts_unique"
PART_DIR.mkdir(parents=True, exist_ok=True)

# Final deduplicated pool.
# NOTE(review): the downstream run log in this notebook loads
# "od_pairs_pool_unique.parquet" from this directory, while this script only
# writes the CSV -- confirm where the parquet file is produced.
FINAL_CSV = OUT_DIR / "od_pairs_pool_unique.csv"

# how many buffered rows (counted AFTER per-file normalization and dedupe)
# to accumulate before flushing a part
FLUSH_ROWS = 5_000_000

def iter_od_files(root: Path):
    """Yield the ``od_use.feather`` file of every immediate subdirectory of ``root``.

    Subdirectories are visited in sorted path order. ``Path.iterdir()`` gives
    no ordering guarantee, and the order of the files decides which rows end
    up in which part and which duplicate survives ``keep="first"``; sorting
    makes repeated runs produce the same output.

    Args:
        root: Directory laid out as ``{root}/{edge_id}/od_use.feather``.

    Yields:
        Path of each existing ``od_use.feather``; entries of ``root`` that are
        not directories, and directories without that file, are skipped.
    """
    # expects: data/subgraphs/subgraphs/{edge_id}/od_use.feather
    for edge_dir in sorted(root.iterdir()):
        if not edge_dir.is_dir():
            continue
        candidate = edge_dir / "od_use.feather"
        if candidate.is_file():
            yield candidate

def normalize_od_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return the ``O``/``D`` columns of ``df`` as clean ``int64`` node ids.

    Values are coerced with ``pd.to_numeric(errors="coerce")``, so anything
    unparseable becomes NaN. Rows where either column is NaN or +/-inf are
    dropped; infinities must go because they survive ``dropna`` and make the
    integer cast fail. The input frame is not modified and surviving rows keep
    their original index labels.

    NOTE: values with a fractional part are truncated by the integer cast,
    not dropped.

    Args:
        df: Frame containing at least the columns ``"O"`` and ``"D"``.

    Returns:
        A new two-column frame (``O``, ``D``), both ``int64``.

    Raises:
        KeyError: if ``"O"`` or ``"D"`` is missing from ``df``.
    """
    # force numeric -> int64; strip weird float strings etc.
    out = df[["O", "D"]].copy()

    # coerce to numeric safely
    for col in ("O", "D"):
        out[col] = pd.to_numeric(out[col], errors="coerce")

    # treat +/-inf like missing values so the cast below cannot blow up
    out = out.replace([np.inf, -np.inf], np.nan)
    out = out.dropna(subset=["O", "D"])

    # convert to int64 (node ids)
    out["O"] = out["O"].astype(np.int64)
    out["D"] = out["D"].astype(np.int64)

    return out

def write_part_unique(df_unique: pd.DataFrame, part_idx: int) -> Path:
    """Write one chunk to ``PART_DIR`` as a parquet part and return its path.

    The file is named ``part_<idx>.parquet`` with the index zero-padded to
    five digits; the frame's index is not written.

    Args:
        df_unique: Chunk to persist (the caller dedupes it beforehand).
        part_idx: Sequence number used in the file name.

    Returns:
        Path of the parquet file that was written.
    """
    destination = PART_DIR.joinpath(f"part_{part_idx:05d}.parquet")
    df_unique.to_parquet(destination, index=False)
    return destination

def merge_parts_to_final_csv(part_paths, out_csv: Path):
    """Merge all parquet parts, drop duplicate (O, D) pairs, and write one CSV.

    Every part is loaded into memory at the same time, so peak RAM grows with
    the total number of rows across parts (if that is too large, merge in
    several passes instead).

    When ``part_paths`` is empty, a header-only CSV with the columns ``O`` and
    ``D`` is written instead of failing inside ``pd.concat``.

    Args:
        part_paths: Iterable of parquet files, each with columns ``O`` and ``D``.
        out_csv: Destination CSV path; overwritten if it already exists.
    """
    # Materialize once: lets us test for emptiness and accept any iterable.
    part_paths = list(part_paths)
    if not part_paths:
        print("[Merge] No parts to merge; writing an empty pool.")
        empty = pd.DataFrame({
            "O": pd.Series(dtype="int64"),
            "D": pd.Series(dtype="int64"),
        })
        empty.to_csv(out_csv, index=False)
        print("Wrote:", out_csv)
        return

    # external merge: read each parquet part, concat, drop_duplicates, write csv
    # (still can be large; if too large, do multi-pass merging)
    print("[Merge] Loading parts for final dedupe...")
    dfs = [pd.read_parquet(p) for p in tqdm(part_paths, desc="Read parts")]
    all_df = pd.concat(dfs, ignore_index=True)
    # Release the per-part frames so they are not kept alive during the dedupe.
    del dfs

    before = len(all_df)
    all_df = all_df.drop_duplicates(subset=["O","D"], keep="first")
    after = len(all_df)
    print(f"[Merge] concat rows={before:,}, unique rows={after:,}")

    all_df.to_csv(out_csv, index=False)
    print("Wrote:", out_csv)

def _flush_buffer(frames, part_idx: int) -> Path:
    """Concatenate buffered frames, dedupe on (O, D), write one part, return its path."""
    big = pd.concat(frames, ignore_index=True)
    before = len(big)
    big = big.drop_duplicates(subset=["O","D"], keep="first")
    after = len(big)

    part_path = write_part_unique(big, part_idx)
    print(f"[Flush] part {part_idx} rows before={before:,} after_unique={after:,} -> {part_path.name}")
    return part_path


def main():
    """Scan every od_use.feather under ROOT and build the unique OD-pair CSV.

    Steps:
      1. Read each file, normalize O/D to int64 and dedupe within the file.
      2. Buffer the frames; once the buffered row count reaches FLUSH_ROWS,
         dedupe the buffer and write it as a parquet part.
      3. Flush whatever is left, then merge all parts into FINAL_CSV with a
         global dedupe.
      4. Reload FINAL_CSV and print how many duplicated pairs remain.

    Raises:
        RuntimeError: if no od_use.feather file is found under ROOT.
    """
    files = list(iter_od_files(ROOT))
    if not files:
        raise RuntimeError(f"No od_use.feather files found under {ROOT}")

    buf = []
    buf_n = 0
    part_paths = []

    total_raw = 0

    for f in tqdm(files, desc="Scan od_use.feather"):
        df = pd.read_feather(f, columns=["O","D"])
        # counted before normalization, so it includes rows that get dropped
        total_raw += len(df)
        df = normalize_od_df(df)

        # local dedupe first
        df = df.drop_duplicates(subset=["O","D"], keep="first")

        buf.append(df)
        buf_n += len(df)

        if buf_n >= FLUSH_ROWS:
            # the next part index always equals the number of parts written so far
            part_paths.append(_flush_buffer(buf, len(part_paths)))
            buf = []
            buf_n = 0

    # final flush
    if buf_n > 0:
        part_paths.append(_flush_buffer(buf, len(part_paths)))

    print(f"[Stats] total_raw_rows_read={total_raw:,}")
    print(f"[Stats] parts_written={len(part_paths)}")

    # final merge + global dedupe
    merge_parts_to_final_csv(part_paths, FINAL_CSV)

    # quick sanity: reload and verify uniqueness
    df_final = pd.read_csv(FINAL_CSV)
    dup = df_final.duplicated(subset=["O","D"]).sum()
    print(f"[Sanity] final rows={len(df_final):,} duplicated_pairs={dup}")

# Entry point: build the OD-pair pool when this code runs as __main__
# (as a script; NOTE(review): executing the cell in a notebook kernel is
# expected to satisfy this guard too and start the full scan -- confirm).
if __name__ == "__main__":
    main()

In [None]:
# path setting
# Same path setup as the first cell of the notebook, followed by the import of
# the RF + SHAP pipeline module.
import sys
from pathlib import Path
# set the notebook's CWD to your repo root
# NOTE(review): hard-coded absolute Windows path; adjust per machine.
%cd D:/deepdemand
# Path.cwd().parents[0] is the parent of the CWD set above; that parent is the
# directory prepended to sys.path. This also rebinds the notebook-global ROOT
# that the OD-pool script cell uses as its input directory.
ROOT = Path.cwd().parents[0]   # go up one level
sys.path.insert(0, str(ROOT))

import importlib
import interpret.od_pair_rf_shap as pipe
# Re-execute the module so on-disk edits are picked up without restarting the
# kernel. Attributes set on `pipe` by other cells may be reset by the reload
# and need to be assigned again afterwards.
importlib.reload(pipe)


In [2]:
def extract_lv1_features(rec):
    """Build the six-value coarse feature vector for one record.

    Args:
        rec: Mapping with the keys ``population``, ``employment`` and
            ``households`` (each holding an ``"lv3"`` sequence), plus
            ``area_popdensity``, ``poi`` and ``imd`` sequences.

    Returns:
        ``(feats, names)``: two parallel lists, in this order:
        Population, Employment, Households (element 0 of the respective
        ``lv3`` sequence), Area (element 0 of ``area_popdensity``), POIs (sum
        of ``poi``) and Mean IMD (arithmetic mean of ``imd``).
    """
    imd_scores = rec_imd = None  # placeholders keep the table below readable

    # Entries are evaluated top to bottom, so a missing key fails in the same
    # order as a step-by-step extraction would.
    labelled = [
        # element 0 of each lv3 vector (treated as the aggregated figure)
        ("Population", rec["population"]["lv3"][0]),
        ("Employment", rec["employment"]["lv3"][0]),
        ("Households", rec["households"]["lv3"][0]),
        # area only; the remaining area_popdensity entries are not used
        ("Area", rec["area_popdensity"][0]),
        # total POI count across all categories
        ("POIs", sum(rec["poi"])),
    ]

    # mean IMD score
    imd_scores = rec["imd"]
    labelled.append(("Mean IMD", sum(imd_scores) / len(imd_scores)))

    feats = [value for _, value in labelled]
    names = [label for label, _ in labelled]
    return feats, names

# def extract_lv3_features(rec):
    # feats, names = [], []

    # # ---- population lv3: drop first 8, keep remaining (10) ----
    # pop_lv3 = rec["population"]["lv3"]
    # for idx in range(8, len(pop_lv3)):
    #     v = pop_lv3[idx]
    #     feats.append(v)
    #     names.append(f"population_lv3_{idx+1}")

    # # ---- employment lv3: drop first 21, keep remaining ----
    # emp_lv3 = rec["employment"]["lv3"]
    # for idx in range(21, len(emp_lv3)):
    #     v = emp_lv3[idx]
    #     feats.append(v)
    #     names.append(f"employment_lv3_{idx+1}")

    # # ---- households lv3: drop first 13, keep remaining ----
    # hh_lv3 = rec["households"]["lv3"]
    # for idx in range(13, len(hh_lv3)):
    #     v = hh_lv3[idx]
    #     feats.append(v)
    #     names.append(f"households_lv3_{idx+1}")

    # # ---- area_popdensity (2 dims, unchanged) ----
    # for i, v in enumerate(rec["area_popdensity"]):
    #     feats.append(v)
    #     names.append(f"area_popdensity_{i+1}")

    # # ---- land_use (4 dims, unchanged) ----
    # for i, v in enumerate(rec["land_use"]):
    #     feats.append(v)
    #     names.append(f"landuse_{i+1}")

    # # ---- poi (5 dims, unchanged) ----
    # for i, v in enumerate(rec["poi"]):
    #     feats.append(v)
    #     names.append(f"poi_{i+1}")

    # # ---- imd (2 dims, unchanged) ----
    # for i, v in enumerate(rec["imd"]):
    #     feats.append(v)
    #     names.append(f"imd_{i+1}")

    # return feats, names

In [3]:
# Configure the RF + SHAP pipeline by overwriting module-level attributes of
# `pipe`, then run it. Presumably pipe.main() reads these globals -- the module
# source is not shown here, so verify against interpret/od_pair_rf_shap.py.
# Use the six-feature extractor defined in this notebook.
pipe.extract_features = extract_lv1_features
pipe.SIZE = (5,5)
pipe.X_LABEL = "SHAP value"
# OUT_TAG names the output sub-folder; it must be set before OUT_DIR below.
pipe.OUT_TAG = "AADT_pairscore_RF_lv1"
pipe.OUT_DIR = pipe.Path("interpret/OD_pair_shap") / pipe.OUT_TAG
pipe.OUT_DIR.mkdir(parents=True, exist_ok=True)
pipe.OUT_SAMPLE_CSV = pipe.OUT_DIR / "od_pair_sample_with_xy.csv"
pipe.OUT_BAR_PDF    = pipe.OUT_DIR / "shap_logOD_pair_score_bar.pdf"
# NOTE: despite the attribute name, the beeswarm path has an .svg extension
# (the run log below reports the beeswarm saved as .svg).
pipe.OUT_BEE_PDF    = pipe.OUT_DIR / "shap_logOD_pair_score_beeswarm.svg"
pipe.main()

[Load] OD pool: interpret/OD_pairs_pool/od_pairs_pool_unique.parquet
[Load] OD pool shape: (107317305, 2)
[OD] Using 50,000 pairs (sampled)
[Load] LSOA JSON: data/node_features/lsoa21_features_normalized.json
[Load] node_to_lsoa: data/node_features/node_to_lsoa.json
[Prep] FULL LSOA matrix via get_lsoa_vector ...
[Prep] FULL matrix: (35672, 121)
[PCA] Projected to 64 dims.
[Prep] FULL->PCA: (35672, 64)
[Prep] RF feature cache (per LSOA) ...


RF cache (LSOA): 100%|██████████| 35672/35672 [00:00<00:00, 641567.06it/s]


[Prep] Mapping node ids -> LSOA indices ...


node->lsoa: 100%|██████████| 50000/50000 [00:00<00:00, 1245117.85it/s]


[OD] Remaining after mapping: 50,000
[RF] X shape: (50000, 12)
[Load] Loaded 18 keys into MukaraLitePair
[Score] DL half-pass scoring ...


pair_score: 100%|██████████| 1/1 [00:00<00:00, 107.27it/s]

[Score] y stats: min=0.430641 max=1591.07 mean=4.56231 nan=0





Saved: interpret\OD_pair_shap\AADT_pairscore_RF_lv1\od_pair_sample_with_xy.csv
[RF] Using 50,000 rows after SUBSAMPLE_N=50000
[RF] Prepared: (50000, 12) (50000,)

[CACHE] Found: interpret\OD_pair_shap\AADT_pairscore_RF_lv1\shap_logOD_pair_score_bar.shap_cache.npz -> skipping RF+SHAP, plotting directly.
Saved: interpret\OD_pair_shap\AADT_pairscore_RF_lv1\shap_logOD_pair_score_bar.pdf
Saved: interpret\OD_pair_shap\AADT_pairscore_RF_lv1\shap_logOD_pair_score_beeswarm.svg
Done.
