In [None]:
import sys
from pathlib import Path
# set the notebook's CWD to your repo root
%cd D:/deepdemand
ROOT = Path.cwd().parents[0]   # go up one level
sys.path.insert(0, str(ROOT))


In [None]:
# === Extract O/D embeddings for all LSOAs from a trained DeepDemand checkpoint ===
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from config import MODEL  # uses MODEL['node_hidden'], ['node_out']
from model.dataloader import load_json, get_lsoa_vector

# -------------------
# Config paths
# -------------------
CKPT_PATH   = "param/cv_0/best_stage_1_lr1e-03.pt"   # <- update if needed
LSOA_JSON   = "data/node_features/lsoa21_features_normalized.json"
PCA_MODEL   = "data/node_features/pca_model_lsoa21.npz"       # if you trained with PCA, this should exist
OUT_DIR     = "interpret/node_embeddings"
os.makedirs(OUT_DIR, exist_ok=True)

# -------------------
# Minimal MLP def (must match training MLP)
# -------------------
class MLP(nn.Module):
    def __init__(self, in_dim, hidden_dims, out_dim, dropout=0.1, act=nn.ReLU):
        super().__init__()
        layers = []
        d = in_dim
        for h in hidden_dims:
            layers += [nn.Linear(d, h), act(), nn.Dropout(dropout)]
            d = h
        layers += [nn.Linear(d, out_dim)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

# -------------------
# Helpers
# -------------------
def load_ckpt_state(ckpt_path):
    ckpt = torch.load(ckpt_path, map_location="cpu")
    return ckpt["state_dict"] if isinstance(ckpt, dict) and "state_dict" in ckpt else ckpt

def strip_and_load(mlp: nn.Module, full_sd: dict, prefix: str):
    """
    Load weights into `mlp` by taking items that start with `prefix` (e.g., 'enc_O.')
    and stripping that prefix so they match `mlp`'s keys (which start with 'net.').
    """
    sub = {k[len(prefix):]: v for k, v in full_sd.items() if k.startswith(prefix)}
    missing, unexpected = mlp.load_state_dict(sub, strict=False)
    if missing:
        print(f"[WARN] Missing keys for {prefix}: {missing}")
    if unexpected:
        print(f"[WARN] Unexpected keys for {prefix}: {unexpected}")

def maybe_pca_project(X: np.ndarray, pca_npz_path: str) -> np.ndarray:
    """
    If PCA file exists, project X using saved mean/components.
    Else return X unchanged.
    """
    if not os.path.isfile(pca_npz_path):
        print("[PCA] No PCA model found -> using raw features.")
        return X
    npz = np.load(pca_npz_path, allow_pickle=True)
    mean = npz["mean"]           # (F,)
    comps = npz["components"]    # (k, F)
    Xp = (X - mean) @ comps.T    # (N, k)
    print(f"[PCA] Projected to shape {Xp.shape} using saved model.")
    return Xp.astype(np.float32)

# -------------------
# 1) Build feature matrix X (N_lsoa, F_raw)
# -------------------
lsoa_json = load_json(LSOA_JSON)
lsoa_codes = sorted(lsoa_json.keys())

rows = []
for code in lsoa_codes:
    vec = get_lsoa_vector(lsoa_json[code])  # torch tensor (F_raw,)
    rows.append(vec.cpu().numpy())
X = np.vstack(rows).astype(np.float32)      # (N, F_raw)
print(f"[Data] Raw LSOA feature matrix: {X.shape}")

# Optional PCA (uses saved training PCA if present)
X_in = maybe_pca_project(X, PCA_MODEL)
in_dim = X_in.shape[1]

# -------------------
# 2) Instantiate encoders (same dims as training)
# -------------------
enc_O = MLP(in_dim, MODEL['node_hidden'], MODEL['node_out'], dropout=0.1)
enc_D = MLP(in_dim, MODEL['node_hidden'], MODEL['node_out'], dropout=0.1)

# Load only the encoder weights from checkpoint
full_sd = load_ckpt_state(CKPT_PATH)
strip_and_load(enc_O, full_sd, prefix="enc_O.")
strip_and_load(enc_D, full_sd, prefix="enc_D.")

enc_O.eval()
enc_D.eval()

# -------------------
# 3) Forward pass to get embeddings
# -------------------
with torch.no_grad():
    X_tensor = torch.from_numpy(X_in)              # (N, in_dim)
    E_O = enc_O(X_tensor).cpu().numpy()           # (N, d_out)
    E_D = enc_D(X_tensor).cpu().numpy()           # (N, d_out)

print(f"[Emb] E_O: {E_O.shape}   E_D: {E_D.shape}")

# -------------------
# 4) Save outputs
# -------------------
# Numpy arrays
np.save(os.path.join(OUT_DIR, "E_O.npy"), E_O)
np.save(os.path.join(OUT_DIR, "E_D.npy"), E_D)
np.save(os.path.join(OUT_DIR, "LSOA_codes.npy"), np.array(lsoa_codes, dtype=object))

# Convenient table (first few dims for quick inspection)
dout = E_O.shape[1]
cols_O = [f"eO_{i}" for i in range(dout)]
cols_D = [f"eD_{i}" for i in range(dout)]
df = pd.DataFrame({
    "lsoa_code": lsoa_codes,
    **{c: E_O[:, i] for i, c in enumerate(cols_O)},
    **{c: E_D[:, i] for i, c in enumerate(cols_D)},
})
out_csv = os.path.join(OUT_DIR, "embeddings_O_D.csv")
df.to_csv(out_csv, index=False)
print(f"[Save] Wrote embeddings to:\n  - {OUT_DIR}/E_O.npy\n  - {OUT_DIR}/E_D.npy\n  - {out_csv}")

[Data] Raw LSOA feature matrix: (35672, 121)
[PCA] Projected to shape (35672, 64) using saved model.
[Emb] E_O: (35672, 16)   E_D: (35672, 16)


  ckpt = torch.load(ckpt_path, map_location="cpu")


[Save] Wrote embeddings to:
  - interpret/node_embeddings/E_O.npy
  - interpret/node_embeddings/E_D.npy
  - interpret/node_embeddings\embeddings_O_D.csv


In [7]:
# Visualize O/D embeddings with UMAP, per-figure color scaling
# and a flexible metric config extracted from the LSOA JSON.
# Requirements: pip install umap-learn matplotlib numpy pandas

import os, json
import numpy as np
import matplotlib.pyplot as plt
import umap
import pandas as pd

# ---------- Paths ----------
EMB_DIR     = "node_embeddings"
LSOA_JSON   = "../data/node_features/lsoa21_features_raw.json"
OUT_DIR     = "node_embeddings/umap"
os.makedirs(OUT_DIR, exist_ok=True)

# ---------- Load embeddings ----------
E_O = np.load(os.path.join(EMB_DIR, "E_O.npy"))          # (N, d)
E_D = np.load(os.path.join(EMB_DIR, "E_D.npy"))          # (N, d)
codes = np.load(os.path.join(EMB_DIR, "LSOA_codes.npy"), allow_pickle=True)  # (N,)

# ---------- Load features JSON ----------
with open(LSOA_JSON, "r") as f:
    lsoa_json = json.load(f)

# ---------- Metric config ----------
# Each metric can be:
#   {"name": ..., "num": ("path","to","list_or_scalar", idx_or_None),
#                    "den": ("path","to","list_or_scalar", idx_or_None) or None,
#                    "transform": "log1p"/None,
#                    "desc": "... (for filenames)"}
#
# For list-like leaves provide an integer index; for scalars use None.
# Example below matches your earlier usage (density = total / area).
METRICS = [
    {
        "name": "Households with car ownership",
        "num": ("households","lv3",2),
        "den": None, 
        "transform": None,
        "desc": "Households with car ownership"
    },
]

# ---------- Helpers to extract metrics ----------
def _get_leaf(rec, path_tuple):
    """Fetch a value from nested dict/list: ("key1","key2", index_or_None)."""
    if path_tuple is None:
        return np.nan
    *keys, idx = path_tuple
    cur = rec
    for k in keys:
        if not isinstance(cur, dict) or k not in cur:
            return np.nan
        cur = cur[k]
    if idx is None:
        # scalar leaf
        try:
            return float(cur)
        except Exception:
            return np.nan
    # list-like leaf
    try:
        return float(cur[idx])
    except Exception:
        return np.nan

def compute_metric_vector(codes, metric_spec, lsoa_json):
    vals = np.full(len(codes), np.nan, dtype=float)
    for i, code in enumerate(codes):
        rec = lsoa_json.get(str(code), {})
        num = _get_leaf(rec, metric_spec["num"])
        den = _get_leaf(rec, metric_spec["den"]) if metric_spec.get("den") else 1.0
        if den is None or den == 0:
            vals[i] = np.nan
        else:
            vals[i] = num / den
    if metric_spec.get("transform") == "log1p":
        vals = np.log1p(np.clip(vals, a_min=0, a_max=None))
    return vals

# ---------- Simple per-dim z-score before UMAP ----------
def zscore(X, eps=1e-8):
    mu = X.mean(axis=0, keepdims=True)
    sd = X.std(axis=0, keepdims=True)
    return (X - mu) / (sd + eps)

E_O_std = zscore(E_O)
E_D_std = zscore(E_D)

# ---------- UMAP (same hyperparams for O/D) ----------
reducer_params = dict(
    n_components=2,
    n_neighbors=30,
    min_dist=0.1,
    metric="euclidean",
    random_state=42,
)
U_O = umap.UMAP(**reducer_params).fit_transform(E_O_std)
U_D = umap.UMAP(**reducer_params).fit_transform(E_D_std)

# ---------- Plot helper with per-figure robust color scaling ----------
def scatter_umap(U, color_vals, title, c_label, out_png):
    vals = np.asarray(color_vals, dtype=float)
    # mask NaNs
    mask = ~np.isnan(vals)
    if mask.sum() < 5:
        print(f"[WARN] Too few valid values for color in {title}. Skipping.")
        return

    # robust per-figure scaling: 1st–99th percentiles
    vmin = np.nanpercentile(vals, 1.0)
    vmax = np.nanpercentile(vals, 99.0)
    if not np.isfinite(vmin) or not np.isfinite(vmax) or vmin >= vmax:
        vmin, vmax = np.nanmin(vals), np.nanmax(vals)

    plt.figure(figsize=(4, 3))
    sc = plt.scatter(U[mask, 0], U[mask, 1],
                     c=vals[mask],
                     s=3, alpha=0.25, edgecolor="none",
                     vmin=vmin, vmax=vmax, cmap="turbo")
    cbar = plt.colorbar(sc)
    cbar.set_label(c_label)
    cbar.solids.set_alpha(0.7)
    # plt.title(title)
    plt.xticks([]); plt.yticks([])
    plt.tight_layout()
    plt.savefig(out_png)
    plt.close()
    print(f"Saved: {out_png}")

# ---------- Compute & plot all metrics automatically ----------
records = []
for spec in METRICS:
    name = spec["name"]
    vals = compute_metric_vector(codes, spec, lsoa_json)

    # Save numeric vectors for reference
    np.save(os.path.join(OUT_DIR, f"{name}_values.npy"), vals)

    # Plot O and D with *individual* per-figure scaling
    scatter_umap(U_O, vals,
                 title=f"UMAP (O-encoder) — {name}",
                 c_label=spec.get("desc", name),
                 out_png=os.path.join(OUT_DIR, f"umap_O_{name}.pdf"))
    scatter_umap(U_D, vals,
                 title=f"UMAP (D-encoder) — {name}",
                 c_label=spec.get("desc", name),
                 out_png=os.path.join(OUT_DIR, f"umap_D_{name}.pdf"))

    # For downstream table joins
    records.append(pd.DataFrame({
        "lsoa_code": codes,
        f"{name}": vals
    }))

# ---------- Save UMAP coords and a merged CSV ----------
np.save(os.path.join(OUT_DIR, "U_O.npy"), U_O)
np.save(os.path.join(OUT_DIR, "U_D.npy"), U_D)

# Build two CSVs (O/D) with coords + all metrics
df_metrics = records[0]
for df in records[1:]:
    df_metrics = df_metrics.merge(df, on="lsoa_code", how="outer")

df_O = pd.DataFrame({"lsoa_code": codes, "umap_x": U_O[:,0], "umap_y": U_O[:,1]}).merge(df_metrics, on="lsoa_code", how="left")
df_D = pd.DataFrame({"lsoa_code": codes, "umap_x": U_D[:,0], "umap_y": U_D[:,1]}).merge(df_metrics, on="lsoa_code", how="left")

df_O.to_csv(os.path.join(OUT_DIR, "umap_O_all_metrics.csv"), index=False)
df_D.to_csv(os.path.join(OUT_DIR, "umap_D_all_metrics.csv"), index=False)
print("Done.")

  warn(
  warn(


Saved: node_embeddings/umap\umap_O_Households with car ownership.pdf
Saved: node_embeddings/umap\umap_D_Households with car ownership.pdf
Done.


In [8]:
# --- UMAP recolor by O/D potential (O_score / D_score) ---
# Assumes these already exist in memory from your previous script:
#   - U_O, U_D (Nx2)
#   - codes (N,)
#   - OUT_DIR (string path)
# Also reuses scatter_umap() from your previous script (same styling + robust scaling).

import os
import numpy as np
import pandas as pd

OD_CSV = "OD_scores/O_D_scores.csv"
OUT_DIR = "node_embeddings/umap"

# ---- load O/D scores ----
df_od = pd.read_csv(OD_CSV, dtype={"lsoa_code": str})
df_od["lsoa_code"] = df_od["lsoa_code"].astype(str).str.strip()

# map for fast lookup
o_map = dict(zip(df_od["lsoa_code"].values, df_od["O_score"].astype(float).values))
d_map = dict(zip(df_od["lsoa_code"].values, df_od["D_score"].astype(float).values))

# build vectors aligned to `codes`
codes_str = np.asarray(codes, dtype=str)
o_vals = np.array([o_map.get(c, np.nan) for c in codes_str], dtype=float)
d_vals = np.array([d_map.get(c, np.nan) for c in codes_str], dtype=float)

# optional: save vectors
np.save(os.path.join(OUT_DIR, "O_score_values.npy"), o_vals)
np.save(os.path.join(OUT_DIR, "D_score_values.npy"), d_vals)

# ---- plot: O-embedding colored by O_score; D-embedding colored by D_score ----
scatter_umap(
    U_O, o_vals,
    title="UMAP (O-encoder) — O potential",
    c_label="O potential (O score)",
    out_png=os.path.join(OUT_DIR, "umap_O_colored_by_O_score.pdf")
)

scatter_umap(
    U_D, d_vals,
    title="UMAP (D-encoder) — D potential",
    c_label="O/D potential",
    out_png=os.path.join(OUT_DIR, "umap_D_colored_by_D_score.pdf")
)

print("Done: saved 2 SVGs recolored by O/D scores.")

Saved: node_embeddings/umap\umap_O_colored_by_O_score.pdf
Saved: node_embeddings/umap\umap_D_colored_by_D_score.pdf
Done: saved 2 SVGs recolored by O/D scores.
