In [None]:
#!/usr/bin/env python3
"""
Build per-year JSON like lsoa21_area_popdensity_2022_normalized.json, but for 2030/2035/2040/2045.

- Feature 0: area_norm (z-score using 2022 area mu/std), derived from raw area JSON (constant across years)
- Feature 1: popdens_norm (z-score using 2022 popdens mu/std), where popdens_raw = pop_total_all_raw / area_raw
  and pop_total_all_raw comes from population projection JSONs (your per-year population_lv3 outputs).

Inputs:
  1) Raw area JSON (NOT normalized); the code reads element 0 of each value:
     { "LSOA21CD": [area_sqkm], ... }
  2) Population projection JSONs (raw or normalized are both fine; we use raw totals):
     { "LSOA21CD": {"population_lv3":[18]}, ... }  (lv3[0] = total_all)
  3) Normalization params JSONs (from 2022):
     area params: { "area_sqkm": {"mu":..., "std":...} }  (or top-level {"mu":..., "std":...})
     popdens params: { "popdens_2022": {"mu":..., "std":...} } (or top-level {"mu":..., "std":...})

Output (one per year):
  { "LSOA21CD": [area_norm, popdens_norm], ... }

IMPORTANT:
- Density is computed from RAW values, then normalized using 2022 mu/std.
- We do NOT use any normalized area/popdens as inputs.
"""

import json
from pathlib import Path
import numpy as np

# -------------------- PATHS --------------------
# Projection years to build; one output JSON is written per entry.
YEARS = [2030, 2035, 2040, 2045]

# Raw-area JSON (LSOA21CD -> area); these are NOT normalized values.
AREA_RAW_JSON = "lsoa21_area_sqkm_2022_raw.json"

# Folder holding the per-year population JSONs.
POP_DIR = Path("../../population/projections")

# Every year's population JSON must exist. Edit the file-name pattern to match
# yours (e.g. "..._England.json").
POP_JSON_BY_YEAR = {
    yr: POP_DIR / f"population_LSOA21_{yr}.json" for yr in YEARS
}

# 2022 normalization params used for z-scoring.
AREA_PARAMS_JSON = "lsoa21_area_sqkm_2022_params.json"
POPDENS_PARAMS_JSON = "lsoa21_popdensity_2022_params.json"

# Output directory; an empty path resolves relative to the working directory.
OUT_DIR = Path("")
# OUT_DIR.mkdir(parents=True, exist_ok=True)

# -------------------- HELPERS --------------------
def load_json(path: Path | str):
    with open(path, "r") as f:
        return json.load(f)

def zscore(x: np.ndarray, mu: float, std: float) -> np.ndarray:
    """Standardize ``x`` with a fixed mean and standard deviation.

    Args:
        x: Values to standardize.
        mu: Mean to subtract.
        std: Standard deviation to divide by; a value of exactly zero is
            replaced by 1.0 so the result is just the centered values.

    Returns:
        ``(x - mu) / std`` computed element-wise.
    """
    scale = std if std != 0.0 else 1.0
    centered = x - mu
    return centered / scale

def _area_value(raw):
    """Return an area as float from a bare number or element 0 of a list/tuple, else None."""
    if isinstance(raw, (list, tuple)):
        if not raw:
            return None
        raw = raw[0]
    try:
        return float(raw)
    except (TypeError, ValueError, OverflowError):
        return None


def _get_mu_std(params, keys):
    """Return (mu, std) from the first matching nested key, else from top-level mu/std."""
    for key in keys:
        if key in params and isinstance(params[key], dict) and "mu" in params[key] and "std" in params[key]:
            return float(params[key]["mu"]), float(params[key]["std"])
    # also allow top-level {"mu":..,"std":..}
    if "mu" in params and "std" in params:
        return float(params["mu"]), float(params["std"])
    raise RuntimeError(f"Cannot find mu/std in params json. Keys tried={keys}")


def main():
    """Build one normalized [area_norm, popdens_norm] JSON per projection year.

    Reads the module-level configuration (AREA_RAW_JSON, AREA_PARAMS_JSON,
    POPDENS_PARAMS_JSON, POP_JSON_BY_YEAR, YEARS, OUT_DIR). For each year it
    computes density as population_lv3[0] / area, z-scores area and density
    with the loaded mu/std, and writes
    ``lsoa21_area_popdensity_<year>_normalized.json`` into OUT_DIR.

    LSOAs with no population record, or an unusable ``population_lv3``, keep a
    population of 0 and are counted in the per-year summary line.

    Raises:
        RuntimeError: If the area JSON yields no usable entries, mu/std cannot
            be found in a params JSON, or a year's population JSON is not
            configured or does not exist.
    """
    # ---- load raw area ----
    area_raw = load_json(AREA_RAW_JSON)  # LSOA21CD -> area_sqkm (scalar or [area_sqkm])
    area_raw_clean = {}
    skipped_area = 0
    for k, v in area_raw.items():
        if k is None:
            continue
        code = str(k).strip()
        if code == "":
            continue
        area = _area_value(v)
        if area is None:
            skipped_area += 1
            continue
        area_raw_clean[code] = area

    if not area_raw_clean:
        raise RuntimeError(f"AREA_RAW_JSON produced 0 usable entries: {AREA_RAW_JSON}")
    if skipped_area:
        # Surface dropped entries instead of discarding them silently.
        print(f"[WARN] skipped {skipped_area:,} area entries with unusable values in {AREA_RAW_JSON}")

    # ---- load params ----
    area_params = load_json(AREA_PARAMS_JSON)
    popdens_params = load_json(POPDENS_PARAMS_JSON)

    area_mu, area_std = _get_mu_std(area_params, ["area_sqkm"])
    # "popdens" is the key shape named in the module docstring; accept it as a fallback.
    dens_mu, dens_std = _get_mu_std(popdens_params, ["popdens_2022", "popdens"])

    # ---- area_norm is constant across years ----
    lsoas = sorted(area_raw_clean)
    area_vec = np.array([area_raw_clean[code] for code in lsoas], dtype=float)
    area_norm_vec = zscore(area_vec, area_mu, area_std)
    # Guard against divide-by-zero; the mask does not depend on the year.
    has_area = area_vec > 0

    out_dir = Path(OUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)

    # ---- for each year: build popdens_raw then normalize ----
    for yr in YEARS:
        pop_path = POP_JSON_BY_YEAR.get(yr)
        if pop_path is None:
            raise RuntimeError(f"Missing POP_JSON_BY_YEAR entry for year={yr}")
        if not Path(pop_path).exists():
            raise RuntimeError(f"Population JSON not found for year {yr}: {pop_path}")

        popj = load_json(pop_path)

        # total_all is taken from population_lv3[0]
        pop_total = np.zeros(len(lsoas), dtype=float)
        missing_pop = 0
        bad_lv3 = 0

        for i, lsoa in enumerate(lsoas):
            rec = popj.get(lsoa)
            if rec is None:
                missing_pop += 1
                continue
            # A record that is not a dict is counted as bad rather than crashing on .get().
            lv3 = rec.get("population_lv3") if isinstance(rec, dict) else None
            if not isinstance(lv3, list) or not lv3:
                bad_lv3 += 1
                continue
            try:
                pop_total[i] = float(lv3[0])
            except (TypeError, ValueError, OverflowError):
                bad_lv3 += 1

        # Raw density; entries without a positive area stay at 0.
        dens_raw = np.zeros_like(pop_total)
        dens_raw[has_area] = pop_total[has_area] / area_vec[has_area]

        dens_norm = zscore(dens_raw, dens_mu, dens_std)

        out = {
            lsoa: [float(area_norm_vec[i]), float(dens_norm[i])]
            for i, lsoa in enumerate(lsoas)
        }

        out_path = out_dir / f"lsoa21_area_popdensity_{yr}_normalized.json"
        # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(out, f, indent=2, ensure_ascii=False)

        print(f"[OK] {yr}: wrote {out_path} (LSOAs={len(lsoas):,}; missing_pop={missing_pop:,}; bad_lv3={bad_lv3:,})")

# Run the build only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

[OK] 2030: wrote lsoa21_area_popdensity_projections_json\lsoa21_area_popdensity_2030_normalized.json (LSOAs=35,672; missing_pop=0; bad_lv3=0)
[OK] 2035: wrote lsoa21_area_popdensity_projections_json\lsoa21_area_popdensity_2035_normalized.json (LSOAs=35,672; missing_pop=0; bad_lv3=0)
[OK] 2040: wrote lsoa21_area_popdensity_projections_json\lsoa21_area_popdensity_2040_normalized.json (LSOAs=35,672; missing_pop=0; bad_lv3=0)
[OK] 2045: wrote lsoa21_area_popdensity_projections_json\lsoa21_area_popdensity_2045_normalized.json (LSOAs=35,672; missing_pop=0; bad_lv3=0)
