In [3]:
#!/usr/bin/env python3
"""
Convert LSOA21 long-form projections to per-year JSON with normalized lv3 (18 strata).

Inputs:
  1) Long CSV from proportional split:
     lsoa21cd,lad22cd,sex,age_group,year,population

     sex in: total, male, female
     age_group in: all, 0_15, 16_24, 25_49, 50_64, 65+
     year in: 2030, 2035, 2040, 2045

  2) Normalization params JSON:
     ../2022/lsoa21_population_2022_normalized_params.json
     structure:
       {"lv3": {"mu":[18], "std":[18], "feature_order":[18]}}

Outputs:
  One JSON per year, dict keyed by lsoa21cd:
    {
      "E010....": {"population_lv3": [18 z-scored values]},
      ...
    }

Notes:
  - No lv1/lv2 output. Only lv3 (18 features).
  - Uses STRATA_INDEX to map (sex,age_group)->index; feature_order from params
    is only checked against that fixed order (a mismatch aborts the run).
"""

import json
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------- PATHS -----------------------------
# Input CSV (columns: lsoa21cd, lad22cd, sex, age_group, year, population).
LONG_CSV = "lsoa21_population_projections_2030_2035_2040_2045_long.csv"
# Input JSON holding the 2022 normalization parameters (mu, std, feature_order).
PARAMS_JSON = "../2022/lsoa21_population_2022_normalized_params.json"
# Output folder; one JSON file is written per entry of YEARS.
OUT_DIR = "lsoa21_population_projections_json"

YEARS = [2030, 2035, 2040, 2045]

# ----------------------------- MAPPING -----------------------------
# Position of every (sex, age_group) stratum inside the 18-element lv3 vector.
# Must match the earlier STRATA_INDEX and params["lv3"]["feature_order"].
# Layout: the three "all"-age totals first (0-2), then the five age bands for
# total (3-7), male (8-12) and female (13-17).
_SEXES = ("total", "male", "female")
_AGE_BANDS = ("0_15", "16_24", "25_49", "50_64", "65+")

STRATA_INDEX = {
    stratum: position
    for position, stratum in enumerate(
        [(sex, "all") for sex in _SEXES]
        + [(sex, band) for sex in _SEXES for band in _AGE_BANDS]
    )
}

def _load_lv3_params():
    """Read and validate the lv3 normalization parameters from PARAMS_JSON.

    Returns:
        A ``(mu, std)`` pair of float arrays, one entry per stratum in
        STRATA_INDEX. Zero entries of ``std`` are replaced by 1.0 so the
        z-score division never divides by zero.

    Raises:
        RuntimeError: if ``feature_order`` is missing, is not length 18, does
            not follow the STRATA_INDEX order, or if ``mu``/``std`` do not
            hold exactly one value per stratum.
    """
    with open(PARAMS_JSON, "r", encoding="utf-8") as f:
        lv3 = json.load(f)["lv3"]

    mu = np.asarray(lv3["mu"], dtype=float)
    std = np.asarray(lv3["std"], dtype=float)
    std = np.where(std == 0.0, 1.0, std)

    feature_order = lv3.get("feature_order", None)
    if feature_order is None or len(feature_order) != 18:
        raise RuntimeError("params['lv3']['feature_order'] missing or not length 18.")

    # sanity: derive the expected names from STRATA_INDEX itself so this check
    # cannot drift away from the mapping actually used to order the columns.
    expected = [
        f"{sex}_{age}"
        for sex, age in sorted(STRATA_INDEX, key=STRATA_INDEX.get)
    ]
    if feature_order != expected:
        raise RuntimeError(
            "feature_order in params does not match expected lv3 order.\n"
            f"Expected: {expected}\nGot:      {feature_order}"
        )

    # A wrong-length mu/std would otherwise fail late with a broadcasting
    # error, or (for length 1) broadcast silently and corrupt every column.
    n_strata = len(STRATA_INDEX)
    if mu.shape != (n_strata,) or std.shape != (n_strata,):
        raise RuntimeError(
            f"params['lv3'] mu/std must each hold {n_strata} values; "
            f"got mu shape {mu.shape}, std shape {std.shape}."
        )

    return mu, std


def _load_projections() -> pd.DataFrame:
    """Load LONG_CSV, normalize the strata labels and validate them.

    ``sex`` and ``age_group`` are stripped and lower-cased before being
    checked against STRATA_INDEX.

    Raises:
        RuntimeError: if any (sex, age_group) pair is not a STRATA_INDEX key.
    """
    df = pd.read_csv(
        LONG_CSV,
        dtype={"lsoa21cd": str, "lad22cd": str, "sex": str, "age_group": str, "year": int},
    )
    df["sex"] = df["sex"].str.strip().str.lower()
    df["age_group"] = df["age_group"].str.strip().str.lower()

    # validate strata: test each distinct pair once instead of running a
    # Python lambda over every row of the long table.
    pairs = df[["sex", "age_group"]].drop_duplicates()
    is_known = np.fromiter(
        ((s, a) in STRATA_INDEX for s, a in zip(pairs["sex"], pairs["age_group"])),
        dtype=bool,
        count=len(pairs),
    )
    unknown = pairs.loc[~is_known]
    if not unknown.empty:
        ex = unknown.head(50)
        raise RuntimeError(f"Found (sex,age_group) not in STRATA_INDEX (showing up to 50):\n{ex}")

    return df


def _lv3_wide(year_rows: pd.DataFrame, ordered_keys) -> pd.DataFrame:
    """Pivot one year of long rows into an [n_lsoa x 18] float table.

    Rows sharing the same (lsoa21cd, sex, age_group) are summed; strata that
    are absent (for one LSOA or for the whole year) are filled with 0.0.
    Columns are returned in ``ordered_keys`` order.
    """
    wide = year_rows.pivot_table(
        index="lsoa21cd",
        columns=["sex", "age_group"],
        values="population",
        aggfunc="sum",
        fill_value=0.0,
    )
    # One reindex both adds any missing stratum column (as 0.0) and puts the
    # columns into lv3 order.
    target = pd.MultiIndex.from_tuples(ordered_keys, names=["sex", "age_group"])
    return wide.reindex(columns=target, fill_value=0.0).astype(float)


def main() -> None:
    """Write one z-scored lv3 JSON per year in YEARS.

    Each output file maps lsoa21cd -> {"population_lv3": [18 values]}, where
    the values are the year's strata populations z-scored with the mu/std
    loaded from PARAMS_JSON.

    Raises:
        RuntimeError: on invalid params, unknown strata in the CSV, or a year
            in YEARS that has no rows.
    """
    # ---- Load params ----
    mu, std = _load_lv3_params()

    # ---- Load long CSV ----
    df = _load_projections()

    out_dir = Path(OUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Stratum keys in lv3 position order; identical for every year.
    ordered_keys = sorted(STRATA_INDEX, key=STRATA_INDEX.get)

    # ---- Build per-year JSON ----
    for yr in YEARS:
        year_rows = df[df["year"] == yr]
        if year_rows.empty:
            raise RuntimeError(f"No rows found for year {yr} in {LONG_CSV}")

        wide = _lv3_wide(year_rows, ordered_keys)

        # z-score using 2022 params; mu/std broadcast across the LSOA rows
        mat_norm = (wide.to_numpy(dtype=float) - mu) / std  # shape (N, 18)

        # write json
        lsoas = wide.index.to_list()
        out_json = {
            lsoa: {"population_lv3": row}
            for lsoa, row in zip(lsoas, mat_norm.tolist())
        }

        out_path = out_dir / f"population_LSOA21_{yr}_normalized.json"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(out_json, f, indent=2)

        print(f"[OK] Wrote {out_path}  (LSOAs={len(lsoas):,})")

if __name__ == "__main__":
    main()

[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2030_normalized.json  (LSOAs=33,755)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2035_normalized.json  (LSOAs=33,755)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2040_normalized.json  (LSOAs=33,755)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2045_normalized.json  (LSOAs=33,755)


In [4]:
#!/usr/bin/env python3
"""
Convert LSOA21 long-form projections to per-year JSON with *raw* lv3 (18 strata).

Inputs:
  1) Long CSV from proportional split:
     lsoa21cd,lad22cd,sex,age_group,year,population

Outputs:
  One JSON per year, dict keyed by lsoa21cd:
    {
      "E010....": {"population_lv3": [18 raw values]},
      ...
    }

Notes:
  - No lv1/lv2 output. Only lv3 (18 features).
  - Uses STRATA_INDEX to map (sex,age_group)->index.
"""

import json
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------- PATHS -----------------------------
# Input CSV (columns: lsoa21cd, lad22cd, sex, age_group, year, population).
LONG_CSV = "lsoa21_population_projections_2030_2035_2040_2045_long.csv"
# Output folder; one JSON file is written per entry of YEARS.
OUT_DIR = "lsoa21_population_projections_json"

YEARS = [2030, 2035, 2040, 2045]

# ----------------------------- MAPPING -----------------------------
# Position of every (sex, age_group) stratum inside the 18-element lv3 vector:
# the three "all"-age totals first (0-2), then the five age bands for
# total (3-7), male (8-12) and female (13-17).
_SEXES = ("total", "male", "female")
_AGE_BANDS = ("0_15", "16_24", "25_49", "50_64", "65+")

STRATA_INDEX = {
    stratum: position
    for position, stratum in enumerate(
        [(sex, "all") for sex in _SEXES]
        + [(sex, band) for sex in _SEXES for band in _AGE_BANDS]
    )
}

def _load_projections() -> pd.DataFrame:
    """Load LONG_CSV, normalize the strata labels and validate them.

    ``sex`` and ``age_group`` are stripped and lower-cased before being
    checked against STRATA_INDEX.

    Raises:
        RuntimeError: if any (sex, age_group) pair is not a STRATA_INDEX key.
    """
    df = pd.read_csv(
        LONG_CSV,
        dtype={"lsoa21cd": str, "lad22cd": str, "sex": str, "age_group": str, "year": int},
    )
    df["sex"] = df["sex"].str.strip().str.lower()
    df["age_group"] = df["age_group"].str.strip().str.lower()

    # validate strata: test each distinct pair once instead of running a
    # Python lambda over every row of the long table.
    pairs = df[["sex", "age_group"]].drop_duplicates()
    is_known = np.fromiter(
        ((s, a) in STRATA_INDEX for s, a in zip(pairs["sex"], pairs["age_group"])),
        dtype=bool,
        count=len(pairs),
    )
    unknown = pairs.loc[~is_known]
    if not unknown.empty:
        ex = unknown.head(50)
        raise RuntimeError(f"Found (sex,age_group) not in STRATA_INDEX (showing up to 50):\n{ex}")

    return df


def _lv3_wide(year_rows: pd.DataFrame, ordered_keys) -> pd.DataFrame:
    """Pivot one year of long rows into an [n_lsoa x 18] float table.

    Rows sharing the same (lsoa21cd, sex, age_group) are summed; strata that
    are absent (for one LSOA or for the whole year) are filled with 0.0.
    Columns are returned in ``ordered_keys`` order.
    """
    wide = year_rows.pivot_table(
        index="lsoa21cd",
        columns=["sex", "age_group"],
        values="population",
        aggfunc="sum",
        fill_value=0.0,
    )
    # One reindex both adds any missing stratum column (as 0.0) and puts the
    # columns into lv3 order.
    target = pd.MultiIndex.from_tuples(ordered_keys, names=["sex", "age_group"])
    return wide.reindex(columns=target, fill_value=0.0).astype(float)


def main() -> None:
    """Write one raw (un-normalized) lv3 JSON per year in YEARS.

    Each output file maps lsoa21cd -> {"population_lv3": [18 values]}, where
    the values are the year's strata populations in STRATA_INDEX order.

    Raises:
        RuntimeError: on unknown strata in the CSV, or a year in YEARS that
            has no rows.
    """
    # ---- Load long CSV ----
    df = _load_projections()

    out_dir = Path(OUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Stratum keys in lv3 position order; identical for every year.
    ordered_keys = sorted(STRATA_INDEX, key=STRATA_INDEX.get)

    # ---- Build per-year JSON (RAW) ----
    for yr in YEARS:
        year_rows = df[df["year"] == yr]
        if year_rows.empty:
            raise RuntimeError(f"No rows found for year {yr} in {LONG_CSV}")

        wide = _lv3_wide(year_rows, ordered_keys)
        mat = wide.to_numpy(dtype=float)  # (N, 18)

        lsoas = wide.index.to_list()
        out_json = {
            lsoa: {"population_lv3": row}
            for lsoa, row in zip(lsoas, mat.tolist())
        }

        out_path = out_dir / f"population_LSOA21_{yr}.json"  # <-- no "normalized"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(out_json, f, indent=2)

        print(f"[OK] Wrote {out_path}  (LSOAs={len(lsoas):,})")

if __name__ == "__main__":
    main()

[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2030.json  (LSOAs=33,755)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2035.json  (LSOAs=33,755)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2040.json  (LSOAs=33,755)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2045.json  (LSOAs=33,755)
