In [1]:
#!/usr/bin/env python3
"""
Convert Wales LSOA21 long-form projections to per-year JSON with normalized lv3 (18 strata),
using 2022 normalization params.

Inputs:
  1) Wales long CSV:
     lsoa21cd,country,sex,age_group,year,population
     (country should be "Wales"; lad22cd may or may not exist)

  2) Normalization params JSON:
     ../2022/lsoa21_population_2022_normalized_params.json
     {"lv3": {"mu":[18], "std":[18], "feature_order":[18]}}

Outputs (in OUT_DIR, same folder as England script):
  population_LSOA21_2030_normalized_Wales.json
  population_LSOA21_2035_normalized_Wales.json
  population_LSOA21_2040_normalized_Wales.json
  population_LSOA21_2045_normalized_Wales.json

Each JSON:
  { "W010....": {"population_lv3": [18 z-scored values]}, ... }
"""

import json
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------- PATHS -----------------------------
# Long-form Wales projections CSV (one row per LSOA / sex / age_group / year).
LONG_CSV_WALES = "lsoa21_population_projections_2030_2035_2040_2045_long_Wales.csv"
# 2022 normalization parameters; the "lv3" entry carries mu / std / feature_order.
PARAMS_JSON = "../2022/lsoa21_population_2022_normalized_params.json"
OUT_DIR = "lsoa21_population_projections_json"  # same folder as England JSONs

# Projection years; one output JSON is produced per entry.
YEARS = [2030, 2035, 2040, 2045]

# ----------------------------- MAPPING -----------------------------
# lv3 layout: the three "all ages" totals first (total, male, female), then the
# five age bands for total, for male and for female -- 18 strata overall.
_SEXES = ("total", "male", "female")
_AGE_BANDS = ("0_15", "16_24", "25_49", "50_64", "65+")

# (sex, age_group) -> position in the lv3 vector. Insertion order equals
# position order, so iterating the dict walks the vector left to right.
STRATA_INDEX = {(sex, "all"): slot for slot, sex in enumerate(_SEXES)}
STRATA_INDEX.update(
    {
        (sex, band): len(_SEXES) + sex_slot * len(_AGE_BANDS) + band_slot
        for sex_slot, sex in enumerate(_SEXES)
        for band_slot, band in enumerate(_AGE_BANDS)
    }
)

# Feature names ("<sex>_<age_group>") in lv3 order; compared against the
# feature_order stored in the params JSON.
EXPECTED_FEATURE_ORDER = [f"{sex}_{age_group}" for sex, age_group in STRATA_INDEX]

def main():
    """Write one z-scored lv3 JSON file per projection year for Wales LSOAs.

    Reads the normalization parameters from ``PARAMS_JSON`` and the long-form
    projections from ``LONG_CSV_WALES``, pivots each year in ``YEARS`` into an
    (LSOA x 18 strata) table ordered by ``STRATA_INDEX``, z-scores it with the
    stored ``mu`` / ``std`` and writes
    ``population_LSOA21_<year>_normalized_Wales.json`` into ``OUT_DIR``.

    All validation (params, strata, year coverage) happens before the first
    file is written, so a failure never leaves a partial set of outputs.

    Raises:
        RuntimeError: if ``feature_order`` is missing, has the wrong length or
            differs from ``EXPECTED_FEATURE_ORDER``; if ``mu`` / ``std`` do not
            have one entry per stratum; if the CSV holds a (sex, age_group)
            pair outside ``STRATA_INDEX``; or if a requested year has no rows.
    """
    # ---- Load params ----
    with open(PARAMS_JSON, "r", encoding="utf-8") as f:
        params = json.load(f)

    mu = np.asarray(params["lv3"]["mu"], dtype=float)
    std = np.asarray(params["lv3"]["std"], dtype=float)

    feature_order = params["lv3"].get("feature_order", None)
    if feature_order is None or len(feature_order) != 18:
        raise RuntimeError("params['lv3']['feature_order'] missing or not length 18.")
    if feature_order != EXPECTED_FEATURE_ORDER:
        raise RuntimeError(
            "feature_order in params does not match expected lv3 order.\n"
            f"Expected: {EXPECTED_FEATURE_ORDER}\nGot:      {feature_order}"
        )

    # A wrong-length vector would either broadcast silently (length 1) or fail
    # later with an opaque NumPy error, so check the shapes explicitly.
    n_strata = len(STRATA_INDEX)
    if mu.shape != (n_strata,) or std.shape != (n_strata,):
        raise RuntimeError(
            f"params['lv3'] mu/std must each have {n_strata} values; "
            f"got shapes mu={mu.shape}, std={std.shape}."
        )

    # Constant features have std == 0; divide by 1 instead to avoid inf/NaN.
    std = np.where(std == 0.0, 1.0, std)

    # ---- Load Wales long CSV ----
    df = pd.read_csv(
        LONG_CSV_WALES,
        dtype={"lsoa21cd": str, "country": str, "sex": str, "age_group": str, "year": int},
    )
    df["sex"] = df["sex"].str.strip().str.lower()
    df["age_group"] = df["age_group"].str.strip().str.lower()

    # Optional: keep only Wales rows if country exists
    if "country" in df.columns:
        df = df[df["country"].str.strip().str.lower() == "wales"].copy()

    # Validate strata on the distinct (sex, age_group) pairs only; this avoids
    # a Python-level callback for every row of the CSV.
    pairs = df[["sex", "age_group"]].drop_duplicates()
    unknown = pd.Series(
        [pair not in STRATA_INDEX for pair in zip(pairs["sex"], pairs["age_group"])],
        index=pairs.index,
        dtype=bool,
    )
    bad = pairs[unknown]
    if not bad.empty:
        ex = bad.head(50)
        raise RuntimeError(f"Found (sex,age_group) not in STRATA_INDEX (showing up to 50):\n{ex}")

    # Check every requested year up front, before any output is produced.
    rows_by_year = {}
    for yr in YEARS:
        rows = df[df["year"] == yr]
        if rows.empty:
            raise RuntimeError(f"No rows found for year {yr} in {LONG_CSV_WALES}")
        rows_by_year[yr] = rows

    out_dir = Path(OUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)

    # ---- Build per-year JSON ----
    # Target columns in lv3 order (sorted by their STRATA_INDEX position).
    lv3_columns = pd.MultiIndex.from_tuples(
        sorted(STRATA_INDEX, key=STRATA_INDEX.get),
        names=["sex", "age_group"],
    )

    for yr, rows in rows_by_year.items():
        wide = rows.pivot_table(
            index="lsoa21cd",
            columns=["sex", "age_group"],
            values="population",
            aggfunc="sum",
            fill_value=0.0,
        )

        # Put columns into lv3 order; strata absent for this year become 0.0.
        wide = wide.reindex(columns=lv3_columns, fill_value=0.0).astype(float)

        # z-score using 2022 params (mu / std broadcast across LSOA rows)
        mat_norm = (wide.to_numpy(dtype=float) - mu) / std

        # write json
        lsoas = wide.index.to_list()
        out_json = {
            lsoa: {"population_lv3": values}
            for lsoa, values in zip(lsoas, mat_norm.tolist())
        }

        out_path = out_dir / f"population_LSOA21_{yr}_normalized_Wales.json"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(out_json, f, indent=2)

        print(f"[OK] Wrote {out_path}  (LSOAs={len(lsoas):,})")


if __name__ == "__main__":
    main()

[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2030_normalized_Wales.json  (LSOAs=1,917)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2035_normalized_Wales.json  (LSOAs=1,917)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2040_normalized_Wales.json  (LSOAs=1,917)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2045_normalized_Wales.json  (LSOAs=1,917)


In [1]:
#!/usr/bin/env python3
"""
Convert Wales LSOA21 long-form projections to per-year JSON with RAW lv3 (18 strata).

Inputs:
  1) Wales long CSV:
     lsoa21cd,country,sex,age_group,year,population

  Note: the normalization params JSON is NOT read by this script (output is raw, not z-scored).

Outputs:
  population_LSOA21_2030_Wales.json
  population_LSOA21_2035_Wales.json
  population_LSOA21_2040_Wales.json
  population_LSOA21_2045_Wales.json

Each JSON:
  { "W010....": {"population_lv3": [18 raw values]}, ... }
"""

import json
from pathlib import Path
import pandas as pd

# ----------------------------- PATHS -----------------------------
# Long-form Wales projections CSV (one row per LSOA / sex / age_group / year).
LONG_CSV_WALES = "lsoa21_population_projections_2030_2035_2040_2045_long_Wales.csv"
# Directory that receives the per-year JSON files.
OUT_DIR = "lsoa21_population_projections_json"

# Projection years; one output JSON is produced per entry.
YEARS = [2030, 2035, 2040, 2045]

# ----------------------------- STRATA MAP -----------------------------
# lv3 layout: the three "all ages" totals first (total, male, female), then the
# five age bands for total, for male and for female -- 18 strata overall.
_SEXES = ("total", "male", "female")
_AGE_BANDS = ("0_15", "16_24", "25_49", "50_64", "65+")

# (sex, age_group) -> position in the lv3 vector. Insertion order equals
# position order, so iterating the dict walks the vector left to right.
STRATA_INDEX = {(sex, "all"): slot for slot, sex in enumerate(_SEXES)}
STRATA_INDEX.update(
    {
        (sex, band): len(_SEXES) + sex_slot * len(_AGE_BANDS) + band_slot
        for sex_slot, sex in enumerate(_SEXES)
        for band_slot, band in enumerate(_AGE_BANDS)
    }
)

def main():
    """Write one raw (un-normalized) lv3 JSON file per projection year for Wales LSOAs.

    Reads the long-form projections from ``LONG_CSV_WALES``, pivots each year
    in ``YEARS`` into an (LSOA x 18 strata) table ordered by ``STRATA_INDEX``
    and writes ``population_LSOA21_<year>_Wales.json`` into ``OUT_DIR``.

    Strata and year coverage are validated before the first file is written,
    so a failure never leaves a partial set of outputs.

    Raises:
        RuntimeError: if the CSV holds a (sex, age_group) pair outside
            ``STRATA_INDEX`` or if a requested year has no rows.
    """
    df = pd.read_csv(
        LONG_CSV_WALES,
        dtype={"lsoa21cd": str, "country": str, "sex": str, "age_group": str, "year": int},
    )

    df["sex"] = df["sex"].str.strip().str.lower()
    df["age_group"] = df["age_group"].str.strip().str.lower()

    # Keep Wales only (defensive)
    if "country" in df.columns:
        df = df[df["country"].str.strip().str.lower() == "wales"].copy()

    # Validate strata on the distinct (sex, age_group) pairs only; this avoids
    # a Python-level callback for every row of the CSV.
    pairs = df[["sex", "age_group"]].drop_duplicates()
    unknown = pd.Series(
        [pair not in STRATA_INDEX for pair in zip(pairs["sex"], pairs["age_group"])],
        index=pairs.index,
        dtype=bool,
    )
    bad = pairs[unknown]
    if not bad.empty:
        ex = bad.head(50)
        raise RuntimeError(f"Found invalid (sex,age_group):\n{ex}")

    # Check every requested year up front, before any output is produced.
    rows_by_year = {}
    for yr in YEARS:
        rows = df[df["year"] == yr]
        if rows.empty:
            raise RuntimeError(f"No rows for year {yr}")
        rows_by_year[yr] = rows

    out_dir = Path(OUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Column order = lv3 order (sorted by their STRATA_INDEX position)
    lv3_columns = pd.MultiIndex.from_tuples(
        sorted(STRATA_INDEX, key=STRATA_INDEX.get),
        names=["sex", "age_group"],
    )

    for yr, rows in rows_by_year.items():
        wide = rows.pivot_table(
            index="lsoa21cd",
            columns=["sex", "age_group"],
            values="population",
            aggfunc="sum",
            fill_value=0.0,
        )

        # Put columns into lv3 order; strata absent for this year become 0.0.
        wide = wide.reindex(columns=lv3_columns, fill_value=0.0).astype(float)

        # Convert the whole table once instead of one .loc lookup per LSOA.
        out_json = {
            lsoa: {"population_lv3": values}
            for lsoa, values in zip(wide.index.to_list(), wide.to_numpy(dtype=float).tolist())
        }

        out_path = out_dir / f"population_LSOA21_{yr}_Wales.json"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(out_json, f, indent=2)

        print(f"[OK] Wrote {out_path}  (LSOAs={len(out_json):,})")


if __name__ == "__main__":
    main()

[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2030_Wales.json  (LSOAs=1,917)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2035_Wales.json  (LSOAs=1,917)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2040_Wales.json  (LSOAs=1,917)
[OK] Wrote lsoa21_population_projections_json\population_LSOA21_2045_Wales.json  (LSOAs=1,917)
