In [5]:
#!/usr/bin/env python3
"""
Downscale LAD population projections to LSOA21 by proportional split using 2022 LSOA baseline strata.

Inputs:
  1) Clean LAD projections CSV (already cleaned):
     lad_code,lad_name,sex,age_group,year,population

     - sex in: total, male, female
     - age_group in: all, 0_15, 16_24, 25_49, 50_64, 65+
     - year in: 2030, 2035, 2040, 2045

  2) LSOA11->LSOA21->LAD22 exact-fit lookup:
     LSOA_(2011)_to_LSOA_(2021)_to_Local_Authority_District_(2022)_Exact_Fit_Lookup_for_EW_(V3).csv
     columns include: LSOA21CD, LAD22CD

  3) 2022 LSOA21 baseline population JSON:
     ../2022/population_2022_LSOA21.json
     structure:
       pop[lsoa21cd]["population_lv3"] -> list length 18 in this order:
         0: total_all
         1: male_all
         2: female_all
         3: total_0_15
         4: total_16_24
         5: total_25_49
         6: total_50_64
         7: total_65p
         8: male_0_15
         9: male_16_24
        10: male_25_49
        11: male_50_64
        12: male_65p
        13: female_0_15
        14: female_16_24
        15: female_25_49
        16: female_50_64
        17: female_65p

Method:
  For each LAD, sex, age_group, year:
    LSOA_share = LSOA_baseline_2022_stratum / sum_LAD_baseline_2022_stratum
    LSOA_projection = LAD_projection * LSOA_share
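  If the LAD baseline stratum sum is 0 or missing, no population is allocated to its LSOAs by default;
  set ZERO_BASELINE_POLICY = "equal" to split the LAD projection equally across the LAD's LSOAs instead.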

Outputs:
  1) LSOA projections long CSV:
     lsoa21cd,lad22cd,sex,age_group,year,population

  2) Optional wide CSV per LSOA (years as columns): not implemented in this script, which writes only the long CSV above.

Notes:
  - This preserves LAD totals (up to floating + optional rounding).
  - If you need integer outputs, set ROUNDING = True; largest-remainder rounding is then applied
    within each (LAD, sex, age_group, year) group.
"""

import json
import math
from pathlib import Path

import numpy as np
import pandas as pd


# ----------------------------- PATHS -----------------------------
# Input and output locations. Relative paths are resolved against the current
# working directory of the process.

# Cleaned LAD projections (columns: lad_code, lad_name, sex, age_group, year, population).
LAD_CLEAN_CSV = "lad_population_projections_tidy.csv"  # <-- set your cleaned file path
# Exact-fit lookup; only the LSOA21CD and LAD22CD columns are read.
LOOKUP_CSV = "LSOA_(2011)_to_LSOA_(2021)_to_Local_Authority_District_(2022)_Exact_Fit_Lookup_for_EW_(V3).csv"
# 2022 baseline: JSON object keyed by LSOA21 code, each holding a "population_lv3" list.
LSOA_BASELINE_JSON = "../2022/population_2022_LSOA21.json"

# Destination of the long-format LSOA projections written by main().
OUT_LONG = "lsoa21_population_projections_2030_2035_2040_2045_long.csv"

# ----------------------------- OPTIONS -----------------------------
# Projection years to keep; LAD rows for any other year are discarded.
YEARS_KEEP = {2030, 2035, 2040, 2045}

# If LAD baseline stratum total is 0, choose allocation:
#   "zero"  -> allocate 0 to all LSOAs
#   "equal" -> allocate equally across LSOAs within LAD
# Any other value makes main() raise ValueError.
ZERO_BASELINE_POLICY = "zero"

# If True, output integer populations per LSOA using largest-remainder within each (lad,sex,age,year)
# If False, the unrounded float allocations are written as-is.
ROUNDING = False


# ----------------------------- STRATA MAP -----------------------------
# STRATA_INDEX maps each (sex, age_group) stratum to its position inside a
# baseline ``population_lv3`` list (18 strata in total).
#
# The layout is regular, so the table is generated rather than spelled out:
#   slots 0-2   -> all-ages totals, one per sex (total, male, female)
#   slots 3-17  -> the five age bands, repeated per sex in that same sex order
_STRATA_SEXES = ("total", "male", "female")
_STRATA_AGE_BANDS = ("0_15", "16_24", "25_49", "50_64", "65+")

# All-ages strata come first so the dict keeps the same iteration order as the
# baseline list itself.
STRATA_INDEX = {
    (sex_label, "all"): slot for slot, sex_label in enumerate(_STRATA_SEXES)
}
STRATA_INDEX.update(
    {
        (sex_label, band_label): (
            len(_STRATA_SEXES) + sex_pos * len(_STRATA_AGE_BANDS) + band_pos
        )
        for sex_pos, sex_label in enumerate(_STRATA_SEXES)
        for band_pos, band_label in enumerate(_STRATA_AGE_BANDS)
    }
)


def load_baseline_json(path: str) -> dict:
    """Read the baseline population JSON file at *path* and return its content.

    The file is decoded as UTF-8 explicitly: JSON interchange files are UTF-8,
    whereas ``open()`` without an encoding falls back to the platform locale
    and can fail or mis-decode non-ASCII text on some systems.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def safe_get_lv3(pop_dict: dict, lsoa21cd: str, n_strata: int = 18):
    """Return the ``population_lv3`` list for *lsoa21cd*, or None if unusable.

    Args:
        pop_dict: baseline mapping keyed by LSOA21 code.
        lsoa21cd: code to look up.
        n_strata: required length of the ``population_lv3`` list (18 by default).

    Returns:
        The stored list/tuple when the record is a dict whose
        ``population_lv3`` entry is a list or tuple of exactly *n_strata*
        items; otherwise None. Malformed records (non-dict entry, scalar or
        string ``population_lv3``) also yield None instead of raising, so the
        caller can count them as missing.
    """
    record = pop_dict.get(lsoa21cd)
    if not isinstance(record, dict):
        return None
    lv3 = record.get("population_lv3")
    # Require a real sequence: a string of the right length must not pass.
    if not isinstance(lv3, (list, tuple)) or len(lv3) != n_strata:
        return None
    return lv3


def largest_remainder_round(group_df: pd.DataFrame, total_int: int, value_col: str = "population_float"):
    """Round non-negative floats to integers that sum to *total_int*.

    Uses the largest-remainder method: every value is floored, then the
    difference between *total_int* and the sum of the floors is handed out one
    unit at a time, starting with the largest fractional parts (or taken away
    starting with the smallest fractional parts when the floors overshoot).

    If the difference is larger than the number of values, whole units are
    first spread evenly over all values so the result still sums to
    *total_int*. Ties between equal fractional parts are resolved by row
    order (stable sort), which keeps the output reproducible.

    Args:
        group_df: frame holding the values to round.
        total_int: integer total the result must add up to.
        value_col: name of the column in *group_df* containing the floats.

    Returns:
        Integer numpy array aligned with the rows of *group_df*. An empty
        input yields an empty array regardless of *total_int*.

    Raises:
        ValueError: if any value is NaN or infinite (such values cannot be
            converted to integers meaningfully).
    """
    vals = group_df[value_col].to_numpy(dtype=float)
    n = vals.size
    if n == 0:
        return np.zeros(0, dtype=int)
    if not np.isfinite(vals).all():
        raise ValueError(f"Column '{value_col}' contains non-finite values; cannot round.")

    floors = np.floor(vals).astype(int)
    remainder = vals - floors
    need = int(total_int) - int(floors.sum())
    if need == 0:
        return floors

    step = 1 if need > 0 else -1
    # Whole units for everyone when the gap exceeds one unit per element.
    per_item, leftover = divmod(abs(need), n)
    if per_item:
        floors += step * per_item
    if leftover:
        # Add to the largest remainders, or remove from the smallest ones.
        sort_key = -remainder if need > 0 else remainder
        order = np.argsort(sort_key, kind="stable")
        floors[order[:leftover]] += step
    return floors


def main():
    """Downscale the LAD projections to LSOA21 and write the long CSV.

    Steps:
      1. Read the LSOA21 -> LAD22 lookup and the 2022 baseline JSON, and build
         a long baseline table with one row per (LSOA, sex, age_group).
      2. Read the cleaned LAD projections, keep only YEARS_KEEP and check that
         every (sex, age_group) pair is known to STRATA_INDEX.
      3. Compute each LSOA's share of its LAD baseline stratum and multiply the
         LAD projection by that share. Strata whose LAD baseline is 0 follow
         ZERO_BASELINE_POLICY.
      4. Optionally (ROUNDING) convert to integers with largest-remainder
         rounding per (LAD, sex, age_group, year).
      5. Write OUT_LONG and print a short summary.

    LAD codes present in the projections but without any baseline LSOA cannot
    be downscaled; they are reported with a warning and produce no output rows.

    Raises:
        ValueError: if ZERO_BASELINE_POLICY is not 'zero' or 'equal', or if
            ROUNDING is enabled while a LAD population value is missing.
        RuntimeError: if no baseline rows could be built, or if the LAD file
            contains strata that are not in STRATA_INDEX.
    """
    # Fail fast on a bad option before doing any expensive I/O.
    if ZERO_BASELINE_POLICY not in ("zero", "equal"):
        raise ValueError("ZERO_BASELINE_POLICY must be 'zero' or 'equal'.")

    strata_keys = ["lad22cd", "sex", "age_group"]

    # ---- Load lookup: LSOA21 -> LAD22 ----
    lookup = pd.read_csv(LOOKUP_CSV, dtype=str, usecols=["LSOA21CD", "LAD22CD"])
    # The lookup is keyed by LSOA11, so the same LSOA21/LAD22 pair can repeat.
    lookup = lookup.dropna(subset=["LSOA21CD", "LAD22CD"]).drop_duplicates()

    # ---- Load baseline JSON and assemble LSOA baseline table ----
    pop = load_baseline_json(LSOA_BASELINE_JSON)

    # Only LSOAs that appear in the lookup are kept (so they map to a LAD22).
    baseline_rows = []
    missing = 0
    for lsoa21cd, lad22cd in zip(lookup["LSOA21CD"].to_numpy(), lookup["LAD22CD"].to_numpy()):
        lv3 = safe_get_lv3(pop, lsoa21cd)
        if lv3 is None:
            missing += 1
            continue
        baseline_rows.extend(
            (lsoa21cd, lad22cd, sex, age_group, float(lv3[idx]))
            for (sex, age_group), idx in STRATA_INDEX.items()
        )

    baseline = pd.DataFrame(
        baseline_rows, columns=["lsoa21cd", "lad22cd", "sex", "age_group", "baseline_2022"]
    )

    if baseline.empty:
        raise RuntimeError("Baseline table is empty. Check JSON keys (LSOA21CD) and population_lv3 structure.")

    if missing > 0:
        print(f"[WARN] Missing/invalid baseline for {missing} LSOA21 codes (skipped).")

    # ---- Load cleaned LAD projections ----
    lad = pd.read_csv(LAD_CLEAN_CSV, dtype={"lad_code": str, "lad_name": str, "sex": str, "age_group": str, "year": int})
    lad = lad.rename(columns={"lad_code": "lad22cd"})
    lad["sex"] = lad["sex"].str.strip().str.lower()
    lad["age_group"] = lad["age_group"].str.strip().str.lower()
    lad = lad[lad["year"].isin(YEARS_KEEP)].copy()

    # Validate strata exist in map (plain zip instead of a row-wise apply,
    # which is slow and misbehaves on an empty frame).
    known = np.asarray(
        [key in STRATA_INDEX for key in zip(lad["sex"], lad["age_group"])], dtype=bool
    )
    bad = lad[~known]
    if not bad.empty:
        ex = bad[["sex", "age_group"]].drop_duplicates().head(20)
        raise RuntimeError(f"Found strata not in STRATA_INDEX mapping (showing up to 20):\n{ex}")

    # ---- Compute LAD baseline totals per stratum ----
    lad_base = (
        baseline.groupby(strata_keys, as_index=False)["baseline_2022"]
        .sum()
        .rename(columns={"baseline_2022": "lad_baseline_2022"})
    )

    # ---- Join LAD projections with LAD baseline totals ----
    lad2 = lad.merge(lad_base, on=strata_keys, how="left")

    # A missing LAD total means the LAD has no baseline LSOA at all.
    no_baseline = lad2.loc[lad2["lad_baseline_2022"].isna(), "lad22cd"].astype(str).unique().tolist()
    if no_baseline:
        print(
            f"[WARN] {len(no_baseline)} LAD code(s) in projections have no baseline LSOAs "
            f"(no output rows; showing up to 20): {sorted(no_baseline)[:20]}"
        )

    # ---- Expand to LSOA by joining baseline rows for matching LAD/stratum ----
    expanded = lad2.merge(baseline, on=strata_keys, how="left")
    # Rows without an LSOA code would only add empty-key records to the output.
    expanded = expanded[expanded["lsoa21cd"].notna()].reset_index(drop=True)

    expanded["lad_baseline_2022"] = expanded["lad_baseline_2022"].fillna(0.0)
    expanded["baseline_2022"] = expanded["baseline_2022"].fillna(0.0)

    # ---- Shares ----
    if ZERO_BASELINE_POLICY == "equal":
        # Number of LSOAs per (lad, sex, age) for the equal split fallback.
        counts = baseline.groupby(strata_keys, as_index=False)["lsoa21cd"].nunique()
        counts = counts.rename(columns={"lsoa21cd": "n_lsoa"})
        expanded = expanded.merge(counts, on=strata_keys, how="left")
        expanded["n_lsoa"] = expanded["n_lsoa"].fillna(0).astype(int)
        n_lsoa = expanded["n_lsoa"].to_numpy(dtype=float)
        fallback = np.divide(1.0, n_lsoa, out=np.zeros_like(n_lsoa), where=n_lsoa > 0)
    else:
        fallback = 0.0

    # Proportional share where the LAD baseline is positive, fallback elsewhere.
    expanded["share"] = np.where(
        expanded["lad_baseline_2022"] > 0,
        expanded["baseline_2022"] / expanded["lad_baseline_2022"],
        fallback,
    )

    # ---- Project ----
    expanded["population_float"] = expanded["population"].astype(float) * expanded["share"]

    out_cols = ["lsoa21cd", "lad22cd", "sex", "age_group", "year"]

    if ROUNDING:
        if expanded["population"].isna().any():
            raise ValueError("ROUNDING requires non-missing LAD population values.")

        floats = expanded["population_float"].to_numpy(dtype=float)
        shares = expanded["share"].to_numpy(dtype=float)
        lad_totals = expanded["population"].round().astype(int).to_numpy()
        rounded = np.zeros(len(expanded), dtype=int)

        # Explicit loop over group positions instead of groupby.apply: the
        # latter relies on grouping columns being passed to the callback,
        # which pandas has deprecated, and it may reorder rows.
        groups = expanded.groupby(["lad22cd", "sex", "age_group", "year"], sort=False).indices
        for positions in groups.values():
            # A group whose shares are all zero was allocated nothing, so its
            # integer target is 0 rather than the LAD total.
            if shares[positions].sum() > 0:
                target = int(lad_totals[positions[0]])
            else:
                target = 0
            rounded[positions] = largest_remainder_round(
                pd.DataFrame({"population_float": floats[positions]}),
                target,
                value_col="population_float",
            )

        expanded["population"] = rounded
        out = expanded[out_cols + ["population"]]
    else:
        out = expanded[out_cols + ["population_float"]].rename(columns={"population_float": "population"})

    # ---- Write ----
    out.to_csv(OUT_LONG, index=False)
    print(f"[OK] Wrote: {OUT_LONG}")
    print(f"Rows: {len(out):,}")
    print(f"Unique LSOAs: {out['lsoa21cd'].nunique():,}")
    print(f"Years: {sorted(out['year'].unique().tolist())}")
    print("Done.")


# Run the downscaling only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

[OK] Wrote: lsoa21_population_projections_2030_2035_2040_2045_long.csv
Rows: 2,430,360
Unique LSOAs: 33,755
Years: [2030, 2035, 2040, 2045]
Done.
