# In [1]:
#!/usr/bin/env python3
"""
Build final per-year normalized feature JSONs:
- Keep only:
    ["population"]["lv3"]  (18 features)
    ["area_popdensity"]    ([area_norm, popdens_norm])
- Output file per year:
    lsoa21_features_normalized_{year}.json

Inputs:
  1) Area+popdensity normalized projections:
     additional/lsoa21_area_popdensity_projections_json/lsoa21_area_popdensity_{year}_normalized.json
     structure: { "LSOA21CD": [area_norm, popdens_norm], ... }

  2) Population normalized projections:
     population/projections/population_LSOA21_{year}_normalized.json
     structure: { "LSOA21CD": { "population_lv3": [18], ... }, ... }
"""

import json
from pathlib import Path

# ------------------ CONFIGURATION ------------------
# Projection years to process; one output file is written per year.
YEARS = list(range(2030, 2050, 5))

# Input 1: normalized [area, population-density] vectors keyed by LSOA code.
AREA_POPDENS_DIR = Path("additional/lsoa21_area_popdensity_projections_json")
AREA_POPDENS_FN = "lsoa21_area_popdensity_{year}_normalized.json"

# Input 2: normalized population projections keyed by LSOA code.
POP_NORM_DIR = Path("population/projections")
POP_NORM_FN = "population_LSOA21_{year}_normalized.json"

# Output: merged feature files go to the current working directory.
OUT_DIR = Path(".")
OUT_FN = "lsoa21_features_normalized_{year}.json"

# Join policy for LSOAs that lack a usable part on either side:
#   False -> strict: an LSOA is written only when both parts are present
#   True  -> lenient: the LSOA is written and the missing part is None
ALLOW_MISSING = False


def load_json(path: Path):
    """Read the file at *path* and return its parsed JSON content.

    The file is decoded explicitly as UTF-8 so that parsing does not depend
    on the platform's locale encoding.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The deserialized JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# Expected vector lengths for the two feature groups.
_LV3_LEN = 18
_APD_LEN = 2


def _valid_list(value, length):
    """Return *value* if it is a list of exactly *length* items, else None."""
    if isinstance(value, list) and len(value) == length:
        return value
    return None


def _merge_year(area, pop, allow_missing):
    """Merge one year's area/pop-density and population mappings.

    Every LSOA present in either input is inspected so that the counters
    describe the whole input, including LSOAs that strict mode drops.

    Args:
        area: Mapping of LSOA code to a 2-item list.
        pop: Mapping of LSOA code to an object holding "population_lv3".
        allow_missing: If False, only LSOAs with both parts valid are kept;
            if True, every LSOA is kept and unusable parts become None.

    Returns:
        A ``(out, stats)`` tuple: ``out`` maps each kept LSOA code to its
        feature record, ``stats`` holds the ``missing_area``,
        ``missing_pop``, ``bad_lv3`` and ``bad_area`` counters.
    """
    stats = {"missing_area": 0, "missing_pop": 0, "bad_lv3": 0, "bad_area": 0}
    out = {}

    for lsoa in sorted(set(area) | set(pop)):
        area_vec = area.get(lsoa)
        pop_obj = pop.get(lsoa)

        # Population part: absent entries are "missing", malformed ones "bad".
        lv3 = None
        if pop_obj is None:
            stats["missing_pop"] += 1
        else:
            # A non-dict entry is malformed, not a reason to crash.
            raw_lv3 = pop_obj.get("population_lv3") if isinstance(pop_obj, dict) else None
            lv3 = _valid_list(raw_lv3, _LV3_LEN)
            if lv3 is None:
                stats["bad_lv3"] += 1

        # Area/pop-density part, same missing-vs-bad distinction.
        apd = None
        if area_vec is None:
            stats["missing_area"] += 1
        else:
            apd = _valid_list(area_vec, _APD_LEN)
            if apd is None:
                stats["bad_area"] += 1

        # Strict mode keeps only LSOAs where both parts are clean.
        if not allow_missing and (lv3 is None or apd is None):
            continue

        out[lsoa] = {
            "population": {"lv3": lv3},
            "area_popdensity": apd,
        }

    return out, stats


def main():
    """Build one merged, normalized feature JSON per year in ``YEARS``.

    For each year the area/pop-density file and the population file are
    loaded, joined on LSOA code according to ``ALLOW_MISSING``, validated,
    and written to ``OUT_DIR``. A one-line summary is printed per year.

    The ``missing_*`` counters in the summary include LSOAs that appear in
    only one input, so entries dropped by the strict join are reported
    instead of silently disappearing.

    Raises:
        RuntimeError: If the key join yields no LSOAs, or if no LSOA
            survives validation for a year.
    """
    for yr in YEARS:
        area_path = AREA_POPDENS_DIR / AREA_POPDENS_FN.format(year=yr)
        pop_path = POP_NORM_DIR / POP_NORM_FN.format(year=yr)
        out_path = OUT_DIR / OUT_FN.format(year=yr)

        area = load_json(area_path)  # {lsoa: [area_norm, popdens_norm]}
        pop = load_json(pop_path)    # {lsoa: {"population_lv3":[18], ...}}

        area_keys = set(area)
        pop_keys = set(pop)
        joined = (area_keys | pop_keys) if ALLOW_MISSING else (area_keys & pop_keys)

        # Fail early with the raw key counts when the join itself is empty.
        if not joined:
            raise RuntimeError(
                f"Year {yr}: 0 LSOAs after key join. "
                f"area={len(area_keys):,} pop={len(pop_keys):,} allow_missing={ALLOW_MISSING}"
            )

        out, stats = _merge_year(area, pop, ALLOW_MISSING)
        missing_area = stats["missing_area"]
        missing_pop = stats["missing_pop"]
        bad_lv3 = stats["bad_lv3"]
        bad_area = stats["bad_area"]

        # In strict mode, bad entries can leave nothing to write.
        if not out:
            raise RuntimeError(
                f"Year {yr}: output empty after validation. "
                f"(bad_lv3={bad_lv3}, bad_area={bad_area})"
            )

        # Make sure the target directory exists if OUT_DIR is reconfigured.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(out, f, indent=2)

        print(
            f"[OK] {yr}: wrote {out_path} | LSOAs={len(out):,} "
            f"(missing_area={missing_area:,}, missing_pop={missing_pop:,}, "
            f"bad_lv3={bad_lv3:,}, bad_area={bad_area:,})"
        )


# Run the build only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

# Recorded output of the run above:
# [OK] 2030: wrote lsoa21_features_normalized_2030.json | LSOAs=35,672 (missing_area=0, missing_pop=0, bad_lv3=0, bad_area=0)
# [OK] 2035: wrote lsoa21_features_normalized_2035.json | LSOAs=35,672 (missing_area=0, missing_pop=0, bad_lv3=0, bad_area=0)
# [OK] 2040: wrote lsoa21_features_normalized_2040.json | LSOAs=35,672 (missing_area=0, missing_pop=0, bad_lv3=0, bad_area=0)
# [OK] 2045: wrote lsoa21_features_normalized_2045.json | LSOAs=35,672 (missing_area=0, missing_pop=0, bad_lv3=0, bad_area=0)
