In [1]:
# This script merges population, employment, land use, POI and additional features into one JSON file.
# All features are raw (not normalized).

In [6]:
import json
from pathlib import Path

# -------- RAW feature files (no normalization) --------
# Maps each feature-group name to the path of its raw JSON input.
# Insertion order is the order in which the files are loaded.
feature_files = dict(
    population="population/2022/population_2022_LSOA21_raw.json",
    employment="employment/lsoa21_employment_2022_raw.json",
    land_use="land_use/lsoa21_land_use_raw.json",
    poi="POI/lsoa21_poi_raw.json",
    area_popdensity="additional/lsoa21_area_popdensity_2022_raw.json",
    households="additional/lsoa21_households_raw.json",
    imd="additional/lsoa21_imd2019_income_employment_raw.json",
)

# Path the merged result is written to.
OUTPUT_JSON = "lsoa21_features_raw.json"

# -------- Load all inputs --------
# Parse every configured JSON file into `loaded`, keyed by feature-group name.
# The first path that does not exist aborts the cell with FileNotFoundError.
loaded = {}
for feature, rel_path in feature_files.items():
    source = Path(rel_path)
    if not source.exists():
        raise FileNotFoundError(f"Missing file for '{feature}': {rel_path}")
    # Read the whole file as UTF-8 text, then decode it as JSON.
    loaded[feature] = json.loads(source.read_text(encoding="utf-8"))

# -------- Determine common LSOA set --------
# Only codes present in *every* input can be merged, because the build step
# below indexes each loaded object by the same key.
key_sets = {name: set(obj.keys()) for name, obj in loaded.items()}
common_keys = set.intersection(*key_sets.values()) if key_sets else set()

if not common_keys:
    raise RuntimeError("No common LSOA21CD keys across the provided inputs.")

# Taking the intersection discards any code that is absent from at least one
# input. Report that explicitly so a short or stale input file cannot shrink
# the merged output unnoticed.
all_keys = set().union(*key_sets.values())
dropped_keys = all_keys - common_keys
if dropped_keys:
    print(
        f"[WARN] {len(dropped_keys):,} LSOA21CD codes are missing from at least "
        f"one input and will be dropped from the merge:"
    )
    for name, keys in key_sets.items():
        n_missing = len(all_keys - keys)
        if n_missing:
            print(f"  {name}: missing {n_missing:,}")

# -------- Build merged RAW structure --------
def _levels(record, prefix=""):
    """Return {"lv1", "lv2", "lv3"} read from `record` under `prefix` + level name.

    A level that is absent from `record` is filled with a new empty list.
    """
    return {lv: record.get(f"{prefix}{lv}", []) for lv in ("lv1", "lv2", "lv3")}


# One entry per common code, visited in sorted order so the output is stable.
merged = {}
for code in sorted(common_keys):
    merged[code] = {
        # Level-structured groups: source keys are prefixed for population and
        # employment, and unprefixed for households.
        "population": _levels(loaded["population"][code], "population_"),
        "employment": _levels(loaded["employment"][code], "employment_"),
        "households": _levels(loaded["households"][code]),
        # Remaining groups are copied through as stored in their input file.
        "area_popdensity": loaded["area_popdensity"][code],  # [area_sqkm, popdens_2022]
        "land_use": loaded["land_use"][code],  # [commercial, industrial, residential, retail]
        "poi": loaded["poi"][code],  # [education, food, health, retail, transport]
        "imd": loaded["imd"][code],  # [income_score, employment_score]
    }

# -------- Save --------
# Write the merged mapping as indented UTF-8 JSON, keeping non-ASCII
# characters unescaped, then print a one-line summary.
out_path = Path(OUTPUT_JSON)
with out_path.open("w", encoding="utf-8") as sink:
    json.dump(merged, sink, indent=2, ensure_ascii=False)

summary = f"Merged {len(merged):,} LSOA21CD entries → {OUTPUT_JSON}"
print(summary)

Merged 35,672 LSOA21CD entries → lsoa21_features_raw.json


In [5]:
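# Quick sanity check: report whether each *normalized* input JSON has exactly 35,672 LSOA21 codes
# (these are the *_normalized.json files, not the raw files merged above),
# and report any keys missing vs. the union or extra vs. the intersection.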

import json
from pathlib import Path

# Number of LSOA21 codes each input file is expected to contain.
EXPECTED = 35672

# Feature-group name -> path of the normalized JSON file to check.
files = dict(
    population="population/lsoa21_population_2022_normalized.json",
    employment="employment/lsoa21_employment_2022_normalized.json",
    land_use="land_use/lsoa21_land_use_normalized.json",
    poi="POI/lsoa21_poi_normalized.json",
    area_popdensity="additional/lsoa21_area_popdensity_2022_normalized.json",
    households="additional/lsoa21_households_normalized.json",
    imd="additional/lsoa21_imd2019_income_employment_normalized.json",
)

loaded = {}    # feature-group name -> parsed JSON object
keys_map = {}  # feature-group name -> set of that object's top-level keys

# Load each file and print its key count next to an OK / MISMATCH flag.
# A missing file is reported and skipped rather than aborting the check.
for label, rel_path in files.items():
    source = Path(rel_path)
    if not source.exists():
        print(f"[ERROR] Missing file: {label} -> {rel_path}")
        continue
    with source.open("r", encoding="utf-8") as handle:
        loaded[label] = json.load(handle)
    keys_map[label] = set(loaded[label].keys())
    n_keys = len(keys_map[label])
    status = "OK" if n_keys == EXPECTED else "MISMATCH"
    print(f"{label:16s}: {n_keys:5d} entries  {status}")

# Compare key sets
# Skipped entirely when no file could be loaded (keys_map is empty).
if keys_map:
    # Union = every code seen in any file; intersection = codes present in all files.
    all_union = set().union(*keys_map.values())
    # set.intersection works with a single set too, so the one-file case
    # needs no special handling.
    all_inter = set.intersection(*keys_map.values())
    print(f"\nUnion size       : {len(all_union)}")
    print(f"Intersection size: {len(all_inter)}")

    # Per-file deltas. "Missing" is measured against the union: the
    # intersection is a subset of every file's key set, so `all_inter - keys`
    # is always empty and could never report a missing code.
    for name, keys in keys_map.items():
        missing_vs_union = all_union - keys
        extra_vs_inter = keys - all_inter
        if missing_vs_union or extra_vs_inter:
            print(f"\n[name={name}]")
            if missing_vs_union:
                print(f"  Missing vs union       : {len(missing_vs_union)}")
            if extra_vs_inter:
                print(f"  Extra vs intersection  : {len(extra_vs_inter)}")

    # List a few sample missing codes per file for debugging. Sorting before
    # slicing makes the sample the same on every run; iterating a set
    # directly yields an arbitrary order.
    SHOW = 5
    for name, keys in keys_map.items():
        miss = all_union - keys
        if miss:
            print(f"\n[name={name}] Missing {len(miss)} vs union (showing up to {SHOW}):")
            for k in sorted(miss)[:SHOW]:
                print("  ", k)

population      : 35672 entries  OK
employment      : 35672 entries  OK
land_use        : 35672 entries  OK
poi             : 35672 entries  OK
area_popdensity : 35672 entries  OK
households      : 35672 entries  OK
imd             : 35672 entries  OK

Union size       : 35672
Intersection size: 35672
