In [1]:
# Create JSON for LSOA21 area (sq km, mid-2022 boundaries)
# - Source: Population_density_LSOA21CD_20112022.xlsx
# - Sheet: "Mid-2011 to mid-2022 LSOA 2021"
# - Column names start on the 4th row (header=3), data from the 5th row
# - Use columns: "LSOA 2021 Code", "Area Sq Km"
# - Output JSON format (same structure style as before): { "LSOA21CD": [area_sqkm], ... }
# - Also output params JSON for z-score normalization: {"area_sqkm": {"mu": ..., "std": ...}}

import json
import numpy as np
import pandas as pd

# --------- Input workbook and output files (edit to suit your layout) ---------
XLSX_PATH = "Population_density_LSOA21CD_20112022.xlsx"
SHEET_NAME = "Mid-2011 to mid-2022 LSOA 2021"

OUT_JSON_AREA_RAW = "lsoa21_area_sqkm_2022_raw.json"
OUT_JSON_AREA_PARAMS = "lsoa21_area_sqkm_2022_params.json"

# --------- Read the sheet into a DataFrame ---------
# Reader options are collected in one place so they are easy to scan and tweak.
read_options = dict(
    sheet_name=SHEET_NAME,
    header=3,                       # zero-based: column labels are on the 4th row
    engine="openpyxl",              # reader backend for .xlsx files
    dtype={"LSOA 2021 Code": str},  # read the code column as text
)
df = pd.read_excel(XLSX_PATH, **read_options)

# Confirm both required columns are present before narrowing the frame
req_cols = ["LSOA 2021 Code", "Area Sq Km"]
available = set(df.columns)
missing = [name for name in req_cols if name not in available]
if len(missing) > 0:
    raise ValueError(f"Missing expected columns in sheet: {missing}")

# Keep just those columns, under the short names used by the rest of the cell
short_names = dict(zip(req_cols, ("LSOA21CD", "area_sqkm")))
df = df.loc[:, req_cols].rename(columns=short_names)

# Clean + coerce
# Strip whitespace from the codes WITHOUT stringifying missing cells first:
# on an object column, astype(str) turns NaN into the literal text "nan",
# which dropna() can never remove and which would then be written to the
# output JSON as a bogus "nan" key.
lsoa_codes = df["LSOA21CD"]
lsoa_codes = lsoa_codes.where(lsoa_codes.isna(), lsoa_codes.astype(str).str.strip())

df = df.assign(
    # a whitespace-only code is empty after stripping; treat it as missing too
    LSOA21CD=lsoa_codes.mask(lsoa_codes == ""),
    # anything that cannot be parsed as a number becomes NaN
    area_sqkm=pd.to_numeric(df["area_sqkm"], errors="coerce"),
)

# Drop rows missing either the code or the area
df = df.dropna(subset=["LSOA21CD", "area_sqkm"]).copy()

# --------- Mean / standard deviation of the area column (z-score parameters) ---------
area_arr = df["area_sqkm"].to_numpy(dtype=float)

# NaN-ignoring statistics, converted to plain Python floats for JSON output
mu, std = (float(stat(area_arr)) for stat in (np.nanmean, np.nanstd))

if not (np.isfinite(mu) and np.isfinite(std)):
    raise ValueError("Non-finite mean/std encountered for area.")

# A zero spread would make any later (x - mu) / std divide by zero; fall back to 1.0
std = 1.0 if std == 0.0 else std

# --------- Build raw JSON: { LSOA21CD: [area_sqkm] } ---------
# Walk the two columns in lockstep instead of using iterrows(), which builds a
# full Series object for every row just to read two values from it.
# Each area is wrapped in a one-element list; if a code appears more than once
# the last occurrence wins, because dict keys are unique.
mapping_raw = {
    code: [float(area)]
    for code, area in zip(df["LSOA21CD"], df["area_sqkm"])
}

# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
# encoding instead of relying on the platform default.
with open(OUT_JSON_AREA_RAW, "w", encoding="utf-8") as f:
    json.dump(mapping_raw, f, indent=2, ensure_ascii=False)

# --------- Build params JSON: {"area_sqkm": {"mu": ..., "std": ...}} ---------
params = {"area_sqkm": {"mu": mu, "std": std}}

# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
# encoding instead of relying on the platform default.
with open(OUT_JSON_AREA_PARAMS, "w", encoding="utf-8") as f:
    json.dump(params, f, indent=2, ensure_ascii=False)

# Summary: number of codes written and the parameters that were saved
print(f"Saved {len(mapping_raw):,} LSOA21 entries → {OUT_JSON_AREA_RAW}")
print(f"Saved params → {OUT_JSON_AREA_PARAMS}  (mu={mu:.12f}, std={std:.12f})")

Saved 35,672 LSOA21 entries → lsoa21_area_sqkm_2022_raw.json
Saved params → lsoa21_area_sqkm_2022_params.json  (mu=4.234336176833, std=14.601174334732)


In [2]:
# Save normalization parameters for population density (people / sq km, mid-2022)
# - Source: Population_density_LSOA21CD_20112022.xlsx
# - Sheet: "Mid-2011 to mid-2022 LSOA 2021"
# - Column: "Mid-2022: People per Sq Km"
# - Output JSON:
#     { "popdens_2022": { "mu": ..., "std": ... } }

import json
import numpy as np
import pandas as pd

# --------- Input workbook and output file ---------
XLSX_PATH = "Population_density_LSOA21CD_20112022.xlsx"
SHEET_NAME = "Mid-2011 to mid-2022 LSOA 2021"
OUT_JSON_POPDENS_PARAMS = "lsoa21_popdensity_2022_params.json"

# --------- Read the sheet into a DataFrame ---------
# Reader options are collected in one place so they are easy to scan and tweak.
read_options = dict(
    sheet_name=SHEET_NAME,
    header=3,                       # zero-based: column labels are on the 4th row
    engine="openpyxl",              # reader backend for .xlsx files
    dtype={"LSOA 2021 Code": str},  # read the code column as text
)
df = pd.read_excel(XLSX_PATH, **read_options)

# The one column this cell needs; fail early if the sheet does not have it
req_col = "Mid-2022: People per Sq Km"
if not (req_col in df.columns):
    raise ValueError(f"Missing expected column: {req_col}")

# Reduce to a single-column frame named popdens_2022, coerce it to numeric
# (unparseable cells become NaN) and drop the rows left without a value.
df = (
    df.loc[:, [req_col]]
    .rename(columns={req_col: "popdens_2022"})
    .assign(popdens_2022=lambda frame: pd.to_numeric(frame["popdens_2022"], errors="coerce"))
    .dropna(subset=["popdens_2022"])
    .copy()
)

# --------- Mean / standard deviation of population density (z-score parameters) ---------
arr = df["popdens_2022"].to_numpy(dtype=float)

# NaN-ignoring statistics, converted to plain Python floats for JSON output
mu, std = (float(stat(arr)) for stat in (np.nanmean, np.nanstd))

if not (np.isfinite(mu) and np.isfinite(std)):
    raise ValueError("Non-finite mean/std encountered for population density.")

# A zero spread would make any later (x - mu) / std divide by zero; fall back to 1.0
std = 1.0 if std == 0.0 else std

# Params JSON: {"popdens_2022": {"mu": ..., "std": ...}}
params = {"popdens_2022": {"mu": mu, "std": std}}

# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
# encoding instead of relying on the platform default.
with open(OUT_JSON_POPDENS_PARAMS, "w", encoding="utf-8") as f:
    json.dump(params, f, indent=2, ensure_ascii=False)

# Summary: saved parameters and the number of values they were computed from
print(
    f"Saved population density normalization params → {OUT_JSON_POPDENS_PARAMS}\n"
    f"mu={mu:.12f}, std={std:.12f}, N={len(arr):,}"
)

Saved population density normalization params → lsoa21_popdensity_2022_params.json
mu=4442.899100511510, std=4693.280450612376, N=35,672
