In [None]:
# Create JSON for LSOA21 area (sq km) and population density (people/sq km, mid-2022)
# - Source: Population_density_LSOA21CD_20112022.xlsx
# - Sheet: "Mid-2011 to mid-2022 LSOA 2021"
# - Column names start on the 4th row (header=3), data from the 5th row
# - Use columns: "LSOA 2021 Code", "Area Sq Km", "Mid-2022: People per Sq Km"
# - Output JSON format: { "LSOA21CD": [area_norm, pop_density_norm], ... }
# - Both values are z-score normalized across LSOA21s.

import json
import numpy as np
import pandas as pd

# --------- Paths (adjust as needed) ---------
XLSX_PATH = "Population_density_LSOA21CD_20112022.xlsx"
SHEET_NAME = "Mid-2011 to mid-2022 LSOA 2021"
OUT_JSON = "lsoa21_area_popdensity_2022_normalized.json"

# --------- Load Excel (header on 4th row) ---------
df = pd.read_excel(
    XLSX_PATH,
    sheet_name=SHEET_NAME,
    header=3,          # 4th row has the column names
    engine="openpyxl", # xlsx
    dtype={"LSOA 2021 Code": str}
)

# Fail fast if the sheet layout changed and an expected column is absent.
req_cols = ["LSOA 2021 Code", "Area Sq Km", "Mid-2022: People per Sq Km"]
missing = [c for c in req_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns in sheet: {missing}")

# --------- Select, rename, clean + coerce types ---------
# Rows without a code are dropped BEFORE the cast to str: astype(str) turns a
# missing value into the literal string "nan", which dropna() can no longer
# detect, so such a row would otherwise survive and be exported under "nan".
df = (
    df[req_cols]
    .rename(columns={
        "LSOA 2021 Code": "LSOA21CD",
        "Area Sq Km": "area_sqkm",
        "Mid-2022: People per Sq Km": "popdens_2022"
    })
    .dropna(subset=["LSOA21CD"])
    .assign(
        LSOA21CD=lambda d: d["LSOA21CD"].astype(str).str.strip(),
        # Non-numeric cells become NaN so they can be dropped below.
        area_sqkm=lambda d: pd.to_numeric(d["area_sqkm"], errors="coerce"),
        popdens_2022=lambda d: pd.to_numeric(d["popdens_2022"], errors="coerce"),
    )
)

# Discard codes that were blank/whitespace-only and rows missing either metric.
df = df[df["LSOA21CD"] != ""].dropna(subset=["area_sqkm", "popdens_2022"])

# # --------- Min–max normalization ---------
# def minmax(series: pd.Series) -> pd.Series:
#     arr = series.to_numpy(dtype=float)
#     lo = np.nanmin(arr)
#     hi = np.nanmax(arr)
#     if not np.isfinite(lo) or not np.isfinite(hi):
#         raise ValueError("Non-finite min/max encountered during normalization.")
#     if hi == lo:
#         # Degenerate case: all values identical -> set all to 0.0
#         return pd.Series(np.zeros_like(arr, dtype=float), index=series.index)
#     return (series - lo) / (hi - lo)

# df["area_norm"] = minmax(df["area_sqkm"])
# df["popdens_norm"] = minmax(df["popdens_2022"])

# --------- Z-score normalization ---------
def zscore(series: pd.Series) -> pd.Series:
    """Standardize a numeric series to zero mean and unit standard deviation.

    The mean and standard deviation are computed over the non-NaN values
    only, using the population standard deviation (numpy's default, ddof=0).
    NaN inputs stay NaN in the output on every code path.

    Args:
        series: Values convertible to float.

    Returns:
        A float Series with the same index and name as ``series``. If all
        non-NaN values are identical (zero standard deviation) they map to 0.0.

    Raises:
        ValueError: If the series is empty or all-NaN, or if the mean or
            standard deviation is not finite (e.g. the data contains inf).
    """
    arr = series.to_numpy(dtype=float)
    valid = ~np.isnan(arr)

    # Guard explicitly so empty / all-NaN input raises the documented error
    # without first emitting numpy's "mean of empty slice" RuntimeWarning.
    if not valid.any():
        raise ValueError("Non-finite mean/std encountered during normalization.")

    mean = np.nanmean(arr)
    std = np.nanstd(arr)
    if not np.isfinite(mean) or not np.isfinite(std):
        raise ValueError("Non-finite mean/std encountered during normalization.")

    if std == 0:
        # Degenerate case: every observed value is identical. Map observed
        # values to 0.0 but keep missing values missing instead of inventing
        # a score for them.
        scaled = np.where(valid, 0.0, np.nan)
    else:
        scaled = (arr - mean) / std

    return pd.Series(scaled, index=series.index, name=series.name)

# Standardize each metric independently across all remaining LSOA21 rows.
df["area_norm"] = zscore(df["area_sqkm"])
df["popdens_norm"] = zscore(df["popdens_2022"])

# --------- Build JSON: { LSOA21CD: [area_norm, popdens_norm] } ---------
# Column-wise zip instead of DataFrame.iterrows(): iterrows builds a Series
# per row, which is needlessly slow for tens of thousands of rows. tolist()
# yields plain Python str/float values that json can serialize directly.
# If a code appears more than once, the last occurrence wins (as before).
mapping = {
    code: [area, dens]
    for code, area, dens in zip(
        df["LSOA21CD"].tolist(),
        df["area_norm"].astype(float).tolist(),
        df["popdens_norm"].astype(float).tolist(),
    )
}

# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
# encoding rather than depending on the platform's default.
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(mapping, f, indent=2, ensure_ascii=False)

# (Optional) quick sanity prints — these report the RAW (un-normalized) ranges.
print(f"Saved {len(mapping):,} LSOA21 entries → {OUT_JSON}")
print(f"Area (sq km): min={df['area_sqkm'].min():.6f}, max={df['area_sqkm'].max():.6f}")
print(f"Pop density (2022): min={df['popdens_2022'].min():.6f}, max={df['popdens_2022'].max():.6f}")

Saved 35,672 LSOA21 entries → lsoa21_area_popdensity_2022_normalized.json
Area (sq km): min=0.009700, max=673.526100
Pop density (2022): min=2.231539, max=117113.402062


In [1]:
# Create JSON for LSOA21 area (sq km) and population density (people/sq km, mid-2022)
# - Source: Population_density_LSOA21CD_20112022.xlsx
# - Sheet: "Mid-2011 to mid-2022 LSOA 2021"
# - Column names start on the 4th row (header=3), data from the 5th row
# - Use columns: "LSOA 2021 Code", "Area Sq Km", "Mid-2022: People per Sq Km"
# - Output JSON format: { "LSOA21CD": [area_sqkm, popdens_2022], ... }
# - NO normalization.

import json
import pandas as pd

# --------- Paths (adjust as needed) ---------
XLSX_PATH = "Population_density_LSOA21CD_20112022.xlsx"
SHEET_NAME = "Mid-2011 to mid-2022 LSOA 2021"
OUT_JSON = "lsoa21_area_popdensity_2022_raw.json"

# --------- Load Excel (header on 4th row) ---------
df = pd.read_excel(
    XLSX_PATH,
    sheet_name=SHEET_NAME,
    header=3,          # 4th row has the column names
    engine="openpyxl", # xlsx
    dtype={"LSOA 2021 Code": str}
)

# Fail fast if the sheet layout changed and an expected column is absent.
req_cols = ["LSOA 2021 Code", "Area Sq Km", "Mid-2022: People per Sq Km"]
missing = [c for c in req_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns in sheet: {missing}")

# --------- Select, rename, clean + coerce types ---------
# Rows without a code are dropped BEFORE the cast to str: astype(str) turns a
# missing value into the literal string "nan", which dropna() can no longer
# detect, so such a row would otherwise survive and be exported under "nan".
df = (
    df[req_cols]
    .rename(columns={
        "LSOA 2021 Code": "LSOA21CD",
        "Area Sq Km": "area_sqkm",
        "Mid-2022: People per Sq Km": "popdens_2022"
    })
    .dropna(subset=["LSOA21CD"])
    .assign(
        LSOA21CD=lambda d: d["LSOA21CD"].astype(str).str.strip(),
        # Non-numeric cells become NaN so they can be dropped below.
        area_sqkm=lambda d: pd.to_numeric(d["area_sqkm"], errors="coerce"),
        popdens_2022=lambda d: pd.to_numeric(d["popdens_2022"], errors="coerce"),
    )
)

# Discard codes that were blank/whitespace-only and rows missing either metric.
df = df[df["LSOA21CD"] != ""].dropna(subset=["area_sqkm", "popdens_2022"])

# --------- Build JSON: { LSOA21CD: [area_sqkm, popdens_2022] } ---------
# Column-wise zip instead of DataFrame.iterrows(): iterrows builds a Series
# per row, which is needlessly slow for tens of thousands of rows. tolist()
# yields plain Python str/float values that json can serialize directly.
# If a code appears more than once, the last occurrence wins (as before).
mapping = {
    code: [area, dens]
    for code, area, dens in zip(
        df["LSOA21CD"].tolist(),
        df["area_sqkm"].astype(float).tolist(),
        df["popdens_2022"].astype(float).tolist(),
    )
}

# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
# encoding rather than depending on the platform's default.
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(mapping, f, indent=2, ensure_ascii=False)

# Quick sanity check of what was written and the value ranges.
print(f"Saved {len(mapping):,} LSOA21 entries → {OUT_JSON}")
print(f"Area (sq km): min={df['area_sqkm'].min():.6f}, max={df['area_sqkm'].max():.6f}")
print(f"Pop density (2022): min={df['popdens_2022'].min():.6f}, max={df['popdens_2022'].max():.6f}")

Saved 35,672 LSOA21 entries → lsoa21_area_popdensity_2022_raw.json
Area (sq km): min=0.009700, max=673.526100
Pop density (2022): min=2.231539, max=117113.402062
