In [None]:
# IMD 2019 Income & Employment → LSOA 2021 (England & Wales)
# -----------------------------------------------------------
# Goal:
#   Create a JSON of the form:
#       { "LSOA21CD": [income_norm, employment_norm], ... }
#   where income_norm and employment_norm are z-score standardized per domain
#   (the earlier min–max [0, 1] scaling is kept further down, commented out).
#
# Inputs you need in the working dir:
#   1) The IMD ODS:
#        FILE_IMD_ODS = "2019_Income_and_Employment_Domains_-_England_and_Wales.ods"
#      - Sheet "Income":      use columns "LSOA Code (2011)" + "Income Domain Score"
#      - Sheet "Employment":  use columns "LSOA Code (2011)" + "Employment Domain Score"
#
#   2) LSOA11→LSOA21 exact-fit lookup (England & Wales), read from the sibling
#      directory "../LSOA11_to_LSOA21/" rather than the working dir:
#        LSOA_LOOKUP = "../LSOA11_to_LSOA21/LSOA_(2011)_to_LSOA_(2021)_to_Local_Authority_District_(2022)_Exact_Fit_Lookup_for_EW_(V3).csv"
#      - Must contain columns "LSOA11CD" and "LSOA21CD"
#
#   3) Population for LSOA11 (intended for many-to-one weighted averaging):
#        POP_LSOA11_JSON = "../population/population_2011_or_2021_LSOA11.json"
#      - Expected structure: { "E01000001": <population_int>, ... }
#      - If you only have a different year for LSOA11 populations, that's fine.
#        We only need *relative* weights within each many-to-one group.
#      - NOTE: this file is not read by the code in this cell; many-to-one groups
#        currently use an unweighted mean of the parent scores.
#
# Manual rule (per your note):
#   - For 1→N (one LSOA11 to many LSOA21): replicate the *same score* to all children.
#   - For N→1 (many LSOA11 to one LSOA21): average of *scores*.
#   - There were three many-to-many cases (actually 2↔2) that you manually resolved
#     into one-to-one pairs. We honor those 6 pairs below.
#
# Output:
#   OUT_JSON = "lsoa21_imd2019_income_employment_normalized.json"
# -----------------------------------------------------------

import json
from collections import defaultdict

import numpy as np
import pandas as pd

# ---------- Paths (adjust as needed) ----------
# IMD 2019 workbook in ODS format; the "Income" and "Employment" sheets are read from it.
FILE_IMD_ODS = "2019_Income_and_Employment_Domains_-_England_and_Wales.ods"
# LSOA 2011 → LSOA 2021 lookup CSV; only its "LSOA11CD" and "LSOA21CD" columns are used.
LSOA_LOOKUP = "../LSOA11_to_LSOA21/LSOA_(2011)_to_LSOA_(2021)_to_Local_Authority_District_(2022)_Exact_Fit_Lookup_for_EW_(V3).csv"
# File the final step writes the { LSOA21CD: [income, employment] } mapping to.
OUT_JSON = "lsoa21_imd2019_income_employment_normalized.json"

# ---------- Columns / sheets ----------
# Sheet names inside the ODS workbook.
SHEET_INCOME = "Income"
SHEET_EMPLOY = "Employment"
# Column headers selected from those sheets: the shared LSOA 2011 code column
# and one score column per domain.
COL_LSOA11 = "LSOA Code (2011)"
COL_INCOME_SCORE = "Income Domain Score"
COL_EMPLOY_SCORE = "Employment Domain Score"

# ---------- Manual 2↔2 resolved to 1↔1 (you provided these) ----------
# Map specific LSOA11 → designated LSOA21
# Keys are LSOA 2011 codes; each value is the single LSOA 2021 code that the
# key is pinned to.
MANUAL_11_TO_21 = {
    "E01008187": "E01035624",
    "E01027506": "E01035637",
    "E01023508": "E01035609",
    "E01023768": "E01035582",
    "E01023964": "E01035608",
    "E01023679": "E01035581",
}

# =========================================================
# 1) Load IMD 2019 scores (LSOA 2011)
# =========================================================
# Use engine="odf" to read .ods (pip install odfpy if needed).
# Both sheets are requested in one call so the workbook is parsed only once;
# given a list of sheet names, pandas returns {sheet_name: DataFrame}.
_imd_sheets = pd.read_excel(
    FILE_IMD_ODS, sheet_name=[SHEET_INCOME, SHEET_EMPLOY], engine="odf"
)
income_df = _imd_sheets[SHEET_INCOME]
employ_df = _imd_sheets[SHEET_EMPLOY]

# Keep only required cols; drop rows with missing IDs
income_df = income_df[[COL_LSOA11, COL_INCOME_SCORE]].dropna(subset=[COL_LSOA11])
employ_df = employ_df[[COL_LSOA11, COL_EMPLOY_SCORE]].dropna(subset=[COL_LSOA11])

# Coerce codes to string, trim
income_df[COL_LSOA11] = income_df[COL_LSOA11].astype(str).str.strip()
employ_df[COL_LSOA11] = employ_df[COL_LSOA11].astype(str).str.strip()

# Merge to have both scores on the same row by LSOA11.
# how="outer" keeps codes present in only one sheet (their other score is NaN);
# validate="one_to_one" makes pandas raise if a code repeats within a sheet.
scores_11 = pd.merge(income_df, employ_df, on=COL_LSOA11, how="outer", validate="one_to_one")
scores_11 = scores_11.rename(columns={COL_LSOA11: "LSOA11CD",
                                      COL_INCOME_SCORE: "income_score",
                                      COL_EMPLOY_SCORE: "employment_score"})

# Duplicated codes are treated as an input error rather than averaged. This is
# an explicit raise instead of an assert so the check is not stripped when
# Python runs with -O.
_dup_mask = scores_11.duplicated("LSOA11CD", keep=False)
if _dup_mask.any():
    _dup_codes = sorted(scores_11.loc[_dup_mask, "LSOA11CD"].unique())
    raise ValueError(f"Duplicate LSOA11 codes in IMD scores: {_dup_codes[:10]}")

# Turn into dict for quick access: LSOA11CD -> (income_score, employment_score).
# Zipping the columns avoids building a Series object per row as iterrows does.
score_by_11 = {
    code: (float(inc), float(emp))
    for code, inc, emp in zip(
        scores_11["LSOA11CD"],
        scores_11["income_score"],
        scores_11["employment_score"],
    )
}

# =========================================================
# 2) Load LSOA11 → LSOA21 lookup
# =========================================================
# Both code columns are read as text and trimmed of surrounding whitespace.
lk = pd.read_csv(LSOA_LOOKUP, dtype={"LSOA11CD": str, "LSOA21CD": str})
for code_col in ("LSOA11CD", "LSOA21CD"):
    lk[code_col] = lk[code_col].str.strip()

# Forward index (LSOA11 → its LSOA21 children) and reverse index
# (LSOA21 → its LSOA11 parents), both filled in file order.
lsoa11_to_21 = defaultdict(list)
lsoa21_from_11 = defaultdict(list)

# Rows missing either code contribute nothing to the indexes.
complete_pairs = lk[["LSOA11CD", "LSOA21CD"]].dropna()
for code_11, code_21 in complete_pairs.itertuples(index=False, name=None):
    lsoa11_to_21[code_11].append(code_21)
    lsoa21_from_11[code_21].append(code_11)

# =========================================================
# 4) Transform scores to LSOA21 with your rules
#     - Manual 1→1 overrides
#     - 1→1: copy
#     - 1→N: replicate same 11-score to each child 21
#     - N→1: average of 11-scores
# =========================================================
# Per-LSOA21 bins of (score, weight) entries; every weight stored here is the
# placeholder 1.0.
income_21_accum = defaultdict(list)
employ_21_accum = defaultdict(list)

handled_11 = set()

# Manual overrides go first so that those LSOA11 codes are excluded from the
# lookup-driven pass below. Codes without a score are left untouched.
for src_11, dst_21 in MANUAL_11_TO_21.items():
    if src_11 not in score_by_11:
        continue
    inc_score, emp_score = score_by_11[src_11]
    income_21_accum[dst_21].append((inc_score, 1.0))
    employ_21_accum[dst_21].append((emp_score, 1.0))
    handled_11.add(src_11)

# Every remaining scored LSOA11 follows the lookup table.
for src_11, (inc_score, emp_score) in score_by_11.items():
    if src_11 in handled_11:
        continue

    targets_21 = lsoa11_to_21.get(src_11, [])
    if not targets_21:
        # No mapping found: report it and move on.
        print(f"[WARN] No LSOA21 mapping for {src_11}")
        continue

    # 1→1 and 1→N are the same operation: hand the parent's scores to each
    # child (a single child in the 1→1 case).
    for dst_21 in targets_21:
        income_21_accum[dst_21].append((inc_score, 1.0))
        employ_21_accum[dst_21].append((emp_score, 1.0))

# At this point, we have:
#   - all manual, 1→1 and 1→N assignments pushed into per-LSOA21 bins
#     (each entry carries a dummy weight of 1.0)
# We still need to resolve true N→1 groups:
# In practice, N→1 appears as multiple distinct LSOA11 that ONLY map to a single LSOA21.
# The loops above already placed their scores into that single LSOA21 bin, so such a
# bin holds several entries. No population data is loaded in this cell, so the pass
# further down collapses each multi-parent LSOA21 to the plain (equal-weight) mean of
# its parents' scores rather than a population-weighted one.

def weighted_avg_for_bin(pairs, which_scores=("income", "employment")):
    """Placeholder that does no work and returns ``None``.

    ``pairs`` was meant to be a bin of ``(score, weight_placeholder)`` tuples
    whose weights would be recomputed from LSOA11 populations for N→1 bins.
    The bins do not record which LSOA11 contributed each entry, so that cannot
    be done here; N→1 bins are resolved by the explicit pass that follows this
    definition. The function is unused and kept only for clarity. Neither
    argument is read.
    """
    return None

# Collapse each LSOA21 bin into a single (income, employment) pair.
#
# The bins filled above already encode every mapping rule: manual overrides
# land only in their designated LSOA21, 1→1 and 1→N parents are copied to each
# child, and an N→1 target simply holds one entry per parent. Averaging a bin
# therefore gives the right value in all cases. Re-deriving the parents from
# lsoa21_from_11 instead would bypass MANUAL_11_TO_21 and blend both parents of
# a manually resolved 2↔2 pair into each of its targets.
income_21_final = {}
employment_21_final = {}

# Both accumulators are always appended to together, so they share keys.
for l21, inc_pairs in income_21_accum.items():
    emp_pairs = employ_21_accum.get(l21, [])
    if not inc_pairs or not emp_pairs:
        continue
    # The stored 1.0 weights are placeholders (no population data is loaded),
    # so the N→1 rule reduces to a plain mean of the parent scores.
    income_21_final[l21] = float(np.mean([score for score, _ in inc_pairs]))
    employment_21_final[l21] = float(np.mean([score for score, _ in emp_pairs]))

# Drop any LSOA21 without both scores. A missing score arrives here as NaN
# (from the outer merge of the two sheets), not None, so finiteness is the
# test that actually detects it. Keys are sorted so the result, and the JSON
# written from it, has the same order on every run; iterating a set of strings
# does not guarantee that.
lsoa21_scores = {
    l21: (income_21_final[l21], employment_21_final[l21])
    for l21 in sorted(income_21_final.keys() & employment_21_final.keys())
    if np.isfinite(income_21_final[l21]) and np.isfinite(employment_21_final[l21])
}

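# =========================================================
# 5) Min–max normalize to [0, 1] (domain-wise)
#    DISABLED: kept commented out for reference; the z-score step further down
#    is the normalization that actually runs.
# =========================================================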
# income_vals = np.array([v[0] for v in lsoa21_scores.values()], dtype=float)
# employ_vals = np.array([v[1] for v in lsoa21_scores.values()], dtype=float)

# inc_min, inc_max = float(np.min(income_vals)), float(np.max(income_vals))
# emp_min, emp_max = float(np.min(employ_vals)), float(np.max(employ_vals))

# def normalize(x, lo, hi):
#     if hi == lo:
#         return 0.0
#     return (float(x) - lo) / (hi - lo)

# lsoa21_norm = {
#     l21: [normalize(inc, inc_min, inc_max), normalize(emp, emp_min, emp_max)]
#     for l21, (inc, emp) in lsoa21_scores.items()
# }

# =========================================================
# 5) Z-score normalize (domain-wise)
# =========================================================
# Split the (income, employment) pairs into one float array per domain,
# following the dict's iteration order.
income_vals = np.asarray([inc for inc, _ in lsoa21_scores.values()], dtype=float)
employ_vals = np.asarray([emp for _, emp in lsoa21_scores.values()], dtype=float)

# NaN-ignoring mean and standard deviation of each domain.
inc_mean = float(np.nanmean(income_vals))
inc_std = float(np.nanstd(income_vals))
emp_mean = float(np.nanmean(employ_vals))
emp_std = float(np.nanstd(employ_vals))

def zscore(x, mean, std):
    """Return ``(x - mean) / std``, or ``0.0`` when ``std`` is unusable.

    ``std`` counts as unusable when it equals zero or is not finite (NaN or
    infinite). ``x`` is converted with ``float()`` before the subtraction.
    """
    if std != 0 and np.isfinite(std):
        return (float(x) - mean) / std
    return 0.0

# Standardize both domains of every LSOA21 against that domain's own mean and
# standard deviation; each value is an [income, employment] list.
lsoa21_norm = {}
for code_21, (inc_raw, emp_raw) in lsoa21_scores.items():
    lsoa21_norm[code_21] = [
        zscore(inc_raw, inc_mean, inc_std),
        zscore(emp_raw, emp_mean, emp_std),
    ]

# =========================================================
# 6) Save
# =========================================================
# The encoding is pinned because ensure_ascii=False makes json write non-ASCII
# characters verbatim, and without an explicit encoding open() uses the
# platform's locale default, which differs between machines.
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(lsoa21_norm, f, indent=2, ensure_ascii=False)

# (Optional) quick summary print
print(f"Saved {len(lsoa21_norm):,} LSOA21 rows to {OUT_JSON}")

In [1]:
# IMD 2019 Income & Employment → LSOA 2021 (England & Wales)
# RAW (no normalization)

import json
from collections import defaultdict
import numpy as np
import pandas as pd

# ---------- Paths ----------
# IMD 2019 workbook in ODS format; the "Income" and "Employment" sheets are read from it.
FILE_IMD_ODS = "2019_Income_and_Employment_Domains_-_England_and_Wales.ods"
# LSOA 2011 → LSOA 2021 lookup CSV; only its "LSOA11CD" and "LSOA21CD" columns are used.
LSOA_LOOKUP = "../LSOA11_to_LSOA21/LSOA_(2011)_to_LSOA_(2021)_to_Local_Authority_District_(2022)_Exact_Fit_Lookup_for_EW_(V3).csv"
# File the final step writes the un-normalized { LSOA21CD: [income, employment] } mapping to.
OUT_JSON = "lsoa21_imd2019_income_employment_raw.json"

# ---------- Sheets / columns ----------
# Sheet names inside the ODS workbook.
SHEET_INCOME = "Income"
SHEET_EMPLOY = "Employment"
# Column headers selected from those sheets: the shared LSOA 2011 code column
# and one score column per domain.
COL_LSOA11 = "LSOA Code (2011)"
COL_INCOME_SCORE = "Income Domain Score"
COL_EMPLOY_SCORE = "Employment Domain Score"

# ---------- Manual 2↔2 resolved to 1↔1 ----------
# Keys are LSOA 2011 codes; each value is the single LSOA 2021 code that the
# key is pinned to. These codes are assigned first and skipped by the generic
# lookup-driven mapping.
MANUAL_11_TO_21 = {
    "E01008187": "E01035624",
    "E01027506": "E01035637",
    "E01023508": "E01035609",
    "E01023768": "E01035582",
    "E01023964": "E01035608",
    "E01023679": "E01035581",
}

# =========================================================
# 1) Load IMD scores (LSOA11)
# =========================================================
# Request both sheets in a single call so the ODS workbook is parsed once;
# given a list of sheet names, pandas returns {sheet_name: DataFrame}.
_imd_sheets = pd.read_excel(
    FILE_IMD_ODS, sheet_name=[SHEET_INCOME, SHEET_EMPLOY], engine="odf"
)
income_df = _imd_sheets[SHEET_INCOME]
employ_df = _imd_sheets[SHEET_EMPLOY]

# Keep the code column plus the domain score; rows without a code are dropped.
income_df = income_df[[COL_LSOA11, COL_INCOME_SCORE]].dropna(subset=[COL_LSOA11])
employ_df = employ_df[[COL_LSOA11, COL_EMPLOY_SCORE]].dropna(subset=[COL_LSOA11])

# Codes as trimmed text so they match the lookup table's keys.
income_df[COL_LSOA11] = income_df[COL_LSOA11].astype(str).str.strip()
employ_df[COL_LSOA11] = employ_df[COL_LSOA11].astype(str).str.strip()

# One row per LSOA11 with both scores. how="outer" keeps codes present in only
# one sheet (their other score is NaN); validate="one_to_one" makes pandas
# raise if a code repeats within a sheet.
scores_11 = pd.merge(
    income_df, employ_df, on=COL_LSOA11, how="outer", validate="one_to_one"
).rename(columns={
    COL_LSOA11: "LSOA11CD",
    COL_INCOME_SCORE: "income_score",
    COL_EMPLOY_SCORE: "employment_score"
})

# LSOA11CD -> (income_score, employment_score). Zipping the columns avoids
# building a Series object per row as iterrows does.
score_by_11 = {
    code: (float(inc), float(emp))
    for code, inc, emp in zip(
        scores_11["LSOA11CD"],
        scores_11["income_score"],
        scores_11["employment_score"],
    )
}

# =========================================================
# 2) Load LSOA11 → LSOA21 lookup
# =========================================================
# Every column is read as text; the two code columns are trimmed.
lk = pd.read_csv(LSOA_LOOKUP, dtype=str)
for code_col in ("LSOA11CD", "LSOA21CD"):
    lk[code_col] = lk[code_col].str.strip()

# Forward index (LSOA11 → its LSOA21 children) and reverse index
# (LSOA21 → its LSOA11 parents), both filled in file order.
lsoa11_to_21 = defaultdict(list)
lsoa21_from_11 = defaultdict(list)

# Rows missing either code contribute nothing to the indexes.
complete_pairs = lk[["LSOA11CD", "LSOA21CD"]].dropna()
for code_11, code_21 in complete_pairs.itertuples(index=False, name=None):
    lsoa11_to_21[code_11].append(code_21)
    lsoa21_from_11[code_21].append(code_11)

# =========================================================
# 3) Transform to LSOA21 (RAW scores)
# =========================================================
# Per-LSOA21 bins holding the raw parent scores assigned to that area.
income_21_accum = defaultdict(list)
employment_21_accum = defaultdict(list)
handled_11 = set()

# --- Manual overrides (force 1→1) ---
# These LSOA11 codes go only to their designated LSOA21 and are then excluded
# from the generic mapping below.
for l11, l21 in MANUAL_11_TO_21.items():
    if l11 in score_by_11:
        inc, emp = score_by_11[l11]
        income_21_accum[l21].append(inc)
        employment_21_accum[l21].append(emp)
        handled_11.add(l11)

# --- Generic mapping ---
for l11, (inc, emp) in score_by_11.items():
    if l11 in handled_11:
        continue

    children = lsoa11_to_21.get(l11, [])
    if not children:
        # Report the code instead of dropping its scores without a trace.
        print(f"[WARN] No LSOA21 mapping for {l11}")
        continue

    # 1→1 and 1→N are the same operation: hand the parent's scores to each
    # child (a single child in the 1→1 case).
    for l21 in children:
        income_21_accum[l21].append(inc)
        employment_21_accum[l21].append(emp)

# --- Resolve N → 1 by averaging ---
# A bin with several entries is an N→1 target; its value is the plain mean of
# the parent scores. Single-entry bins pass through unchanged.
income_21_final = {}
employment_21_final = {}

# Both accumulators are always appended to together, so they share keys.
# Walking one of them in insertion order, rather than a set union of the keys,
# keeps the order of the results reproducible between runs.
for l21, inc_vals in income_21_accum.items():
    emp_vals = employment_21_accum.get(l21, [])

    if inc_vals and emp_vals:
        income_21_final[l21] = float(np.mean(inc_vals))
        employment_21_final[l21] = float(np.mean(emp_vals))

# =========================================================
# 4) Build RAW output JSON
# =========================================================
# { LSOA21CD: [income_score, employment_score] } for areas that have both.
# - Keys are sorted so the written file is identical from run to run; the
#   iteration order of a set of strings is not guaranteed to be.
# - Areas whose score is NaN/inf are left out (a code present in only one IMD
#   sheet yields NaN), because json.dump would otherwise emit the literal
#   `NaN`, which is not valid JSON.
lsoa21_raw = {
    l21: [income_21_final[l21], employment_21_final[l21]]
    for l21 in sorted(income_21_final.keys() & employment_21_final.keys())
    if np.isfinite(income_21_final[l21]) and np.isfinite(employment_21_final[l21])
}

# =========================================================
# 5) Save
# =========================================================
# The encoding is pinned because ensure_ascii=False makes json write non-ASCII
# characters verbatim, and without an explicit encoding open() uses the
# platform's locale default, which differs between machines.
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(lsoa21_raw, f, indent=2, ensure_ascii=False)

print(f"Saved {len(lsoa21_raw):,} LSOA21 rows → {OUT_JSON}")

Saved 35,672 LSOA21 rows → lsoa21_imd2019_income_employment_raw.json
