In [1]:
import os
import json
import pandas as pd
from collections import defaultdict

# === Input / output locations ===
EMPLOYMENT_TOTAL = 'processed_employment_data/employment_2022_employment.csv'
EMPLOYMENT_FULL = 'processed_employment_data/employment_2022_full_time_employees.csv'
EMPLOYMENT_PART = 'processed_employment_data/employment_2022_part_time_employees.csv'
LSOA_LOOKUP = 'LSOA_(2011)_to_LSOA_(2021)_to_Local_Authority_District_(2022)_Exact_Fit_Lookup_for_EW_(V3).csv'
POPULATION_JSON = '../population/population_2022_LSOA21.json'
OUTPUT_FILE = 'employment_2022_LSOA21.json'

# === 2022 population per area code ===
# Keep only the first entry of each record's 'population_lv1' vector
# (presumably the headline population figure -- confirm against the population notebook).
with open(POPULATION_JSON, 'r') as f:
    population_data = json.load(f)
pop_2022 = {
    area_code: record['population_lv1'][0]
    for area_code, record in population_data.items()
}

# === LSOA11 <-> LSOA21 correspondence, indexed in both directions ===
lookup_df = pd.read_csv(LSOA_LOOKUP)
lsoa11_to_21 = defaultdict(list)    # LSOA11 code -> every LSOA21 code it appears with
lsoa21_from_11 = defaultdict(list)  # LSOA21 code -> every LSOA11 code it appears with
for lsoa11, lsoa21 in zip(lookup_df['LSOA11CD'], lookup_df['LSOA21CD']):
    lsoa11_to_21[lsoa11].append(lsoa21)
    lsoa21_from_11[lsoa21].append(lsoa11)

# === Helper to load employment data ===
def load_employment_csv(path):
    """Read an employment CSV and index it by its second column.

    The first column of the file is discarded, the column that follows it is
    renamed to "mnemonic" and becomes the index; all remaining columns are
    returned unchanged.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by "mnemonic".
    """
    table = pd.read_csv(path)
    without_first = table.iloc[:, 1:]      # discard the leading column
    key_column = without_first.columns[0]  # originally the file's second column
    renamed = without_first.rename(columns={key_column: "mnemonic"})
    return renamed.set_index("mnemonic")

total_df = load_employment_csv(EMPLOYMENT_TOTAL)
full_df = load_employment_csv(EMPLOYMENT_FULL)
part_df = load_employment_csv(EMPLOYMENT_PART)

# === Validate all three tables up front ===
# The level-3 vector built below is a fixed-width concatenation of the three
# tables, so each one must have the same number of sector columns, a unique
# index, and a row for every LSOA11 code present in the totals table.
# Checking here gives one clear error instead of an opaque failure mid-loop
# (or silently mis-sized vectors).
N_SECTORS = 18  # columns are sectors
for _label, _frame in (('total', total_df), ('full-time', full_df), ('part-time', part_df)):
    assert _frame.shape[1] == N_SECTORS, (
        f"{_label} table has {_frame.shape[1]} sector columns, expected {N_SECTORS}"
    )
    if not _frame.index.is_unique:
        raise ValueError(f"{_label} table contains duplicated LSOA11 codes")
    _missing = total_df.index.difference(_frame.index)
    if len(_missing) > 0:
        raise KeyError(
            f"{len(_missing)} LSOA11 code(s) from the total table are absent from the "
            f"{_label} table, e.g. {list(_missing[:5])}"
        )


def _int_rows(frame):
    """Return the rows of `frame`, ordered like total_df, as lists of Python ints (NaN -> 0)."""
    return frame.loc[total_df.index].fillna(0).astype(int).to_numpy().tolist()


# === Collect raw data by LSOA11 ===
raw_employment = {}

for lsoa11, total_sectors, full_sectors, part_sectors in zip(
    total_df.index, _int_rows(total_df), _int_rows(full_df), _int_rows(part_df)
):
    total_all = sum(total_sectors)
    full_all = sum(full_sectors)
    part_all = sum(part_sectors)

    # lv2 = three headline totals followed by the per-sector totals;
    # lv3 = lv2 followed by the per-sector full-time and part-time counts.
    lv2 = [total_all, full_all, part_all] + total_sectors
    lv3 = lv2 + full_sectors + part_sectors
    raw_employment[lsoa11] = {
        'employment_lv1': [total_all],
        'employment_lv2': lv2,
        'employment_lv3': lv3
    }

# === Save result ===
with open('employment_2022_LSOA11.json', 'w') as f:
    json.dump(raw_employment, f, indent=2)

In [2]:
from collections import defaultdict
import json

# Accumulator keyed by LSOA21 code. The default vector lengths mirror the
# LSOA11 records built earlier: lv1 = [total]; lv2 = 3 totals + 18 sectors = 21;
# lv3 = lv2 + 18 full-time + 18 part-time sectors = 57.
final_employment = defaultdict(lambda: {'employment_lv1': [0], 'employment_lv2': [0]*21, 'employment_lv3': [0]*57})

LEVEL_KEYS = ('employment_lv1', 'employment_lv2', 'employment_lv3')

# === Manual override for known 2 <-> 2 mappings ===
manual_map = {
    "E01008187": "E01035624",
    "E01027506": "E01035637",
    "E01023508": "E01035609",
    "E01023768": "E01035582",
    "E01023964": "E01035608",
    "E01023679": "E01035581",
}

# === Counters
counter_1to1 = 0
counter_1toN = 0
counter_Nto1 = 0     # number of LSOA11 inputs folded into a merge
counter_manual = 0
counter_skipped = 0

used_in_merge = set()  # LSOA21 codes that received at least one merged LSOA11


def _copy_levels(record):
    """Return a copy of an employment record so the raw LSOA11 data is never aliased."""
    return {level: list(vector) for level, vector in record.items()}


for lsoa11, values in raw_employment.items():
    # Manual override
    if lsoa11 in manual_map:
        final_employment[manual_map[lsoa11]] = _copy_levels(values)
        counter_manual += 1
        continue

    lsoa21_list = lsoa11_to_21.get(lsoa11, [])

    # Not present in the lookup at all
    if not lsoa21_list:
        print(f"[WARNING] {lsoa11} not found in any valid transformation group.")
        counter_skipped += 1
        continue

    # One-to-many split, weighted by 2022 population of each target
    if len(lsoa21_list) > 1:
        pops = [pop_2022.get(l21, 0) for l21 in lsoa21_list]
        total_pop = sum(pops)

        if total_pop == 0:
            print(f"[WARNING] Skipping {lsoa11} split to {lsoa21_list} — no population data.")
            counter_skipped += 1
            continue

        if any(p == 0 for p in pops):
            print(f"[WARNING] Some LSOA21s missing population in split: {lsoa11} → {lsoa21_list}")

        for l21, pop in zip(lsoa21_list, pops):
            proportion = pop / total_pop
            for level in LEVEL_KEYS:
                final_employment[l21][level] = [int(round(v * proportion)) for v in values[level]]
        counter_1toN += 1
        continue

    # Exactly one target from here on.
    lsoa21 = lsoa21_list[0]

    # Many-to-one merge: the single target has a new code and is shared with
    # other LSOA11s, so add this record onto whatever is already accumulated.
    # The target is read straight from the lookup instead of scanning every group.
    if lsoa21 != lsoa11 and len(lsoa21_from_11.get(lsoa21, ())) > 1:
        used_in_merge.add(lsoa21)
        target = final_employment[lsoa21]
        for level in LEVEL_KEYS:
            target[level] = [x + y for x, y in zip(target[level], values[level])]
        counter_Nto1 += 1
        continue

    # 1:1 copy. This also covers an LSOA11 whose only target carries a different
    # code but is not shared with any other LSOA11 (a pure relabel); previously
    # such a record was dropped without a warning or a counter increment.
    final_employment[lsoa21] = _copy_levels(values)
    counter_1to1 += 1

# === Summary ===
print(f"Manual assignments      : {counter_manual}")
print(f"1-to-1 direct copies    : {counter_1to1}")
print(f"1-to-N splits           : {counter_1toN}")
print(f"N-to-1 merges           : {len(used_in_merge)}")
print(f"Unmatched/skipped       : {counter_skipped}")

# === Save to file ===
with open(OUTPUT_FILE, 'w') as f:
    json.dump(final_employment, f, indent=2)

Manual assignments      : 6
1-to-1 direct copies    : 33647
1-to-N splits           : 861
N-to-1 merges           : 119
Unmatched/skipped       : 0


In [1]:
import json
import numpy as np

# ---- Paths ----
# INPUT_JSON must be the file the LSOA11 -> LSOA21 remapping cell writes
# (its OUTPUT_FILE); the previous value named a file this notebook never creates.
INPUT_JSON  = 'employment_2022_LSOA21.json'
OUTPUT_JSON = 'lsoa21_employment_2022_normalized.json'      # one record per LSOA21, z-score normalized per column

# ---- Load ----
with open(INPUT_JSON, 'r') as f:
    data = json.load(f)

if not data:
    # Fail with a readable message instead of a bare StopIteration below.
    raise ValueError(f"{INPUT_JSON} contains no LSOA21 records")

# Sanity: infer vector lengths from the first record (reported at the end of the cell)
first_key = next(iter(data))
lv1_len = len(data[first_key]['employment_lv1'])
lv2_len = len(data[first_key]['employment_lv2'])
lv3_len = len(data[first_key]['employment_lv3'])

# Build one matrix per level, rows in a single consistent key order so that
# row i of every matrix belongs to keys[i].
keys = list(data.keys())
mat_lv1 = np.array([data[k]['employment_lv1'] for k in keys], dtype=float)  # expected (N, 1)
mat_lv2 = np.array([data[k]['employment_lv2'] for k in keys], dtype=float)  # expected (N, 21)
mat_lv3 = np.array([data[k]['employment_lv3'] for k in keys], dtype=float)  # expected (N, 57)

# def minmax(arr: np.ndarray) -> np.ndarray:
#     lo = np.nanmin(arr, axis=0)
#     hi = np.nanmax(arr, axis=0)
#     rng = np.where((hi - lo) == 0.0, 1.0, (hi - lo))
#     return (arr - lo) / rng

# # ---- Normalize per level (column-wise) ----
# norm_lv1 = minmax(mat_lv1)
# norm_lv2 = minmax(mat_lv2)
# norm_lv3 = minmax(mat_lv3)

def zscore(arr: np.ndarray) -> np.ndarray:
    """Standardise `arr` along axis 0, ignoring NaNs when computing the statistics.

    Each entry has the NaN-aware mean (taken over axis 0) subtracted and is
    divided by the NaN-aware standard deviation. Where that standard deviation
    is exactly zero the divisor is replaced by 1, so such entries come out as
    0 rather than NaN/inf. NaN entries in the input stay NaN.
    """
    centre = np.nanmean(arr, axis=0)
    spread = np.nanstd(arr, axis=0)
    divisor = np.where(spread == 0.0, 1.0, spread)  # guard against division by zero
    centred = arr - centre
    return centred / divisor

# ---- Z-score each level independently, column by column ----
norm_lv1, norm_lv2, norm_lv3 = (zscore(matrix) for matrix in (mat_lv1, mat_lv2, mat_lv3))

# ---- Re-assemble one record per area code ----
# Row order of the matrices follows `keys`. Note the output uses the short
# keys 'lv1'/'lv2'/'lv3', not the 'employment_lv*' keys of the input file.
out = {
    code: {
        'lv1': row_lv1.tolist(),
        'lv2': row_lv2.tolist(),
        'lv3': row_lv3.tolist(),
    }
    for code, row_lv1, row_lv2, row_lv3 in zip(keys, norm_lv1, norm_lv2, norm_lv3)
}

with open(OUTPUT_JSON, 'w') as f:
    json.dump(out, f, indent=2)

print(f"Saved normalized JSON with {len(out)} LSOA21 entries → {OUTPUT_JSON}")
print(f"Lengths: lv1={lv1_len}, lv2={lv2_len}, lv3={lv3_len}")

Saved normalized JSON with 35672 LSOA21 entries → lsoa21_employment_2022_normalized.json
Lengths: lv1=1, lv2=21, lv3=57
