In [None]:
import pandas as pd
import json
import numpy as np

# -------- Paths --------
INPUT_FILE = "RM008_Car or van availability_household composition.csv"
OUTPUT_FILE = "lsoa21_households_normalized.json"

# -------- Block layout inside INPUT_FILE --------
# The same CSV is read three times at different line offsets, once per table.
# NOTE(review): the offsets below are copied from the original hard-coded values;
# they presumably match the metadata/blank lines of this particular export —
# re-check them if INPUT_FILE is ever regenerated.
ROWS_PER_BLOCK = 35672                               # data rows read from each table
SKIP_TO_ALL = 8                                      # lines skipped before the first table's header
SKIP_TO_NOCAR = SKIP_TO_ALL + ROWS_PER_BLOCK + 23    # lines skipped before the second table's header
SKIP_TO_WITHCAR = SKIP_TO_NOCAR + ROWS_PER_BLOCK + 24  # lines skipped before the third table's header

# Everything is read as text (dtype=str); numeric conversion happens later.
df_all = pd.read_csv(INPUT_FILE, skiprows=SKIP_TO_ALL, nrows=ROWS_PER_BLOCK, dtype=str)
df_nocar = pd.read_csv(INPUT_FILE, skiprows=SKIP_TO_NOCAR, nrows=ROWS_PER_BLOCK, dtype=str)
df_withcar = pd.read_csv(INPUT_FILE, skiprows=SKIP_TO_WITHCAR, nrows=ROWS_PER_BLOCK, dtype=str)


import numpy as np
import pandas as pd
import json

# --- 0) Standardize headers ---
def std_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the area-code and area-name columns renamed.

    ``"mnemonic"`` becomes ``"LSOA21CD"`` and
    ``"2021 super output area - lower layer"`` becomes ``"name"``; either
    rename is skipped when its source column is absent. The input frame is
    not modified.

    Raises:
        ValueError: if ``df`` has no ``"Total"`` column, or if no
            ``"LSOA21CD"`` column exists after renaming.
    """
    if "Total" not in df.columns:
        raise ValueError("Expected 'Total' column not found.")

    wanted = {
        "mnemonic": "LSOA21CD",
        "2021 super output area - lower layer": "name",
    }
    # Only rename the headers that are actually present.
    present = {old: new for old, new in wanted.items() if old in df.columns}
    renamed = df.rename(columns=present)

    if "LSOA21CD" not in renamed.columns:
        raise ValueError("Expected 'mnemonic' (LSOA21CD) not found.")
    return renamed

# Apply the same header standardisation to all three tables.
df_all, df_nocar, df_withcar = (
    std_headers(frame) for frame in (df_all, df_nocar, df_withcar)
)

# --- 1) Identify composition columns (exclude non-numeric headers) ---
# Every column of df_all that is not the area code, the area name, or the
# row total is treated as a composition column; df_all's column order is kept.
comp_cols = [
    col for col in df_all.columns
    if col not in {"LSOA21CD", "name", "Total"}
]
# sanity check: expect 10 composition columns
if len(comp_cols) != 10:
    raise ValueError(f"Expected 10 composition columns; got {len(comp_cols)} -> {comp_cols}")

# --- 2) Coerce numeric for Total + composition in all three DFs ---
def coerce_numeric_cols(df: pd.DataFrame, comp_cols: list[str]) -> pd.DataFrame:
    """Return a copy of ``df`` with ``"Total"`` and ``comp_cols`` converted to int.

    Values that cannot be parsed as numbers (including missing values) become 0.
    The conversion is done on a copy, so the frame passed in is left untouched
    and re-running the calling cell always starts from the same input.

    Args:
        df: frame containing a ``"Total"`` column and every column in ``comp_cols``.
        comp_cols: names of the composition columns to convert.

    Returns:
        A new DataFrame; all other columns are carried over unchanged.

    Raises:
        KeyError: if ``"Total"`` or any of ``comp_cols`` is missing from ``df``.
    """
    out = df.copy()
    for col in ["Total", *comp_cols]:
        # errors="coerce" turns unparsable text into NaN, which is then zero-filled
        # so the integer cast cannot fail.
        out[col] = pd.to_numeric(out[col], errors="coerce").fillna(0).astype(int)
    return out

# Convert the count columns of all three tables to integers.
df_all, df_nocar, df_withcar = (
    coerce_numeric_cols(frame, comp_cols)
    for frame in (df_all, df_nocar, df_withcar)
)

# --- 3) Keep only LSOA21CD, Total, comp cols; align column order across blocks ---
# --- 4) Set index and align on common LSOAs ---
# df_all defines the column order; the other two tables are reindexed to it so
# that position i means the same column in every table.
keep_cols = ["LSOA21CD", "Total", *comp_cols]
df_all     = df_all[keep_cols].set_index("LSOA21CD")
df_nocar   = df_nocar.reindex(columns=keep_cols).set_index("LSOA21CD")
df_withcar = df_withcar.reindex(columns=keep_cols).set_index("LSOA21CD")

# Restrict every table to the area codes present in all three, in one shared order.
common = df_all.index.intersection(df_nocar.index).intersection(df_withcar.index)
df_all, df_nocar, df_withcar = (
    frame.loc[common] for frame in (df_all, df_nocar, df_withcar)
)

# --- 5) Build lv1, lv2, lv3 (RAW) ---
# Column layout (lv1 and lv2 are prefixes of lv3):
#   lv1 = [total]                                                   -> 1
#   lv2 = [total, car0 total, car1 total] + composition of df_all   -> 1+2+10 = 13
#   lv3 = lv2 + composition of df_nocar + composition of df_withcar -> 13+20 = 33
# The three frames were all selected with .loc[common], so they share the same
# index in the same order and can be stacked positionally in one shot instead
# of doing one scalar .at lookup per cell.
lv3_mat = np.hstack([
    df_all[["Total"]].to_numpy(),
    df_nocar[["Total"]].to_numpy(),
    df_withcar[["Total"]].to_numpy(),
    df_all[comp_cols].to_numpy(),
    df_nocar[comp_cols].to_numpy(),
    df_withcar[comp_cols].to_numpy(),
])
# Fail fast if any block is not integer-typed (e.g. a column that ended up NaN).
if not np.issubdtype(lv3_mat.dtype, np.integer):
    raise ValueError(f"Expected integer counts in all blocks; got dtype {lv3_mat.dtype}")

n_lv2 = 3 + len(comp_cols)  # width of the lv2 prefix inside an lv3 row

# .tolist() yields plain Python ints, so the records stay JSON-serialisable.
records = {
    lsoa: {"lv1": row[:1], "lv2": row[:n_lv2], "lv3": row}
    for lsoa, row in zip(df_all.index, lv3_mat.tolist())
}

# --- 6) Normalize per level (column-wise): z-score by default, min–max optional ---
def normalize_block(arr_like, method: str = "zscore"):
    """Normalize each column of ``arr_like`` independently.

    Args:
        arr_like: anything ``np.asarray`` can turn into a float array; rows are
            observations and columns are features (statistics are taken along axis 0).
        method: ``"zscore"`` (default) subtracts the column mean and divides by the
            column standard deviation (``np.nanstd``); ``"minmax"`` subtracts the
            column minimum and divides by the column range.

    Returns:
        A float ``numpy.ndarray`` with the same shape as the input. NaNs are ignored
        when computing the column statistics. Columns whose scale is exactly zero
        (constant columns) are divided by 1 instead, so they come out as all zeros.

    Raises:
        ValueError: if ``method`` is not ``"zscore"`` or ``"minmax"``.
    """
    arr = np.asarray(arr_like, dtype=float)
    if method == "zscore":
        center = np.nanmean(arr, axis=0)
        scale = np.nanstd(arr, axis=0)
    elif method == "minmax":
        center = np.nanmin(arr, axis=0)
        scale = np.nanmax(arr, axis=0) - center
    else:
        raise ValueError(f"Unknown normalization method: {method!r}")
    scale = np.where(scale == 0.0, 1.0, scale)  # avoid divide-by-zero
    return (arr - center) / scale

# Stack each level into a row-per-LSOA matrix; row order follows `keys`.
keys = list(records)
mat_lv1, mat_lv2, mat_lv3 = (
    [records[k][level] for k in keys] for level in ("lv1", "lv2", "lv3")
)

# Each level is normalized on its own, column by column.
lv1_norm = normalize_block(mat_lv1)
lv2_norm = normalize_block(mat_lv2)
lv3_norm = normalize_block(mat_lv3)

# Overwrite the raw lists with their normalized counterparts (as plain lists
# so the records remain JSON-serialisable).
for k, row1, row2, row3 in zip(keys, lv1_norm, lv2_norm, lv3_norm):
    records[k].update(lv1=row1.tolist(), lv2=row2.tolist(), lv3=row3.tolist())

# --- 7) Save ---
# NOTE(review): OUTPUT_FILE in the Paths section is "lsoa21_households_normalized.json",
# but this cell writes to the literal path below and never uses OUTPUT_FILE —
# confirm which file name downstream code expects.
with open("households_normalized.json", "w", encoding="utf-8") as fh:
    json.dump(records, fh, indent=2, ensure_ascii=False)

print(f"Saved {len(records):,} LSOA21CD entries → households_normalized.json")
# Report the vector length of each level, taken from the first record.
first_record = next(iter(records.values()))
print("Lengths:", *(len(first_record[level]) for level in ("lv1", "lv2", "lv3")))

In [1]:
import pandas as pd
import json
import numpy as np

# -------- Paths --------
INPUT_FILE  = "RM008_Car or van availability_household composition.csv"
OUTPUT_FILE = "lsoa21_households_raw.json"

# -------- Block layout inside INPUT_FILE --------
# The same CSV is read three times at different line offsets, once per table.
# NOTE(review): the offsets below are copied from the original hard-coded values;
# they presumably match the metadata/blank lines of this particular export —
# re-check them if INPUT_FILE is ever regenerated.
ROWS_PER_BLOCK = 35672                               # data rows read from each table
SKIP_TO_ALL = 8                                      # lines skipped before the first table's header
SKIP_TO_NOCAR = SKIP_TO_ALL + ROWS_PER_BLOCK + 23    # lines skipped before the second table's header
SKIP_TO_WITHCAR = SKIP_TO_NOCAR + ROWS_PER_BLOCK + 24  # lines skipped before the third table's header

# -------- Load three blocks --------
# Everything is read as text (dtype=str); numeric conversion happens later.
df_all     = pd.read_csv(INPUT_FILE, skiprows=SKIP_TO_ALL, nrows=ROWS_PER_BLOCK, dtype=str)
df_nocar   = pd.read_csv(INPUT_FILE, skiprows=SKIP_TO_NOCAR, nrows=ROWS_PER_BLOCK, dtype=str)
df_withcar = pd.read_csv(INPUT_FILE, skiprows=SKIP_TO_WITHCAR, nrows=ROWS_PER_BLOCK, dtype=str)

# -------- 0) Standardise headers --------
def std_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Return a renamed copy of ``df`` with standard area-code / area-name headers.

    Renames ``"mnemonic"`` to ``"LSOA21CD"`` and
    ``"2021 super output area - lower layer"`` to ``"name"`` when those columns
    exist. ``df`` itself is left as it was.

    Raises:
        ValueError: when ``"Total"`` is missing from ``df``, or when the result
            has no ``"LSOA21CD"`` column.
    """
    columns = df.columns
    if "Total" not in columns:
        raise ValueError("Expected 'Total' column not found.")

    # (old header, new header) pairs; absent old headers are simply skipped.
    pairs = (
        ("mnemonic", "LSOA21CD"),
        ("2021 super output area - lower layer", "name"),
    )
    result = df.rename(columns={old: new for old, new in pairs if old in columns})

    if "LSOA21CD" not in result.columns:
        raise ValueError("Expected 'mnemonic' (LSOA21CD) not found.")
    return result

# Apply the same header standardisation to all three tables.
df_all, df_nocar, df_withcar = (
    std_headers(frame) for frame in (df_all, df_nocar, df_withcar)
)

# -------- 1) Identify composition columns --------
# Every column of df_all other than the area code, area name, and row total;
# df_all's column order is kept. Exactly 10 such columns are expected.
comp_cols = [
    col for col in df_all.columns
    if col not in {"LSOA21CD", "name", "Total"}
]
if len(comp_cols) != 10:
    raise ValueError(f"Expected 10 composition columns; got {len(comp_cols)}")

# -------- 2) Coerce numeric --------
def coerce_numeric_cols(df: pd.DataFrame, comp_cols) -> pd.DataFrame:
    """Return a copy of ``df`` with ``"Total"`` and ``comp_cols`` converted to int.

    Values that cannot be parsed as numbers (including missing values) become 0.
    The conversion is done on a copy, so the frame passed in is left untouched
    and re-running the calling cell always starts from the same input.

    Args:
        df: frame containing a ``"Total"`` column and every column in ``comp_cols``.
        comp_cols: list of composition column names to convert.

    Returns:
        A new DataFrame; all other columns are carried over unchanged.

    Raises:
        KeyError: if ``"Total"`` or any of ``comp_cols`` is missing from ``df``.
    """
    out = df.copy()
    for col in ["Total", *comp_cols]:
        # errors="coerce" turns unparsable text into NaN, which is then zero-filled
        # so the integer cast cannot fail.
        out[col] = pd.to_numeric(out[col], errors="coerce").fillna(0).astype(int)
    return out

# Convert the count columns of all three tables to integers.
df_all, df_nocar, df_withcar = (
    coerce_numeric_cols(frame, comp_cols)
    for frame in (df_all, df_nocar, df_withcar)
)

# -------- 3) Align columns --------
# -------- 4) Align on common LSOAs --------
# df_all defines the column order; the other two tables are reindexed to it so
# that position i means the same column in every table.
keep_cols = ["LSOA21CD", "Total", *comp_cols]
df_all     = df_all[keep_cols].set_index("LSOA21CD")
df_nocar   = df_nocar.reindex(columns=keep_cols).set_index("LSOA21CD")
df_withcar = df_withcar.reindex(columns=keep_cols).set_index("LSOA21CD")

# Restrict every table to the area codes present in all three, in one shared order.
common = df_all.index.intersection(df_nocar.index).intersection(df_withcar.index)
df_all, df_nocar, df_withcar = (
    frame.loc[common] for frame in (df_all, df_nocar, df_withcar)
)

# -------- 5) Build RAW lv1 / lv2 / lv3 --------
# Column layout (lv1 and lv2 are prefixes of lv3):
#   lv1 = [total]                                                   -> 1
#   lv2 = [total, car0 total, car1 total] + composition of df_all   -> 13
#   lv3 = lv2 + composition of df_nocar + composition of df_withcar -> 33
# The three frames were all selected with .loc[common], so they share the same
# index in the same order and can be stacked positionally in one shot instead
# of doing one scalar .at lookup per cell.
lv3_mat = np.hstack([
    df_all[["Total"]].to_numpy(),
    df_nocar[["Total"]].to_numpy(),
    df_withcar[["Total"]].to_numpy(),
    df_all[comp_cols].to_numpy(),
    df_nocar[comp_cols].to_numpy(),
    df_withcar[comp_cols].to_numpy(),
])
# Fail fast if any block is not integer-typed (e.g. a column that ended up NaN).
if not np.issubdtype(lv3_mat.dtype, np.integer):
    raise ValueError(f"Expected integer counts in all blocks; got dtype {lv3_mat.dtype}")

n_lv2 = 3 + len(comp_cols)  # width of the lv2 prefix inside an lv3 row

# .tolist() yields plain Python ints, so the records stay JSON-serialisable.
records = {
    lsoa: {"lv1": row[:1], "lv2": row[:n_lv2], "lv3": row}
    for lsoa, row in zip(df_all.index, lv3_mat.tolist())
}

# -------- 6) Save --------
# Write all records to OUTPUT_FILE as indented UTF-8 JSON.
with open(OUTPUT_FILE, "w", encoding="utf-8") as fh:
    json.dump(records, fh, indent=2, ensure_ascii=False)

print(f"Saved {len(records):,} LSOA21CD entries → {OUTPUT_FILE}")
# Report the vector length of each level, taken from the first record.
first_record = next(iter(records.values()))
print("Lengths:", *(len(first_record[level]) for level in ("lv1", "lv2", "lv3")))

Saved 35,672 LSOA21CD entries → lsoa21_households_raw.json
Lengths: 1 13 33
