# In [3]:
import json
import numpy as np
import pandas as pd

# -------- INPUT / OUTPUT --------
# INPUT_JSON may point at either the raw or the normalized feature file.
INPUT_JSON = "lsoa21_features_raw.json"
# Destination of the per-variable descriptive statistics table.
OUTPUT_CSV = "descriptive_stats_all_features.csv"

# -------- LOAD --------
# Read the whole file as UTF-8 text, then parse it in one go.
with open(INPUT_JSON, mode="r", encoding="utf-8") as f:
    DATA_JSON = json.loads(f.read())

# -------- VARIABLE NAMES (PLAIN LANGUAGE, ORDER EXACT) --------
# Each list below gives one human-readable label per feature. The labels are
# later paired positionally with the feature columns, so their order matters.
AGE_GROUPS = ["0–15", "16–24", "25–49", "50–64", "65+"]

# Population lv3 (18): 3 headline totals, then the 5 age bands repeated for
# total / male / female (group-major, age-minor).
pop_names = [
    "Population total",
    "Population male total",
    "Population female total",
    *(
        f"Population {group} age {band}"
        for group in ("total", "male", "female")
        for band in AGE_GROUPS
    ),
]
assert len(pop_names) == 18

# Employment lv3 (57): 3 headline totals, then the 18 sectors repeated for
# total / full time / part time.
SECTORS_PLAIN = [
    "Agriculture, forestry and fishing",
    "Mining, quarrying and utilities",
    "Manufacturing",
    "Construction",
    "Motor trades",
    "Wholesale",
    "Retail",
    "Transport and storage (including postal)",
    "Accommodation and food services",
    "Information and communication",
    "Financial and insurance",
    "Property",
    "Professional, scientific and technical",
    "Business administration and support services",
    "Public administration and defence",
    "Education",
    "Health",
    "Arts, entertainment, recreation and other services",
]
emp_names = [
    "Employment total (all)",
    "Employment full time (all)",
    "Employment part time (all)",
    *(
        f"Employment {basis} in {sector}"
        for basis in ("total", "full time", "part time")
        for sector in SECTORS_PLAIN
    ),
]
assert len(emp_names) == 57

# Households lv3 (33): 3 headline totals, then the 10 household types repeated
# for all households / no car or van / with a car or van.
HH_TYPES_PLAIN = [
    "One-person household",
    "One-person household age 66+",
    "One-person household other",
    "Single family household",
    "Couple family household",
    "Couple family household with no children",
    "Couple family household with dependent children",
    "Couple family household with all children non-dependent",
    "Lone parent household",
    "Other household type",
]
hh_names = [
    "Households total",
    "Households with no car or van",
    "Households with a car or van",
    *(
        f"Households {hh_type}{car_suffix}"
        for car_suffix in ("", " with no car or van", " with a car or van")
        for hh_type in HH_TYPES_PLAIN
    ),
]
assert len(hh_names) == 33

# Area + pop density (2)
apd_names = [
    "Area (square kilometres)",
    "Population density (people per square kilometre)",
]

# Land use (4)
land_names = [
    "Land use commercial area",
    "Land use industrial area",
    "Land use residential area",
    "Land use retail area",
]

# POI (5)
poi_names = [
    "POI education count",
    "POI food count",
    "POI health count",
    "POI retail count",
    "POI transport count",
]

# IMD (2)
imd_names = [
    "IMD 2019 income domain score",
    "IMD 2019 employment domain score",
]

# Full label list, concatenated in the same group order as the feature matrix.
var_names = [
    *pop_names,
    *emp_names,
    *hh_names,
    *apd_names,
    *land_names,
    *poi_names,
    *imd_names,
]
# Expected total, written out per group so a change in any group size is visible.
N_EXPECT = sum((18, 57, 33, 2, 4, 5, 2))
assert len(var_names) == N_EXPECT

# -------- EXTRACT FEATURE MATRIX (EXACT ORDER) --------
# Build one row of floats per record by concatenating its feature groups in
# the same order as var_names. "population", "employment" and "households"
# are required (a missing key raises KeyError); the remaining groups are
# optional and contribute nothing when absent or empty.
#
# Row lengths are validated per record: a ragged list of rows would otherwise
# make np.asarray(..., dtype=float) fail with an error that does not say which
# record is short, and an empty input would produce a 1-D array with no
# column axis to check.
if not DATA_JSON:
    raise ValueError("Feature count mismatch: input JSON contains no records")

rows = []
for key, rec in DATA_JSON.items():
    parts = [
        float(x)
        for group in (
            rec["population"]["lv3"],
            rec["employment"]["lv3"],
            rec["households"]["lv3"],
            rec.get("area_popdensity") or [],
            rec.get("land_use") or [],
            rec.get("poi") or [],
            rec.get("imd") or [],
        )
        for x in group
    ]
    if len(parts) != len(var_names):
        raise ValueError(
            f"Feature count mismatch for record {key!r}: "
            f"row has {len(parts)} values, names has {len(var_names)}"
        )
    rows.append(parts)

# Every row has len(var_names) entries, so X is 2-D: (records, features).
X = np.asarray(rows, dtype=float)

# -------- STATS --------
# NaN-ignoring reductions down each column of X (axis=0), one value per
# feature. np.nanstd is called with its default settings.
means, stds, mins, medians, maxs = (
    reducer(X, axis=0)
    for reducer in (np.nanmean, np.nanstd, np.nanmin, np.nanmedian, np.nanmax)
)

# One row per feature: a 1-based running number, the plain-language name,
# then the five statistics.
df = pd.DataFrame(
    {
        "variable index": np.arange(X.shape[1]) + 1,
        "variable name": var_names,
    }
).assign(mean=means, std=stds, min=mins, median=medians, max=maxs)

# -------- ROUNDING RULES (MEAN/STD only for count-based blocks) --------
# Positional row ranges of each feature group in df (0-based, end-exclusive),
# in the same order the groups were concatenated into var_names.
i_pop0, i_pop1 = 0, 18
i_emp0, i_emp1 = i_pop1, i_pop1 + 57
i_hh0,  i_hh1  = i_emp1, i_emp1 + 33
i_apd0, i_apd1 = i_hh1,  i_hh1 + 2
i_land0,i_land1= i_apd1, i_apd1 + 4
i_poi0, i_poi1 = i_land1,i_land1 + 5
i_imd0, i_imd1 = i_poi1, i_poi1 + 2

_CENTRE_SPREAD = ["mean", "std"]
_ORDER_STATS = ["min", "median", "max"]
_ALL_STATS = _CENTRE_SPREAD + _ORDER_STATS


def _rows(start, stop):
    """Return the index labels of the df rows at positions [start, stop).

    df.loc treats a slice as label-based and includes BOTH endpoints, so
    df.loc[slice(start, stop)] would also touch the first row of the next
    feature group (e.g. the households rule would round the area row).
    Picking the labels by position keeps every range end-exclusive.
    """
    return df.index[start:stop]


def round_cols(row_idx, cols, ndigits):
    """Round the selected cells of df in place to ``ndigits`` decimals.

    row_idx: any row indexer accepted by df.loc (a single label, a list or
        Index of labels, or a label slice -- note label slices include
        their end label).
    cols: list of column names to round.
    ndigits: number of decimal places to keep.

    Values are passed through pd.to_numeric(errors="coerce") first, so
    entries that cannot be parsed as numbers become NaN.
    """
    df.loc[row_idx, cols] = (
        df.loc[row_idx, cols]
        .apply(pd.to_numeric, errors="coerce")
        .round(ndigits)
    )


def _round_to_whole(row_idx, cols):
    """Round the selected cells of df in place to whole numbers."""
    df.loc[row_idx, cols] = df.loc[row_idx, cols].round(0).astype("Int64")


# Population / Employment / Households: mean/std to 0.1; min/median/max are
# rounded to whole numbers.
for a, b in [(i_pop0, i_pop1), (i_emp0, i_emp1), (i_hh0, i_hh1)]:
    round_cols(_rows(a, b), _CENTRE_SPREAD, 1)
    _round_to_whole(_rows(a, b), _ORDER_STATS)

# Area & pop density: area to 0.001, popden to 0.1 (all stats rounded)
# area
round_cols(df.index[i_apd0], _ALL_STATS, 3)
# pop density
round_cols(df.index[i_apd0 + 1], _ALL_STATS, 1)

# Land use: whole numbers (all stats)
_round_to_whole(_rows(i_land0, i_land1), _ALL_STATS)

# POI: mean/std to 0.01; min/median/max to whole numbers
round_cols(_rows(i_poi0, i_poi1), _CENTRE_SPREAD, 2)
_round_to_whole(_rows(i_poi0, i_poi1), _ORDER_STATS)

# IMD: 0.001 for all stats
round_cols(_rows(i_imd0, i_imd1), _ALL_STATS, 3)

# -------- SAVE --------
# Write the table without the DataFrame's own index, then report how many
# rows (one per variable) were written and where.
df.to_csv(path_or_buf=OUTPUT_CSV, index=False)
print(f"Saved descriptive stats for {len(df)} variables → {OUTPUT_CSV}")

# Output: Saved descriptive stats for 121 variables → descriptive_stats_all_features.csv
