# In [1]:
import os
import pandas as pd
import json
from collections import defaultdict
import numpy as np  # <-- added

# --- Configuration ----------------------------------------------------------
# Folder holding the per-stratum CSVs ('' resolves relative to the working dir)
POPULATION_FOLDER = ''
# Destination for the per-area normalized feature vectors
OUTPUT_FILE = 'lsoa21_population_2022_normalized.json'

# Age bands, in the order they occupy slots 3-7 of each feature vector
AGE_GROUPS = ['0_to_15', '16_to_24', '25_to_49', '50_to_64', '65+']
# Name of the CSV column that is read from every input file
YEAR = '2022'

# One CSV per (sex, age band) stratum: every male band first, then every
# female band. A file's position in this list is its stratum index.
POPULATION_FILES = [
    f'population_{sex}_aged_{band}.csv'
    for sex in ('male', 'female')
    for band in AGE_GROUPS
]

# Per-area accumulator; a fresh list of 18 zeroed counters on first access:
#   [0] total, [1] male, [2] female, [3-7] age bands, [8-17] sex x age strata
lsoa_values = defaultdict(lambda: [0] * 18)

# Process each file. A file's position in POPULATION_FILES is its stratum
# index, which fixes its slot (8 + index) in the 18-value vector; enumerate
# yields that index directly instead of searching the list on every pass.
for stratum_idx, file in enumerate(POPULATION_FILES):
    file_path = os.path.join(POPULATION_FOLDER, file)
    df = pd.read_csv(file_path)
    df = df[['mnemonic', YEAR]]

    # Parse stratum from the file name: population_<sex>_aged_<age>.csv
    parts = file.replace('.csv', '').split('_')
    sex = parts[1]  # 'male' or 'female'
    # '65+' is a single token; the other bands span three tokens ('0', 'to', '15')
    age = parts[-1] if '+' in parts[-1] else '_'.join(parts[-3:])
    sex_idx = 1 if sex == 'male' else 2
    age_idx = AGE_GROUPS.index(age)

    # Walk the two columns in lockstep. Unlike DataFrame.iterrows(), this does
    # not build a Series per row and does not coerce the area code and the
    # count to a common dtype.
    for lsoa, val in zip(df['mnemonic'], df[YEAR]):
        if pd.isna(val):
            continue  # missing counts contribute nothing
        val = int(val)

        counts = lsoa_values[lsoa]  # created as 18 zeros on first sight

        # Update lv3 (10 strata: index 8-17)
        counts[8 + stratum_idx] += val

        # Update lv1 and lv2:
        counts[0] += val                # total
        counts[sex_idx] += val          # male or female
        counts[3 + age_idx] += val      # age group

# ---------------- Z-SCORE NORMALIZATION: INPUT MATRICES ----------------
# Freeze the key order once so that row i of every matrix belongs to keys[i].
keys = list(lsoa_values)
feature_rows = [lsoa_values[k] for k in keys]

# Three nested prefixes of the same 18-value vectors, one row per key
mat_lv3 = np.array(feature_rows, dtype=float)                       # shape (N, 18)
mat_lv2 = np.array([r[:8] for r in feature_rows], dtype=float)      # shape (N, 8)
mat_lv1 = np.array([[r[0]] for r in feature_rows], dtype=float)     # shape (N, 1)

# Earlier min-max scaling, left disabled for reference; zscore() below is what runs.
# def minmax(arr):
#     lo = np.min(arr, axis=0)
#     hi = np.max(arr, axis=0)
#     rng = np.where((hi - lo) == 0, 1.0, (hi - lo))
#     return (arr - lo) / rng

# lv1_norm = minmax(mat_lv1)
# lv2_norm = minmax(mat_lv2)
# lv3_norm = minmax(mat_lv3)

def zscore(arr):
    """Standardize ``arr`` along axis 0, ignoring NaNs in the statistics.

    Args:
        arr: Array-like of numbers; mean and standard deviation are taken
            along axis 0 with NaN entries skipped.

    Returns:
        A ``(scaled, mean, std)`` tuple: the input minus ``mean`` divided by
        ``std``, the NaN-aware mean, and the NaN-aware standard deviation
        with exact zeros replaced by 1.0.
    """
    center = np.nanmean(arr, axis=0)
    spread = np.nanstd(arr, axis=0)
    # A constant column has zero spread; dividing by 1.0 instead keeps its
    # centered values at 0 rather than producing NaN/inf.
    scale = np.where(spread != 0.0, spread, 1.0)
    scaled = (arr - center) / scale
    return scaled, center, scale

# Standardize each level on its own; the returned mean/std describe the
# columns of the matrix that was passed in.
lv1_norm, lv1_mu, lv1_std = zscore(mat_lv1)
lv2_norm, lv2_mu, lv2_std = zscore(mat_lv2)
lv3_norm, lv3_mu, lv3_std = zscore(mat_lv3)
# ----------------------------------------------------------------

# Assemble the per-area record from the standardized rows; keys and the
# three matrices share the same row order, so they can be walked together.
output_json = {
    lsoa: {
        'lv1': total_row.tolist(),    # total
        'lv2': coarse_row.tolist(),   # total + male + female + 5 age groups
        'lv3': full_row.tolist(),     # lv2 + 10 sex x age strata
    }
    for lsoa, total_row, coarse_row, full_row in zip(
        keys, lv1_norm, lv2_norm, lv3_norm
    )
}

# Write the standardized vectors to disk
with open(OUTPUT_FILE, 'w') as out_fh:
    json.dump(output_json, out_fh, indent=2)

# Destination for the standardization parameters
OUTPUT_PARAMS_FILE = 'lsoa21_population_2022_normalized_params.json'

# Per-column mean/std of the 18 lv3 features, plus a label for each column so
# the values can be mapped back to a stratum.
params = {
    "lv3": {
        "mu": lv3_mu.tolist(),
        "std": lv3_std.tolist(),
        "feature_order": [
            # slots 0-2: overall totals
            "total_all",
            "male_all",
            "female_all",
            # slots 3-7: age bands for everyone; 8-12: male; 13-17: female
            *(
                f"{group}_{band}"
                for group in ("total", "male", "female")
                for band in ("0_15", "16_24", "25_49", "50_64", "65+")
            ),
        ],
    }
}

with open(OUTPUT_PARAMS_FILE, 'w') as params_fh:
    json.dump(params, params_fh, indent=2)