In [2]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -------- INPUT --------
# Path of the per-record feature JSON to read (the raw file here; a normalized
# version can be substituted) and path of the heatmap file to write.
INPUT_JSON = "lsoa21_features_raw.json"   # or normalized version
OUT_PDF = "feature_correlation_heatmap.pdf"

# -------- LOAD --------
# Parse the whole file into memory; the result is iterated with .values()
# further down, so the top-level JSON value is expected to be an object.
with open(INPUT_JSON, mode="r", encoding="utf-8") as fh:
    DATA_JSON = json.load(fh)

# -------- REBUILD FEATURE MATRIX (SAME ORDER AS BEFORE) --------
# Each record is flattened into one row by concatenating its feature groups in
# a fixed order. Expected group sizes (per the original notes) are in brackets:
#   1) population lv3 (18)
#   2) employment lv3 (57)
#   3) households lv3 (33)
#   4) area + population density (2)
#   5) land use (4)
#   6) POI (5)
#   7) IMD (2)
# Groups 1-3 are required (a missing key raises KeyError); groups 4-7 are
# optional and contribute no columns when absent or empty.
rows = []
for key, rec in DATA_JSON.items():
    parts = []

    # Required groups: the values live under the "lv3" key of each group.
    for group in ("population", "employment", "households"):
        parts.extend(float(x) for x in rec[group]["lv3"])

    # Optional groups: stored directly as flat lists on the record.
    for group in ("area_popdensity", "land_use", "poi", "imd"):
        parts.extend(float(x) for x in (rec.get(group) or []))

    # All rows must have the same width or the matrix cannot be built. Fail
    # here, naming the offending record, instead of letting NumPy raise an
    # opaque "inhomogeneous shape" error after the loop.
    if rows and len(parts) != len(rows[0]):
        raise ValueError(
            f"Record {key!r} has {len(parts)} features but earlier records "
            f"have {len(rows[0])}; check its optional feature groups."
        )

    rows.append(parts)

# With no records the array would be 1-D and X.shape[1] would fail with an
# unhelpful IndexError, so stop with an explicit message instead.
if not rows:
    raise ValueError("Input JSON contains no records; cannot build a feature matrix.")

X = np.asarray(rows, dtype=float)
n_features = X.shape[1]
print(f"Matrix shape: {X.shape}")

# -------- CORRELATION --------
# Columns are labelled "1".."n_features" (1-based position in the row layout).
# The matrix goes through pandas because DataFrame.corr skips NaN entries
# pairwise rather than propagating them.
column_labels = list(map(str, range(1, n_features + 1)))
df = pd.DataFrame(data=X, columns=column_labels)
corr = df.corr(method="pearson")

# -------- PLOT --------
# Draw the correlation matrix as an image on a single axes, using the explicit
# Figure/Axes interface.
fig, ax = plt.subplots(figsize=(12, 10))

# Fix the colour scale to the full Pearson range so 0 sits at the colormap centre.
im = ax.imshow(corr, cmap="coolwarm", vmin=-1, vmax=1)
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="Pearson correlation")

# ticks = variable index (one tick per feature, labelled with its column name)
ticks = np.arange(n_features)
ax.set_xticks(ticks)
ax.set_xticklabels(df.columns, fontsize=6, rotation=90)
ax.set_yticks(ticks)
ax.set_yticklabels(df.columns, fontsize=6)

ax.set_title("Feature correlation heatmap (indexed order)")
fig.tight_layout()

# Write the figure to disk, then release it so it is not kept open.
fig.savefig(OUT_PDF)
plt.close(fig)

print(f"Saved correlation heatmap → {OUT_PDF}")

Matrix shape: (35672, 121)
Saved correlation heatmap → feature_correlation_heatmap.pdf
