In [1]:
import geopandas as gpd
import osmium
import shapely.wkb as wkblib
from shapely.geometry import Point
from tqdm import tqdm
import json
import numpy as np

# === Step 1: Read in education POI points ===
class EducationPOIHandler(osmium.SimpleHandler):
    """Collect one shapely point per node read from an OSM file."""

    def __init__(self):
        super().__init__()
        # Factory that encodes an OSM node location as WKB.
        self.wkb_factory = osmium.geom.WKBFactory()
        # Geometries of all nodes that were converted successfully.
        self.points = []
        # Number of nodes dropped because the conversion raised; exposed so the
        # amount of discarded data can be inspected instead of vanishing silently.
        self.skipped = 0

    def node(self, n):
        """Convert node ``n`` to a point and store it; count it as skipped on failure."""
        # NOTE(review): every node handed to this callback is kept, whether or
        # not it carries tags. If the extract also contains untagged nodes that
        # only act as way vertices they are counted as POIs too — confirm how
        # the .pbf file was produced.
        try:
            wkb = self.wkb_factory.create_point(n)
            geom = wkblib.loads(wkb, hex=True)
        except Exception:
            # Best-effort: a node whose geometry cannot be built is left out.
            self.skipped += 1
        else:
            self.points.append(geom)

handler = EducationPOIHandler()
handler.apply_file("poi_education.pbf")

# Wrap into GeoDataFrame (WGS84 lon/lat) and reproject to EPSG:27700 so the
# points share a CRS with the LSOA polygons used in the join below.
poi_gdf = gpd.GeoDataFrame(geometry=handler.points, crs="EPSG:4326")
poi_gdf = poi_gdf.to_crs(epsg=27700)

# === Step 2: Read LSOA boundaries ===
lsoa_gdf = gpd.read_file("../boundaries/Lower layer Super Output Areas (December 2021) Boundaries EW BFC (V10)/Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BFC_V10_8562115581115271145/LSOA_2021_EW_BFC_V10.shp")
lsoa_gdf = lsoa_gdf.to_crs(epsg=27700)

# === Step 3: Spatial join and count ===
# Left join keeps POIs that fall in no LSOA (their LSOA21CD is NaN);
# value_counts() ignores NaN by default, so those POIs are not counted.
joined = gpd.sjoin(poi_gdf, lsoa_gdf, how="left", predicate="within")
count_series = joined["LSOA21CD"].value_counts()

# === Step 4: Store counts for all LSOAs ===
# Iterate over the code column directly: iterrows() would build a full Series
# (geometry included) for every LSOA only to read a single field from it.
# LSOAs without any POI get an explicit 0.
results = {code: int(count_series.get(code, 0)) for code in lsoa_gdf["LSOA21CD"]}
counts = list(results.values())

# === Step 5: Output to JSON ===
with open("lsoa_education_poi_count.json", "w") as f:
    json.dump(results, f, indent=2)

# === Step 6: Print summary stats ===
# np.std defaults to the population standard deviation (ddof=0).
counts_array = np.array(counts)
print(f"Mean education POIs: {counts_array.mean():.2f}")
print(f"Standard deviation:  {counts_array.std():.2f}")
print(f"Max education POIs:  {counts_array.max()}")

Mean education POIs: 13.60
Standard deviation:  22.48
Max education POIs:  1302


In [2]:
import geopandas as gpd
import osmium
import shapely.wkb as wkblib
from shapely.geometry import Point
from tqdm import tqdm
import json
import numpy as np

# === Step 1: Read in food POI points ===
class FoodPOIHandler(osmium.SimpleHandler):
    """Collect one shapely point per node read from an OSM file."""

    def __init__(self):
        super().__init__()
        # Factory that encodes an OSM node location as WKB.
        self.wkb_factory = osmium.geom.WKBFactory()
        # Geometries of all nodes that were converted successfully.
        self.points = []
        # Number of nodes dropped because the conversion raised; exposed so the
        # amount of discarded data can be inspected instead of vanishing silently.
        self.skipped = 0

    def node(self, n):
        """Convert node ``n`` to a point and store it; count it as skipped on failure."""
        # NOTE(review): every node handed to this callback is kept, whether or
        # not it carries tags. If the extract also contains untagged nodes that
        # only act as way vertices they are counted as POIs too — confirm how
        # the .pbf file was produced.
        try:
            wkb = self.wkb_factory.create_point(n)
            geom = wkblib.loads(wkb, hex=True)
        except Exception:
            # Best-effort: a node whose geometry cannot be built is left out.
            self.skipped += 1
        else:
            self.points.append(geom)

handler = FoodPOIHandler()
handler.apply_file("poi_food.pbf")

# Wrap into GeoDataFrame (WGS84 lon/lat) and reproject to EPSG:27700 so the
# points share a CRS with the LSOA polygons used in the join below.
poi_gdf = gpd.GeoDataFrame(geometry=handler.points, crs="EPSG:4326")
poi_gdf = poi_gdf.to_crs(epsg=27700)

# === Step 2: Read LSOA boundaries ===
lsoa_gdf = gpd.read_file("../boundaries/Lower layer Super Output Areas (December 2021) Boundaries EW BFC (V10)/Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BFC_V10_8562115581115271145/LSOA_2021_EW_BFC_V10.shp")
lsoa_gdf = lsoa_gdf.to_crs(epsg=27700)

# === Step 3: Spatial join and count ===
# Left join keeps POIs that fall in no LSOA (their LSOA21CD is NaN);
# value_counts() ignores NaN by default, so those POIs are not counted.
joined = gpd.sjoin(poi_gdf, lsoa_gdf, how="left", predicate="within")
count_series = joined["LSOA21CD"].value_counts()

# === Step 4: Store counts for all LSOAs ===
# Iterate over the code column directly: iterrows() would build a full Series
# (geometry included) for every LSOA only to read a single field from it.
# LSOAs without any POI get an explicit 0.
results = {code: int(count_series.get(code, 0)) for code in lsoa_gdf["LSOA21CD"]}
counts = list(results.values())

# === Step 5: Output to JSON ===
with open("lsoa_food_poi_count.json", "w") as f:
    json.dump(results, f, indent=2)

# === Step 6: Print summary stats ===
# np.std defaults to the population standard deviation (ddof=0).
counts_array = np.array(counts)
print(f"Mean food POIs: {counts_array.mean():.2f}")
print(f"Standard deviation: {counts_array.std():.2f}")
print(f"Max food POIs: {counts_array.max()}")

Mean food POIs: 4.36
Standard deviation: 17.83
Max food POIs: 837


In [3]:
import geopandas as gpd
import osmium
import shapely.wkb as wkblib
from shapely.geometry import Point
from tqdm import tqdm
import json
import numpy as np

# === Step 1: Read in health POI points ===
class HealthPOIHandler(osmium.SimpleHandler):
    """Collect one shapely point per node read from an OSM file."""

    def __init__(self):
        super().__init__()
        # Factory that encodes an OSM node location as WKB.
        self.wkb_factory = osmium.geom.WKBFactory()
        # Geometries of all nodes that were converted successfully.
        self.points = []
        # Number of nodes dropped because the conversion raised; exposed so the
        # amount of discarded data can be inspected instead of vanishing silently.
        self.skipped = 0

    def node(self, n):
        """Convert node ``n`` to a point and store it; count it as skipped on failure."""
        # NOTE(review): every node handed to this callback is kept, whether or
        # not it carries tags. If the extract also contains untagged nodes that
        # only act as way vertices they are counted as POIs too — confirm how
        # the .pbf file was produced.
        try:
            wkb = self.wkb_factory.create_point(n)
            geom = wkblib.loads(wkb, hex=True)
        except Exception:
            # Best-effort: a node whose geometry cannot be built is left out.
            self.skipped += 1
        else:
            self.points.append(geom)

handler = HealthPOIHandler()
handler.apply_file("poi_health.pbf")

# Wrap into GeoDataFrame (WGS84 lon/lat) and reproject to EPSG:27700 so the
# points share a CRS with the LSOA polygons used in the join below.
poi_gdf = gpd.GeoDataFrame(geometry=handler.points, crs="EPSG:4326")
poi_gdf = poi_gdf.to_crs(epsg=27700)

# === Step 2: Read LSOA boundaries ===
lsoa_gdf = gpd.read_file("../boundaries/Lower layer Super Output Areas (December 2021) Boundaries EW BFC (V10)/Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BFC_V10_8562115581115271145/LSOA_2021_EW_BFC_V10.shp")
lsoa_gdf = lsoa_gdf.to_crs(epsg=27700)

# === Step 3: Spatial join and count ===
# Left join keeps POIs that fall in no LSOA (their LSOA21CD is NaN);
# value_counts() ignores NaN by default, so those POIs are not counted.
joined = gpd.sjoin(poi_gdf, lsoa_gdf, how="left", predicate="within")
count_series = joined["LSOA21CD"].value_counts()

# === Step 4: Store counts for all LSOAs ===
# Iterate over the code column directly: iterrows() would build a full Series
# (geometry included) for every LSOA only to read a single field from it.
# LSOAs without any POI get an explicit 0.
results = {code: int(count_series.get(code, 0)) for code in lsoa_gdf["LSOA21CD"]}
counts = list(results.values())

# === Step 5: Output to JSON ===
with open("lsoa_health_poi_count.json", "w") as f:
    json.dump(results, f, indent=2)

# === Step 6: Print summary stats ===
# np.std defaults to the population standard deviation (ddof=0).
counts_array = np.array(counts)
print(f"Mean health POIs: {counts_array.mean():.2f}")
print(f"Standard deviation: {counts_array.std():.2f}")
print(f"Max health POIs: {counts_array.max()}")

Mean health POIs: 1.67
Standard deviation: 7.19
Max health POIs: 278


In [4]:
import geopandas as gpd
import osmium
import shapely.wkb as wkblib
from shapely.geometry import Point
from tqdm import tqdm
import json
import numpy as np

# === Step 1: Read in retail POI points ===
class RetailPOIHandler(osmium.SimpleHandler):
    """Collect one shapely point per node read from an OSM file."""

    def __init__(self):
        super().__init__()
        # Factory that encodes an OSM node location as WKB.
        self.wkb_factory = osmium.geom.WKBFactory()
        # Geometries of all nodes that were converted successfully.
        self.points = []
        # Number of nodes dropped because the conversion raised; exposed so the
        # amount of discarded data can be inspected instead of vanishing silently.
        self.skipped = 0

    def node(self, n):
        """Convert node ``n`` to a point and store it; count it as skipped on failure."""
        # NOTE(review): every node handed to this callback is kept, whether or
        # not it carries tags. If the extract also contains untagged nodes that
        # only act as way vertices they are counted as POIs too — confirm how
        # the .pbf file was produced.
        try:
            wkb = self.wkb_factory.create_point(n)
            geom = wkblib.loads(wkb, hex=True)
        except Exception:
            # Best-effort: a node whose geometry cannot be built is left out.
            self.skipped += 1
        else:
            self.points.append(geom)

handler = RetailPOIHandler()
handler.apply_file("poi_retail.pbf")

# Wrap into GeoDataFrame (WGS84 lon/lat) and reproject to EPSG:27700 so the
# points share a CRS with the LSOA polygons used in the join below.
poi_gdf = gpd.GeoDataFrame(geometry=handler.points, crs="EPSG:4326")
poi_gdf = poi_gdf.to_crs(epsg=27700)

# === Step 2: Read LSOA boundaries ===
lsoa_gdf = gpd.read_file("../boundaries/Lower layer Super Output Areas (December 2021) Boundaries EW BFC (V10)/Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BFC_V10_8562115581115271145/LSOA_2021_EW_BFC_V10.shp")
lsoa_gdf = lsoa_gdf.to_crs(epsg=27700)

# === Step 3: Spatial join and count ===
# Left join keeps POIs that fall in no LSOA (their LSOA21CD is NaN);
# value_counts() ignores NaN by default, so those POIs are not counted.
joined = gpd.sjoin(poi_gdf, lsoa_gdf, how="left", predicate="within")
count_series = joined["LSOA21CD"].value_counts()

# === Step 4: Store counts for all LSOAs ===
# Iterate over the code column directly: iterrows() would build a full Series
# (geometry included) for every LSOA only to read a single field from it.
# LSOAs without any POI get an explicit 0.
results = {code: int(count_series.get(code, 0)) for code in lsoa_gdf["LSOA21CD"]}
counts = list(results.values())

# === Step 5: Output to JSON ===
with open("lsoa_retail_poi_count.json", "w") as f:
    json.dump(results, f, indent=2)

# === Step 6: Print summary stats ===
# np.std defaults to the population standard deviation (ddof=0).
counts_array = np.array(counts)
print(f"Mean retail POIs: {counts_array.mean():.2f}")
print(f"Standard deviation: {counts_array.std():.2f}")
print(f"Max retail POIs: {counts_array.max()}")

Mean retail POIs: 2.75
Standard deviation: 11.21
Max retail POIs: 364


In [6]:
import geopandas as gpd
import pandas as pd
import osmium
import shapely.wkb as wkblib
from shapely.geometry import Point
from tqdm import tqdm
import json
import numpy as np

# === Step 1: Define a reusable POI reader ===
class POIHandler(osmium.SimpleHandler):
    """Collect one shapely point per node read from an OSM file."""

    def __init__(self):
        super().__init__()
        # Factory that encodes an OSM node location as WKB.
        self.wkb_factory = osmium.geom.WKBFactory()
        # Geometries of all nodes that were converted successfully.
        self.points = []
        # Number of nodes dropped because the conversion raised; exposed so the
        # amount of discarded data can be inspected instead of vanishing silently.
        self.skipped = 0

    def node(self, n):
        """Convert node ``n`` to a point and store it; count it as skipped on failure."""
        # NOTE(review): every node handed to this callback is kept, whether or
        # not it carries tags. If the extract also contains untagged nodes that
        # only act as way vertices they are counted as POIs too — confirm how
        # the .pbf file was produced.
        try:
            wkb = self.wkb_factory.create_point(n)
            geom = wkblib.loads(wkb, hex=True)
        except Exception:
            # Best-effort: a node whose geometry cannot be built is left out.
            self.skipped += 1
        else:
            self.points.append(geom)

def load_pois_from_pbf(filepath):
    """Read the nodes of the OSM file at ``filepath`` into a GeoDataFrame.

    The points are collected with ``POIHandler``, declared as EPSG:4326 and
    returned reprojected to EPSG:27700.
    """
    reader = POIHandler()
    reader.apply_file(filepath)
    points_wgs84 = gpd.GeoDataFrame(geometry=reader.points, crs="EPSG:4326")
    points_projected = points_wgs84.to_crs(epsg=27700)
    return points_projected

# === Step 2: Load POIs from both transport PBF files ===
transport_amenity_gdf = load_pois_from_pbf("poi_transport_amenity.pbf")
transport_railway_gdf = load_pois_from_pbf("poi_transport_railway.pbf")

# Combine both (each frame is already in EPSG:27700).
# NOTE(review): the two geometry columns are simply concatenated, so a node
# present in both files would be counted twice — confirm the extracts are disjoint.
all_transport_pois = gpd.GeoDataFrame(
    geometry=pd.concat([transport_amenity_gdf.geometry, transport_railway_gdf.geometry], ignore_index=True),
    crs="EPSG:27700"
)

# === Step 3: Read LSOA boundaries ===
lsoa_gdf = gpd.read_file("../boundaries/Lower layer Super Output Areas (December 2021) Boundaries EW BFC (V10)/Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BFC_V10_8562115581115271145/LSOA_2021_EW_BFC_V10.shp")
lsoa_gdf = lsoa_gdf.to_crs(epsg=27700)

# === Step 4: Spatial join and count ===
# Left join keeps POIs that fall in no LSOA (their LSOA21CD is NaN);
# value_counts() ignores NaN by default, so those POIs are not counted.
joined = gpd.sjoin(all_transport_pois, lsoa_gdf, how="left", predicate="within")
count_series = joined["LSOA21CD"].value_counts()

# === Step 5: Store counts for all LSOAs ===
# Iterate over the code column directly: iterrows() would build a full Series
# (geometry included) for every LSOA only to read a single field from it.
# LSOAs without any POI get an explicit 0.
results = {code: int(count_series.get(code, 0)) for code in lsoa_gdf["LSOA21CD"]}
counts = list(results.values())

# === Step 6: Output to JSON ===
with open("lsoa_transport_poi_count.json", "w") as f:
    json.dump(results, f, indent=2)

# === Step 7: Print summary stats ===
# np.std defaults to the population standard deviation (ddof=0).
counts_array = np.array(counts)
print(f"Mean transport POIs: {counts_array.mean():.2f}")
print(f"Standard deviation:  {counts_array.std():.2f}")
print(f"Max transport POIs:  {counts_array.max()}")

Mean transport POIs: 53.45
Standard deviation:  98.90
Max transport POIs:  4386


In [1]:
import json
import numpy as np

# ---- Inputs / Output ----
EDU_JSON   = "lsoa_education_poi_count.json"
FOOD_JSON  = "lsoa_food_poi_count.json"
HEALTH_JSON= "lsoa_health_poi_count.json"
RETAIL_JSON= "lsoa_retail_poi_count.json"
TRANS_JSON = "lsoa_transport_poi_count.json"
OUTPUT_JSON= "lsoa21_poi_normalized.json"

# ---- Load ----
# Each file maps an LSOA code to a POI count.
with open(EDU_JSON, "r") as f:
    edu = json.load(f)
with open(FOOD_JSON, "r") as f:
    food = json.load(f)
with open(HEALTH_JSON, "r") as f:
    health = json.load(f)
with open(RETAIL_JSON, "r") as f:
    retail = json.load(f)
with open(TRANS_JSON, "r") as f:
    trans = json.load(f)

# Feature tables in the fixed output order: [education, food, health, retail, transport]
feature_tables = [edu, food, health, retail, trans]

# Common keys (assumes LSOA21 codes): only codes present in every input are kept.
keys = sorted(set.intersection(*(set(table) for table in feature_tables)))

# Report codes lost to the intersection so a mismatch between the inputs
# is visible instead of silently shrinking the output.
dropped = set().union(*feature_tables) - set(keys)
if dropped:
    print(f"Warning: {len(dropped)} LSOA codes are missing from at least one input and were dropped")

# Matrix in fixed order: [education, food, health, retail, transport]
# reshape keeps the (N, 5) layout even when there are no common keys.
mat = np.array(
    [[float(table[k]) for table in feature_tables] for k in keys],
    dtype=float,
).reshape(-1, len(feature_tables))  # (N, 5)

# # ---- Min–max normalize per feature ----
# mins = np.min(mat, axis=0)
# maxs = np.max(mat, axis=0)
# rng = np.where((maxs - mins) == 0.0, 1.0, (maxs - mins))
# norm = (mat - mins) / rng

# ---- Z-score normalize per feature ----
# Column-wise mean and population standard deviation (np.nanstd default ddof=0).
means = np.nanmean(mat, axis=0)
stds  = np.nanstd(mat, axis=0)
stds  = np.where(stds == 0.0, 1.0, stds)  # avoid divide-by-zero
norm = (mat - means) / stds

# ---- Write out ----
# { "LSOA21CD": [education, food, health, retail, transport], ... }
out = {k: norm[i].tolist() for i, k in enumerate(keys)}
with open(OUTPUT_JSON, "w") as f:
    json.dump(out, f, indent=2)

print(f"Saved {len(out)} LSOA21 rows → {OUTPUT_JSON}")

Saved 35672 LSOA21 rows → lsoa21_poi_normalized.json


In [1]:
### Save all 5 POI features WITHOUT normalization
import json
import numpy as np

# ---- Inputs / Output ----
EDU_JSON    = "lsoa_education_poi_count.json"
FOOD_JSON   = "lsoa_food_poi_count.json"
HEALTH_JSON = "lsoa_health_poi_count.json"
RETAIL_JSON = "lsoa_retail_poi_count.json"
TRANS_JSON  = "lsoa_transport_poi_count.json"
OUTPUT_JSON = "lsoa21_poi_raw.json"

# ---- Load ----
# Each file maps an LSOA code to a POI count.
with open(EDU_JSON, "r") as f:
    edu = json.load(f)
with open(FOOD_JSON, "r") as f:
    food = json.load(f)
with open(HEALTH_JSON, "r") as f:
    health = json.load(f)
with open(RETAIL_JSON, "r") as f:
    retail = json.load(f)
with open(TRANS_JSON, "r") as f:
    trans = json.load(f)

# Feature tables in the fixed output order: [education, food, health, retail, transport]
feature_tables = [edu, food, health, retail, trans]

# Common keys (assumes LSOA21 codes): only codes present in every input are kept.
keys = sorted(set.intersection(*(set(table) for table in feature_tables)))

# Report codes lost to the intersection so a mismatch between the inputs
# is visible instead of silently shrinking the output.
dropped = set().union(*feature_tables) - set(keys)
if dropped:
    print(f"Warning: {len(dropped)} LSOA codes are missing from at least one input and were dropped")

# ---- Build feature matrix (NO normalization) ----
# Fixed order: [education, food, health, retail, transport]
# reshape keeps the (N, 5) layout even when there are no common keys.
mat = np.array(
    [[float(table[k]) for table in feature_tables] for k in keys],
    dtype=float,
).reshape(-1, len(feature_tables))  # (N, 5)

# ---- Write out in EXACT same structure ----
# { "LSOA21CD": [education, food, health, retail, transport], ... }
out = {k: mat[i].tolist() for i, k in enumerate(keys)}

with open(OUTPUT_JSON, "w") as f:
    json.dump(out, f, indent=2)

print(f"Saved {len(out)} LSOA21 rows → {OUTPUT_JSON}")

Saved 35672 LSOA21 rows → lsoa21_poi_raw.json
