In [1]:
import geopandas as gpd
import osmium
import shapely.wkb as wkblib
from shapely.geometry import Polygon, MultiPolygon
from shapely.ops import unary_union
from tqdm import tqdm
import json
import numpy as np

# === Step 1: Read in commercial landuse from .pbf ===
class CommercialAreaHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.wkb_factory = osmium.geom.WKBFactory()
        self.commercial_polygons = []

    def area(self, a):
        try:
            tags = {t.k: t.v for t in a.tags}
            if tags.get("landuse") == "commercial":
                wkb = self.wkb_factory.create_multipolygon(a)
                geom = wkblib.loads(wkb, hex=True)
                if isinstance(geom, (Polygon, MultiPolygon)):
                    self.commercial_polygons.append(geom)
        except Exception:
            pass  # skip invalid geometries

handler = CommercialAreaHandler()
handler.apply_file("landuse_commercial.pbf")

# Wrap into GeoDataFrame and project
commercial_gdf = gpd.GeoDataFrame(geometry=handler.commercial_polygons, crs="EPSG:4326")
commercial_gdf = commercial_gdf.to_crs(epsg=27700)

# === Step 2: Union all commercial polygons to avoid overcounting ===
commercial_union = unary_union(commercial_gdf.geometry)

# === Step 3: Read LSOA boundaries ===
lsoa_gdf = gpd.read_file("../boundaries/Lower layer Super Output Areas (December 2021) Boundaries EW BFC (V10)/Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BFC_V10_8562115581115271145/LSOA_2021_EW_BFC_V10.shp")
lsoa_gdf = lsoa_gdf.to_crs(epsg=27700)

# === Step 4: Calculate commercial area per LSOA ===
results = {}
areas = []

for _, row in tqdm(lsoa_gdf.iterrows(), total=len(lsoa_gdf)):
    lsoa_code = row["LSOA21CD"]
    lsoa_geom = row.geometry

    inter = lsoa_geom.intersection(commercial_union)
    area = inter.area if not inter.is_empty else 0.0

    results[lsoa_code] = area
    areas.append(area)

# === Step 5: Output to JSON ===
with open("lsoa_commercial_area.json", "w") as f:
    json.dump(results, f, indent=2)

# === Step 6: Print summary stats ===
areas_array = np.array(areas)
print(f"Mean commercial area: {areas_array.mean():,.2f} m²")
print(f"Standard deviation:   {areas_array.std():,.2f} m²")
print(f"Max commercial area:  {areas_array.max():,.2f} m²")

100%|██████████| 35672/35672 [11:13<00:00, 52.97it/s] 

Mean commercial area: 7,950.66 m²
Standard deviation:   40,941.96 m²
Max commercial area:  2,300,406.47 m²





In [1]:
import geopandas as gpd
import osmium
import shapely.wkb as wkblib
from shapely.geometry import Polygon, MultiPolygon
from shapely.ops import unary_union
from tqdm import tqdm
import json
import numpy as np

# === Step 1: Read in industrial landuse from .pbf ===
class IndustrialAreaHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.wkb_factory = osmium.geom.WKBFactory()
        self.industrial_polygons = []

    def area(self, a):
        try:
            tags = {t.k: t.v for t in a.tags}
            if tags.get("landuse") == "industrial":
                wkb = self.wkb_factory.create_multipolygon(a)
                geom = wkblib.loads(wkb, hex=True)
                if isinstance(geom, (Polygon, MultiPolygon)):
                    self.industrial_polygons.append(geom)
        except Exception:
            pass  # skip invalid geometries

handler = IndustrialAreaHandler()
handler.apply_file("landuse_industrial.pbf")

# Wrap into GeoDataFrame and project
industrial_gdf = gpd.GeoDataFrame(geometry=handler.industrial_polygons, crs="EPSG:4326")
industrial_gdf = industrial_gdf.to_crs(epsg=27700)

# === Step 2: Union all industrial polygons to avoid overcounting ===
industrial_union = unary_union(industrial_gdf.geometry)

# === Step 3: Read LSOA boundaries ===
lsoa_gdf = gpd.read_file("../boundaries/Lower layer Super Output Areas (December 2021) Boundaries EW BFC (V10)/Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BFC_V10_8562115581115271145/LSOA_2021_EW_BFC_V10.shp")
lsoa_gdf = lsoa_gdf.to_crs(epsg=27700)

# === Step 4: Calculate industrial area per LSOA ===
results = {}
areas = []

for _, row in tqdm(lsoa_gdf.iterrows(), total=len(lsoa_gdf)):
    lsoa_code = row["LSOA21CD"]
    lsoa_geom = row.geometry

    inter = lsoa_geom.intersection(industrial_union)
    area = inter.area if not inter.is_empty else 0.0

    results[lsoa_code] = area
    areas.append(area)

# === Step 5: Output to JSON ===
with open("lsoa_industrial_area.json", "w") as f:
    json.dump(results, f, indent=2)

# === Step 6: Print summary stats ===
areas_array = np.array(areas)
print(f"Mean industrial area: {areas_array.mean():,.2f} m²")
print(f"Standard deviation:   {areas_array.std():,.2f} m²")
print(f"Max industrial area:  {areas_array.max():,.2f} m²")

100%|██████████| 35672/35672 [21:41<00:00, 27.41it/s]


Mean industrial area: 35,094.83 m²
Standard deviation:   169,530.95 m²
Max industrial area:  7,814,080.76 m²


In [2]:
import geopandas as gpd
import osmium
import shapely.wkb as wkblib
from shapely.geometry import Polygon, MultiPolygon
from shapely.ops import unary_union
from tqdm import tqdm
import json
import numpy as np

# === Step 1: Read in residential landuse from .pbf ===
class ResidentialAreaHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.wkb_factory = osmium.geom.WKBFactory()
        self.residential_polygons = []

    def area(self, a):
        try:
            tags = {t.k: t.v for t in a.tags}
            if tags.get("landuse") == "residential":
                wkb = self.wkb_factory.create_multipolygon(a)
                geom = wkblib.loads(wkb, hex=True)
                if isinstance(geom, (Polygon, MultiPolygon)):
                    self.residential_polygons.append(geom)
        except Exception:
            pass  # skip invalid geometries

handler = ResidentialAreaHandler()
handler.apply_file("landuse_residential.pbf")

# Wrap into GeoDataFrame and project
residential_gdf = gpd.GeoDataFrame(geometry=handler.residential_polygons, crs="EPSG:4326")
residential_gdf = residential_gdf.to_crs(epsg=27700)

# === Step 2: Union all residential polygons to avoid overcounting ===
residential_union = unary_union(residential_gdf.geometry)

# === Step 3: Read LSOA boundaries ===
lsoa_gdf = gpd.read_file("../boundaries/Lower layer Super Output Areas (December 2021) Boundaries EW BFC (V10)/Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BFC_V10_8562115581115271145/LSOA_2021_EW_BFC_V10.shp")
lsoa_gdf = lsoa_gdf.to_crs(epsg=27700)

# === Step 4: Calculate residential area per LSOA ===
results = {}
areas = []

for _, row in tqdm(lsoa_gdf.iterrows(), total=len(lsoa_gdf)):
    lsoa_code = row["LSOA21CD"]
    lsoa_geom = row.geometry

    inter = lsoa_geom.intersection(residential_union)
    area = inter.area if not inter.is_empty else 0.0

    results[lsoa_code] = area
    areas.append(area)

# === Step 5: Output to JSON ===
with open("lsoa_residential_area.json", "w") as f:
    json.dump(results, f, indent=2)

# === Step 6: Print summary stats ===
areas_array = np.array(areas)
print(f"Mean residential area: {areas_array.mean():,.2f} m²")
print(f"Standard deviation:     {areas_array.std():,.2f} m²")
print(f"Max residential area:   {areas_array.max():,.2f} m²")

100%|██████████| 35672/35672 [2:45:20<00:00,  3.60it/s]  

Mean residential area: 246,665.60 m²
Standard deviation:     217,657.97 m²
Max residential area:   3,032,670.82 m²





In [3]:
import geopandas as gpd
import osmium
import shapely.wkb as wkblib
from shapely.geometry import Polygon, MultiPolygon
from shapely.ops import unary_union
from tqdm import tqdm
import json
import numpy as np

# === Step 1: Read in retail landuse from .pbf ===
class RetailAreaHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.wkb_factory = osmium.geom.WKBFactory()
        self.retail_polygons = []

    def area(self, a):
        try:
            tags = {t.k: t.v for t in a.tags}
            if tags.get("landuse") == "retail":
                wkb = self.wkb_factory.create_multipolygon(a)
                geom = wkblib.loads(wkb, hex=True)
                if isinstance(geom, (Polygon, MultiPolygon)):
                    self.retail_polygons.append(geom)
        except Exception:
            pass  # skip invalid geometries

handler = RetailAreaHandler()
handler.apply_file("landuse_retail.pbf")

# Wrap into GeoDataFrame and project
retail_gdf = gpd.GeoDataFrame(geometry=handler.retail_polygons, crs="EPSG:4326")
retail_gdf = retail_gdf.to_crs(epsg=27700)

# === Step 2: Union all retail polygons to avoid overcounting ===
retail_union = unary_union(retail_gdf.geometry)

# === Step 3: Read LSOA boundaries ===
lsoa_gdf = gpd.read_file("../boundaries/Lower layer Super Output Areas (December 2021) Boundaries EW BFC (V10)/Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BFC_V10_8562115581115271145/LSOA_2021_EW_BFC_V10.shp")
lsoa_gdf = lsoa_gdf.to_crs(epsg=27700)

# === Step 4: Calculate retail area per LSOA ===
results = {}
areas = []

for _, row in tqdm(lsoa_gdf.iterrows(), total=len(lsoa_gdf)):
    lsoa_code = row["LSOA21CD"]
    lsoa_geom = row.geometry

    inter = lsoa_geom.intersection(retail_union)
    area = inter.area if not inter.is_empty else 0.0

    results[lsoa_code] = area
    areas.append(area)

# === Step 5: Output to JSON ===
with open("lsoa_retail_area.json", "w") as f:
    json.dump(results, f, indent=2)

# === Step 6: Print summary stats ===
areas_array = np.array(areas)
print(f"Mean retail area: {areas_array.mean():,.2f} m²")
print(f"Standard deviation:   {areas_array.std():,.2f} m²")
print(f"Max retail area:      {areas_array.max():,.2f} m²")

100%|██████████| 35672/35672 [19:11<00:00, 30.97it/s]


Mean retail area: 7,167.88 m²
Standard deviation:   24,000.09 m²
Max retail area:      1,283,533.83 m²


In [1]:
### Minmax normalize all 4 land use features
import json
import numpy as np

# ---- Inputs / Output ----
COMMERCIAL_JSON  = "lsoa_commercial_area.json"
INDUSTRIAL_JSON  = "lsoa_industrial_area.json"
RESIDENTIAL_JSON = "lsoa_residential_area.json"
RETAIL_JSON      = "lsoa_retail_area.json"
OUTPUT_JSON      = "lsoa21_land_use_normalized.json"

# ---- Load ----
with open(COMMERCIAL_JSON, "r") as f:
    commercial = json.load(f)
with open(INDUSTRIAL_JSON, "r") as f:
    industrial = json.load(f)
with open(RESIDENTIAL_JSON, "r") as f:
    residential = json.load(f)
with open(RETAIL_JSON, "r") as f:
    retail = json.load(f)

# If all four files cover the same LSOA set (as you indicated), use intersection
keys = sorted(set(commercial) & set(industrial) & set(residential) & set(retail))

# Build feature matrix in fixed order: [commercial, industrial, residential, retail]
mat = np.array([
    [float(commercial[k]), float(industrial[k]), float(residential[k]), float(retail[k])]
    for k in keys
], dtype=float)  # shape: (N, 4)

# # ---- Min–max normalize each feature column ----
# mins = np.min(mat, axis=0)
# maxs = np.max(mat, axis=0)
# rng = np.where((maxs - mins) == 0.0, 1.0, (maxs - mins))
# norm = (mat - mins) / rng  # same shape (N, 4)

# ---- Z-score normalize each feature column ----
means = np.nanmean(mat, axis=0)
stds  = np.nanstd(mat, axis=0)
stds  = np.where(stds == 0.0, 1.0, stds)  # avoid divide-by-zero
norm = (mat - means) / stds  # same shape (N, 4)

# ---- Write out as { "LSOA21CD": [comm_norm, ind_norm, res_norm, retail_norm], ... } ----
out = {k: norm[i].tolist() for i, k in enumerate(keys)}

with open(OUTPUT_JSON, "w") as f:
    json.dump(out, f, indent=2)

print(f"Saved {len(out)} LSOA21 rows → {OUTPUT_JSON}")

Saved 35672 LSOA21 rows → lsoa21_land_use_normalized.json


In [1]:
### Save all 4 land use features WITHOUT normalization
import json
import numpy as np

# ---- Inputs / Output ----
COMMERCIAL_JSON  = "lsoa_commercial_area.json"
INDUSTRIAL_JSON  = "lsoa_industrial_area.json"
RESIDENTIAL_JSON = "lsoa_residential_area.json"
RETAIL_JSON      = "lsoa_retail_area.json"
OUTPUT_JSON      = "lsoa21_land_use_raw.json"

# ---- Load ----
with open(COMMERCIAL_JSON, "r") as f:
    commercial = json.load(f)
with open(INDUSTRIAL_JSON, "r") as f:
    industrial = json.load(f)
with open(RESIDENTIAL_JSON, "r") as f:
    residential = json.load(f)
with open(RETAIL_JSON, "r") as f:
    retail = json.load(f)

# Use intersection of keys to guarantee alignment
keys = sorted(set(commercial) & set(industrial) & set(residential) & set(retail))

# ---- Build feature matrix (no normalization) ----
# Order: [commercial, industrial, residential, retail]
mat = np.array([
    [
        float(commercial[k]),
        float(industrial[k]),
        float(residential[k]),
        float(retail[k]),
    ]
    for k in keys
], dtype=float)  # shape: (N, 4)

# ---- Write out in EXACT same structure as before ----
# { "LSOA21CD": [commercial, industrial, residential, retail], ... }
out = {k: mat[i].tolist() for i, k in enumerate(keys)}

with open(OUTPUT_JSON, "w") as f:
    json.dump(out, f, indent=2)

print(f"Saved {len(out)} LSOA21 rows → {OUTPUT_JSON}")

Saved 35672 LSOA21 rows → lsoa21_land_use_raw.json
