In [24]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

# ===================== INPUT =====================
# JSON file with per-edge metadata; read by the LOAD section.
INPUT_JSON = "GT_AADT_8years_meta.json"

# ===================== COLORS =====================
# One colour per highway type:
#   blue -> Motorway, light green -> Motorway link,
#   orange -> Trunk,  light orange -> Trunk link
C_BLUE, C_LGREEN = "#1f78b4", "#b2df8a"
C_ORANGE, C_LORANGE = "#ff7f00", "#fdb462"

# Plotting (stacking) order of the highway types; both lookup tables are keyed in this order.
TYPE_ORDER = ["motorway", "motorway_link", "trunk", "trunk_link"]
TYPE_LABEL = dict(zip(TYPE_ORDER, ["Motorway", "Motorway link", "Trunk", "Trunk link"]))
TYPE_COLOR = dict(zip(TYPE_ORDER, [C_BLUE, C_LGREEN, C_ORANGE, C_LORANGE]))

# ===================== LOAD =====================
# Read the per-edge metadata and flatten it into one row per edge.
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    meta = json.load(f)

rows = []
for edge_id, rec in meta.items():
    gt, sd = float(rec["gt"]), float(rec["sd_daily"])
    years_used = rec.get("years_used", [])
    n_days_total = int(rec.get("n_days_total", 0))

    # Sensors matched to the edge: largest per-year count, or 0 when none is recorded.
    n_sensors_py = rec.get("n_sensors_per_year", {})
    if len(n_sensors_py):
        n_sensors = int(max(n_sensors_py.values()))
    else:
        n_sensors = 0

    rows.append(dict(
        edge=edge_id,
        highway_type=rec["highway_type"],
        gt=gt,
        sd_daily=sd,
        # coefficient of variation; NaN unless gt is strictly positive
        cv_ratio=(sd / gt) if gt > 0 else np.nan,
        n_sensors=n_sensors,
        n_years=len(years_used),
        n_days_total=n_days_total,
    ))

# Keep only the highway types that have a label/colour defined.
df = pd.DataFrame(rows)
df = df.loc[df["highway_type"].isin(list(TYPE_LABEL))].copy()

# ===================== TABLE: summary statistics of gt by highway type =====================
def stats_block(x: pd.Series) -> pd.Series:
    """Summarise *x* as a Series indexed Mean, Std, Min, Median, Max (in that order)."""
    spec = (
        ("Mean", "mean"),
        ("Std", "std"),
        ("Min", "min"),
        ("Median", "median"),
        ("Max", "max"),
    )
    # Each statistic is the result of calling the like-named Series method with no arguments.
    return pd.Series({label: getattr(x, method)() for label, method in spec})

# Per-type statistics, with rows arranged in the order used for the exported table.
per_type_stats = df.groupby("highway_type")["gt"].apply(stats_block).unstack()
table_by_type = per_type_stats.loc[["motorway", "trunk", "motorway_link", "trunk_link"]]

# The same statistics over every edge, as a one-row frame.
all_row = stats_block(df["gt"]).to_frame(name="All types").T

out_table = (
    pd.concat([all_row, table_by_type], axis=0)
    .reset_index()
    .rename(columns={"index": "Highway type"})
)
# Rows after the first hold raw highway-type keys; swap them for display labels.
out_table.iloc[1:, 0] = out_table.iloc[1:, 0].map(TYPE_LABEL)

stat_cols = ["Mean", "Std", "Min", "Median", "Max"]
out_table[stat_cols] = out_table[stat_cols].round(1)

out_table.to_csv("table_gt_by_highway_type.csv", index=False)

# ===================== HELPERS =====================
def add_total_labels(xs, totals, fontsize=8):
    """Write each strictly positive total, as an integer, at (x, total) on the current axes."""
    text_style = dict(ha="center", va="bottom", fontsize=fontsize)
    for x, tot in zip(xs, totals):
        # `not (tot > 0)` rather than `tot <= 0` so NaN totals are skipped as well
        if not (tot > 0):
            continue
        plt.text(x, tot, f"{int(tot)}", **text_style)

def stacked_bar_from_counts(x_vals, counts_by_type, xlabel, ylabel, out_pdf, xtick_rotation=0):
    """Draw a stacked bar chart (one layer per entry of TYPE_ORDER) and save it to *out_pdf*.

    counts_by_type maps each highway type to a mapping from x value to count;
    x values missing from a mapping count as 0.
    """
    stack_top = np.zeros(len(x_vals), dtype=float)
    plt.figure(figsize=(6, 4))
    for hw_type in TYPE_ORDER:
        lookup = counts_by_type[hw_type]
        layer = np.array([lookup.get(v, 0) for v in x_vals], dtype=float)
        plt.bar(x_vals, layer, bottom=stack_top, color=TYPE_COLOR[hw_type], label=TYPE_LABEL[hw_type])
        stack_top = stack_top + layer

    # total labels and legend are currently switched off:
    # add_total_labels(x_vals, stack_top, fontsize=8)
    # plt.legend()

    plt.xticks(x_vals, rotation=xtick_rotation)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(out_pdf)
    plt.close()

# ===================== FIG 1: Stacked histogram of Ground truth (AADT) =====================
from matplotlib.ticker import FuncFormatter

# 50 equal-width bins over all edges; every highway type is counted on these same edges
gt_all = df["gt"].to_numpy()
bins_gt = np.histogram_bin_edges(gt_all, bins=50)

# the histogram is drawn as stacked bars (one bar per bin) so that totals could be labelled
bin_edges = bins_gt
bin_widths = np.diff(bin_edges)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

counts_stack = {
    t: np.histogram(df.loc[df["highway_type"] == t, "gt"].to_numpy(), bins=bin_edges)[0]
    for t in TYPE_ORDER
}

plt.figure(figsize=(5, 4))
bottom = np.zeros(len(bin_centers), dtype=float)
for t in TYPE_ORDER:
    counts = counts_stack[t].astype(float)
    plt.bar(
        bin_centers,
        counts,
        width=bin_widths,
        bottom=bottom,
        align="center",
        color=TYPE_COLOR[t],
        label=TYPE_LABEL[t],
    )
    bottom = bottom + counts

# per-bin total labels are currently switched off:
# for x, tot in zip(bin_centers, bottom):
#     if tot > 0:
#         plt.text(x, tot, f"{int(tot)}", ha="center", va="bottom", fontsize=7)

# x ticks: thousands with a "k" suffix from 1000 upwards, plain integers below
fmt_k = FuncFormatter(lambda x, pos: f"{x/1000:.0f}k" if x >= 1000 else f"{int(x)}")
plt.gca().xaxis.set_major_formatter(fmt_k)
plt.xlabel("Ground truth AADT")
plt.ylabel("Number of target edges")
plt.legend()
plt.tight_layout()
plt.savefig("fig1_stacked_hist_ground_truth_aadt.pdf")
plt.close()

# ===================== FIG 2: Stacked bar of sensors per edge (aggregate >=12 into '12+') =====================
# Edges with SENSOR_CAP or more matched sensors are merged into a single overflow bar.
# The clip limit, the x range and the last tick label are all derived from SENSOR_CAP,
# so changing the cap cannot leave them out of sync.
SENSOR_CAP = 12
df["n_sensors_capped"] = df["n_sensors"].clip(upper=SENSOR_CAP)
# x positions run from the smallest observed count up to and including the cap
sensor_vals = list(range(int(df["n_sensors_capped"].min()), SENSOR_CAP + 1))

counts_by_type = {t: Counter(df.loc[df["highway_type"] == t, "n_sensors_capped"].tolist()) for t in TYPE_ORDER}

bottom = np.zeros(len(sensor_vals), dtype=float)
plt.figure(figsize=(6, 4))
for t in TYPE_ORDER:
    counts = np.array([counts_by_type[t].get(v, 0) for v in sensor_vals], dtype=float)
    plt.bar(sensor_vals, counts, bottom=bottom, color=TYPE_COLOR[t], label=TYPE_LABEL[t])
    bottom += counts

# add_total_labels(sensor_vals, bottom, fontsize=8)

# the last bar holds every edge at or above the cap, so it is labelled "<cap>+"
xticklabels = [str(v) for v in sensor_vals]
xticklabels[-1] = f"{SENSOR_CAP}+"
plt.xticks(sensor_vals, xticklabels)
plt.xlabel("Number of sensors matched to edge")
plt.ylabel("Number of target edges")
# plt.legend()
plt.tight_layout()
plt.savefig("fig2_stacked_bar_number_of_sensors.pdf")
plt.close()

# ===================== FIG 3: Stacked bar of valid years =====================
# One bar per integer between the smallest and largest observed number of valid years.
n_years_lo, n_years_hi = int(df["n_years"].min()), int(df["n_years"].max())
year_vals = list(range(n_years_lo, n_years_hi + 1))

counts_by_type = {}
for t in TYPE_ORDER:
    counts_by_type[t] = Counter(df.loc[df["highway_type"] == t, "n_years"].tolist())

stacked_bar_from_counts(
    year_vals,
    counts_by_type,
    "Number of valid years",
    "Number of target edges",
    "fig3_stacked_bar_number_of_valid_years.pdf",
)

# ===================== FIG 4: Stacked bar for total valid days, bins 0..10000 + '10000+' =====================
# - bins: 0..DAYS_CAP in steps of BIN_WIDTH (DAYS_CAP is expected to be a multiple of BIN_WIDTH)
# - everything at or above DAYS_CAP is merged into one overflow bin labelled "<DAYS_CAP>+"
# - stacked bar by highway type (total-height annotations are currently disabled)
BIN_WIDTH = 1000
DAYS_CAP = 10000
edges = list(range(0, DAYS_CAP + BIN_WIDTH, BIN_WIDTH)) + [np.inf]  # last interval is the overflow bin
labels = [f"{lo}–{hi}" for lo, hi in zip(edges[:-2], edges[1:-1])] + [f"{DAYS_CAP}+"]

def bin_days_10000(x):
    """Return the index (0..len(labels)-1) of the half-open interval [lo, hi) of `edges` containing *x*.

    Values matching no interval (negative numbers, NaN) fall through to the last
    (overflow) index.
    """
    for i, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
        if lo <= x < hi:
            return i
    return len(labels) - 1

df["days_bin"] = df["n_days_total"].apply(bin_days_10000)

counts_by_type = {t: Counter(df.loc[df["highway_type"] == t, "days_bin"].tolist()) for t in TYPE_ORDER}

x_vals = list(range(len(labels)))
bottom = np.zeros(len(x_vals), dtype=float)

plt.figure(figsize=(6, 4))
for t in TYPE_ORDER:
    counts = np.array([counts_by_type[t].get(i, 0) for i in x_vals], dtype=float)
    plt.bar(x_vals, counts, bottom=bottom, color=TYPE_COLOR[t], label=TYPE_LABEL[t])
    bottom += counts

# annotate totals
# for i, tot in enumerate(bottom):
#     if tot > 0:
#         plt.text(i, tot, f"{int(tot)}", ha="center", va="bottom", fontsize=8)

plt.xticks(x_vals, labels, rotation=45, ha="right")
plt.xlabel("Total number of valid days across all years")
plt.ylabel("Number of target edges")
# plt.legend()
plt.tight_layout()
plt.savefig("fig4_stacked_bar_total_valid_days.pdf")
plt.close()

# ===================== FIG 5: Scatter Ground truth AADT vs Daily standard deviation =====================
from matplotlib.ticker import FuncFormatter

# tick text: thousands with a "k" suffix from 1000 upwards, rounded plain numbers below
fmt_k = FuncFormatter(lambda x, pos: f"{x/1000:.0f}k" if x >= 1000 else f"{x:.0f}")

plt.figure(figsize=(4, 4))
plt.scatter(df["gt"], df["sd_daily"], s=4, alpha=0.25)

ax = plt.gca()
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter(fmt_k)
plt.xlabel("Ground truth traffic volume (AADT)")
plt.ylabel("Standard deviation of daily total")

plt.tight_layout()
plt.savefig("fig5_scatter_ground_truth_vs_daily_sd.pdf")
plt.close()

# ===================== FIG 6: Stacked bar plot for cv ratio, bins 0..0.5 + '0.50+' =====================
# - bins: 0..CV_CAP in steps of CV_BIN_WIDTH
# - everything at or above CV_CAP is merged into one overflow bin labelled "<CV_CAP>+"
# - show ALL x tick labels (every bar)
CV_BIN_WIDTH = 0.05
CV_CAP = 0.5
# Build the edges as integer multiples of the bin width rather than with np.arange:
# with a non-integer step the length of np.arange's result is not reliable (see the
# numpy documentation), which could silently add or drop a bin.
_n_cv_bins = int(round(CV_CAP / CV_BIN_WIDTH))
cv_edges = [i * CV_BIN_WIDTH for i in range(_n_cv_bins + 1)] + [np.inf]  # overflow
cv_labels = [f"{lo:.2f}–{hi:.2f}" for lo, hi in zip(cv_edges[:-2], cv_edges[1:-1])] + [f"{CV_CAP:.2f}+"]

def bin_cv_050(x):
    """Return the index of the half-open interval [lo, hi) of `cv_edges` containing *x*.

    Values matching no interval (negative numbers, NaN) fall through to the last
    (overflow) index.
    """
    for i, (lo, hi) in enumerate(zip(cv_edges[:-1], cv_edges[1:])):
        if lo <= x < hi:
            return i
    return len(cv_labels) - 1

# rows without a defined cv_ratio are left out of this figure
df_cv = df.dropna(subset=["cv_ratio"]).copy()
df_cv["cv_bin"] = df_cv["cv_ratio"].apply(bin_cv_050)

counts_by_type = {t: Counter(df_cv.loc[df_cv["highway_type"] == t, "cv_bin"].tolist()) for t in TYPE_ORDER}

x_vals = list(range(len(cv_labels)))
bottom = np.zeros(len(x_vals), dtype=float)

plt.figure(figsize=(5, 4))
for t in TYPE_ORDER:
    counts = np.array([counts_by_type[t].get(i, 0) for i in x_vals], dtype=float)
    plt.bar(x_vals, counts, bottom=bottom, color=TYPE_COLOR[t], label=TYPE_LABEL[t])
    bottom += counts

# annotate totals
# for i, tot in enumerate(bottom):
#     if tot > 0:
#         plt.text(i, tot, f"{int(tot)}", ha="center", va="bottom", fontsize=7)

# show ALL tick labels
plt.xticks(x_vals, cv_labels, rotation=45, ha="right")

plt.xlabel("Coefficient of variation")
plt.ylabel("Number of target edges")
# plt.legend()
plt.tight_layout()
plt.savefig("fig6_stacked_bar_cv_ratio.pdf")
plt.close()

In [5]:
import numpy as np

# Expects df to carry a "cv_ratio" column (sd_daily / gt); NaN entries are ignored by nanmean.
cv_values = df["cv_ratio"].to_numpy()
mean_cv = float(np.nanmean(cv_values))
print(f"Mean coefficient of variation: {mean_cv:.4f}")

Mean coefficient of variation: 0.2375


In [1]:
#!/usr/bin/env python3
import json
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import rasterio
from rasterio.windows import from_bounds
from rasterio.enums import Resampling
from pathlib import Path
from matplotlib.collections import LineCollection
from matplotlib.colors import LinearSegmentedColormap

# ========================= INPUTS =========================
BASEMAP_TIF = "../basemap/GBOverview.tif"  # raster basemap; the reader requires EPSG:27700
EDGES_GEOJSON = "../highway_network/uk_driving_edges_simplified.geojson"
GT_JSON = "GT_AADT_8years.json"

# Plot window in EPSG:27700 coordinates, ordered (xmin, ymin, xmax, ymax)
ENGLAND_BBOX_27700 = (0, 0, 700000, 700000)

# Output files; the directory is created if it does not exist yet
OUT_DIR = Path("descriptives")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_NORTH = OUT_DIR.joinpath("gt_edges_northbound.pdf")
OUT_SOUTH = OUT_DIR.joinpath("gt_edges_southbound.pdf")
OUT_LEGEND = OUT_DIR.joinpath("gt_edges_colorbar_legend.pdf")

# Style
FIGSIZE, LINEWIDTH = (10, 10), 4
BASEMAP_MAX_PIX = 3000
ALPHA, TILES_ALPHA = 0.9, 0.5

# CRS assigned to the edges when the GeoJSON carries none (lon/lat)
EDGES_ASSUME_CRS_IF_MISSING = "EPSG:4326"

# Colormap: only the [low, high] fraction of the base map is used,
# which drops the very dark blue/red ends of turbo
BASE_CMAP = "turbo"
CMAP_TRIM = (0.12, 0.88)

# Quantiles of the values used as colour-scale limits; (0, 1) means the full min..max range
VCLIP_Q = (0.00, 1)


# ========================= HELPERS =========================
def read_basemap_crop_27700(tif_path: str, bbox_27700, max_pix: int):
    """Read the part of a raster inside a bounding box, downsampled to at most *max_pix* per side.

    bbox_27700 is (xmin, ymin, xmax, ymax). Raises ValueError when the raster has no CRS
    or a CRS other than EPSG:27700.

    Returns (img, extent): img has shape (rows, cols, bands) and extent is
    (left, right, bottom, top) of the window that was actually read.
    """
    xmin, ymin, xmax, ymax = bbox_27700
    with rasterio.open(tif_path) as src:
        epsg = None if src.crs is None else src.crs.to_epsg()
        if epsg != 27700:
            raise ValueError("Basemap must have CRS EPSG:27700.")

        # Requested window, snapped to whole pixels and limited to the raster itself.
        requested = from_bounds(xmin, ymin, xmax, ymax, transform=src.transform)
        requested = requested.round_offsets().round_lengths()
        whole_raster = rasterio.windows.Window(0, 0, src.width, src.height)
        win = requested.intersection(whole_raster)

        # Shrink factor >= 1 so neither output dimension exceeds max_pix (never upsample).
        shrink = max(win.width / max_pix, win.height / max_pix, 1.0)
        out_w = int(max(1, win.width / shrink))
        out_h = int(max(1, win.height / shrink))

        bands = src.read(
            window=win,
            out_shape=(src.count, out_h, out_w),
            resampling=Resampling.bilinear,
        )
        left, bottom, right, top = rasterio.windows.bounds(win, src.transform)

    # (bands, rows, cols) -> (rows, cols, bands)
    return np.transpose(bands, (1, 2, 0)), (left, right, bottom, top)

def ensure_27700(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Return *gdf* in EPSG:27700.

    A frame without a CRS is first tagged with EDGES_ASSUME_CRS_IF_MISSING; a frame
    already in EPSG:27700 is returned without reprojection.
    """
    located = gdf if gdf.crs is not None else gdf.set_crs(EDGES_ASSUME_CRS_IF_MISSING)
    if located.crs.to_epsg() == 27700:
        return located
    return located.to_crs(27700)

def _geom_to_segments(geom):
    """Return the coordinate arrays (float) of a line geometry, one array per part.

    A LineString gives one array; a MultiLineString gives one array per non-empty part.
    None, empty geometries and every other geometry type give an empty list.
    """
    if geom is None or geom.is_empty:
        return []

    kind = geom.geom_type
    if kind == "LineString":
        parts = [geom]
    elif kind == "MultiLineString":
        parts = [part for part in geom.geoms if not (part is None or part.is_empty)]
    else:
        return []
    return [np.asarray(part.coords, dtype=float) for part in parts]

def add_lines_colored(ax, gdf, values, cmap, vmin, vmax, linewidth, alpha):
    """Add the line geometries of *gdf* to *ax* as one LineCollection coloured by *values*.

    Each value is scaled linearly between vmin and vmax and clipped to [0, 1]; when vmax
    is not greater than vmin every line gets the colormap midpoint (0.5). Geometries with
    a non-finite value are skipped, as are parts with fewer than two points.

    Returns the LineCollection, or None when nothing was drawn.
    """
    colormap = plt.get_cmap(cmap)
    has_range = vmax > vmin

    segments, rgba = [], []
    for geom, val in zip(gdf.geometry.values, values):
        if not np.isfinite(val):
            continue
        frac = (val - vmin) / (vmax - vmin) if has_range else 0.5
        red, green, blue, _ = colormap(float(np.clip(frac, 0.0, 1.0)))
        drawable = [s for s in _geom_to_segments(geom) if s.shape[0] >= 2]
        segments.extend(drawable)
        rgba.extend([(red, green, blue, alpha)] * len(drawable))

    if not segments:
        return None

    collection = LineCollection(
        segments,
        colors=rgba,
        linewidths=linewidth,
        capstyle="round",
        joinstyle="round",
    )
    ax.add_collection(collection)
    return collection

def parse_edge_id(eid: str):
    """Split a "u_v_key" identifier into a tuple of three ints.

    Only the first three underscore-separated fields are used. Raises ValueError when
    fewer than three fields are present or a field is not an integer.
    """
    fields = str(eid).split("_")
    if len(fields) >= 3:
        u, v, key = (int(tok) for tok in fields[:3])
        return u, v, key
    raise ValueError(f"Bad edge_id (expected u_v_key): {eid}")

def infer_direction(geom):
    """Classify a line as "north" or "south" from the y coordinates of its first part.

    "north" when the first point lies strictly south of the last point (y_start < y_end),
    otherwise "south". Returns None when the geometry yields no segments.
    """
    parts = _geom_to_segments(geom)
    if not parts:
        return None
    first_part = parts[0]
    y_start, y_end = float(first_part[0, 1]), float(first_part[-1, 1])
    if y_start < y_end:
        return "north"
    return "south"

def trimmed_colormap(base_cmap: str, lo: float, hi: float, name="turbo_trimmed"):
    """Build a colormap from 256 colours sampled evenly between *lo* and *hi* of *base_cmap*."""
    sample_points = np.linspace(lo, hi, 256)
    sampled_colors = plt.get_cmap(base_cmap)(sample_points)
    return LinearSegmentedColormap.from_list(name, sampled_colors)


# ========================= LOAD GT =========================
# The JSON maps "u_v_key" edge ids to a value; each entry becomes one (u, v, key, gt) row.
with open(GT_JSON, "r", encoding="utf-8") as f:
    gt_map = json.load(f)

gt_rows = [(*parse_edge_id(eid), float(val)) for eid, val in gt_map.items()]
df_gt = pd.DataFrame(gt_rows, columns=["u", "v", "key", "gt"])

# ========================= LOAD EDGES =========================
edges = ensure_27700(gpd.read_file(EDGES_GEOJSON))

# Without explicit u/v/key columns, derive them from the first id-like column present.
need = {"u", "v", "key"}
if not need.issubset(set(edges.columns)):
    id_columns = [c for c in ["edge_id", "eid", "id", "osmid", "fid"] if c in edges.columns]
    if not id_columns:
        raise ValueError("Edges GeoJSON must contain columns u,v,key OR an edge_id-like column.")
    cand = id_columns[0]
    parsed = edges[cand].astype(str).apply(parse_edge_id)
    for position, column in enumerate(["u", "v", "key"]):
        edges[column] = parsed.apply(lambda triple, position=position: triple[position])

# Keep only the edges that have a ground-truth value.
g2 = edges.merge(df_gt, on=["u", "v", "key"], how="inner")
if not len(g2):
    raise RuntimeError("No edges matched between GeoJSON and GT JSON (check id consistency).")

# Restrict the matched edges to the plot window.
xmin, ymin, xmax, ymax = ENGLAND_BBOX_27700
g2 = g2.cx[xmin:xmax, ymin:ymax].copy()

# Split by inferred heading.
g2["dir"] = g2.geometry.apply(infer_direction)
north, south = (g2[g2["dir"] == heading].copy() for heading in ("north", "south"))

# Colour-scale limits: the VCLIP_Q quantiles of the finite values over both headings.
vals_all = g2["gt"].to_numpy(dtype=float)
finite_vals = vals_all[np.isfinite(vals_all)]
qlo, qhi = np.quantile(finite_vals, VCLIP_Q)
vmin, vmax = float(qlo), float(qhi)

# Basemap is read once and shared by all figures; 3+ band images are reduced to one
# grey band with a weighted sum of the first three bands.
basemap_img, basemap_extent = read_basemap_crop_27700(BASEMAP_TIF, ENGLAND_BBOX_27700, BASEMAP_MAX_PIX)
is_multiband = basemap_img.ndim == 3 and basemap_img.shape[2] >= 3
if is_multiband:
    band_weights = [0.299, 0.587, 0.114]
    basemap_img = np.dot(basemap_img[..., :3], band_weights)

# Trimmed colormap object
CMAP_OBJ = trimmed_colormap(BASE_CMAP, *CMAP_TRIM, name="turbo_trimmed")

def plot_one(gdf, out_path, title):
    """Draw the edges of *gdf*, coloured by their "gt" value, over the basemap and save to *out_path*."""
    fig, ax = plt.subplots(figsize=FIGSIZE)

    # greyscale basemap, with the axes limited to exactly its extent
    ax.imshow(basemap_img, extent=basemap_extent, cmap="gray", vmin=0, vmax=255, alpha=TILES_ALPHA)
    left, right, bottom, top = basemap_extent
    ax.set_xlim(left, right)
    ax.set_ylim(bottom, top)

    line_style = dict(cmap=CMAP_OBJ, vmin=vmin, vmax=vmax, linewidth=LINEWIDTH, alpha=ALPHA)
    add_lines_colored(ax=ax, gdf=gdf, values=gdf["gt"].to_numpy(dtype=float), **line_style)

    ax.set_axis_off()
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)
    print(f"Saved: {out_path} (n={len(gdf)})")

def plot_standalone_colorbar(out_path, cmap_obj, vmin, vmax, label="AADT"):
    """Save a figure containing only a vertical colour bar for *cmap_obj* scaled vmin..vmax."""
    fig = plt.figure(figsize=(1.5, 6.0))
    cax = fig.add_axes([0.35, 0.05, 0.25, 0.9])  # [left, bottom, width, height]

    mappable = plt.cm.ScalarMappable(cmap=cmap_obj)
    mappable.set_clim(vmin, vmax)
    fig.colorbar(mappable, cax=cax).set_label(label)

    fig.savefig(out_path)
    plt.close(fig)
    print(f"Saved: {out_path}")

# One map per heading, then the colour bar shared by both.
for subset, target, heading_title in (
    (north, OUT_NORTH, "Ground truth traffic volume (AADT) — Northbound edges"),
    (south, OUT_SOUTH, "Ground truth traffic volume (AADT) — Southbound edges"),
):
    plot_one(subset, target, heading_title)
plot_standalone_colorbar(OUT_LEGEND, CMAP_OBJ, vmin, vmax, label="Ground truth traffic volume (AADT)")

Saved: descriptives\gt_edges_northbound.pdf (n=2531)
Saved: descriptives\gt_edges_southbound.pdf (n=2557)
Saved: descriptives\gt_edges_colorbar_legend.pdf


In [18]:
#!/usr/bin/env python3
import json
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import rasterio
from rasterio.windows import from_bounds
from rasterio.enums import Resampling
from pathlib import Path
from matplotlib.collections import LineCollection
from matplotlib.colors import LinearSegmentedColormap

# ========================= INPUTS =========================
BASEMAP_TIF = "../basemap/GBOverview.tif"  # raster basemap; the reader requires EPSG:27700
EDGES_GEOJSON = "../highway_network/uk_driving_edges_simplified.geojson"
GT_JSON = "GT_AADT_8years.json"

# Plot window in EPSG:27700 coordinates, ordered (xmin, ymin, xmax, ymax)
ENGLAND_BBOX_27700 = (0, 0, 700000, 700000)

# Output files; the directory is created if it does not exist yet
OUT_DIR = Path("descriptives")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_ALL = OUT_DIR.joinpath("gt_edges_all.pdf")
OUT_LEGEND = OUT_DIR.joinpath("gt_edges_colorbar_legend.svg")

# Style
FIGSIZE, LINEWIDTH = (10, 10), 4
BASEMAP_MAX_PIX = 3000
ALPHA, TILES_ALPHA = 0.9, 0.5

# CRS assigned to the edges when the GeoJSON carries none (lon/lat)
EDGES_ASSUME_CRS_IF_MISSING = "EPSG:4326"

# Colormap: only the [low, high] fraction of the base map is used
BASE_CMAP = "turbo"
CMAP_TRIM = (0.02, 0.98)

# Quantiles of the values used as colour-scale limits; (0, 1) means the full min..max range
VCLIP_Q = (0.00, 1.00)


# ========================= HELPERS =========================
def read_basemap_crop_27700(tif_path: str, bbox_27700, max_pix: int):
    """Read the part of a raster inside a bounding box, downsampled to at most *max_pix* per side.

    bbox_27700 is (xmin, ymin, xmax, ymax). Raises ValueError when the raster has no CRS
    or a CRS other than EPSG:27700.

    Returns (img, extent): img has shape (rows, cols, bands) and extent is
    (left, right, bottom, top) of the window that was actually read.
    """
    xmin, ymin, xmax, ymax = bbox_27700
    with rasterio.open(tif_path) as src:
        epsg = None if src.crs is None else src.crs.to_epsg()
        if epsg != 27700:
            raise ValueError("Basemap must have CRS EPSG:27700.")

        # Requested window, snapped to whole pixels and limited to the raster itself.
        requested = from_bounds(xmin, ymin, xmax, ymax, transform=src.transform)
        requested = requested.round_offsets().round_lengths()
        whole_raster = rasterio.windows.Window(0, 0, src.width, src.height)
        win = requested.intersection(whole_raster)

        # Shrink factor >= 1 so neither output dimension exceeds max_pix (never upsample).
        shrink = max(win.width / max_pix, win.height / max_pix, 1.0)
        out_w = int(max(1, win.width / shrink))
        out_h = int(max(1, win.height / shrink))

        bands = src.read(
            window=win,
            out_shape=(src.count, out_h, out_w),
            resampling=Resampling.bilinear,
        )
        left, bottom, right, top = rasterio.windows.bounds(win, src.transform)

    # (bands, rows, cols) -> (rows, cols, bands)
    return np.transpose(bands, (1, 2, 0)), (left, right, bottom, top)

def ensure_27700(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Return *gdf* in EPSG:27700.

    A frame without a CRS is first tagged with EDGES_ASSUME_CRS_IF_MISSING; a frame
    already in EPSG:27700 is returned without reprojection.
    """
    located = gdf if gdf.crs is not None else gdf.set_crs(EDGES_ASSUME_CRS_IF_MISSING)
    if located.crs.to_epsg() == 27700:
        return located
    return located.to_crs(27700)

def _geom_to_segments(geom):
    """Return the coordinate arrays (float) of a line geometry, one array per part.

    A LineString gives one array; a MultiLineString gives one array per non-empty part.
    None, empty geometries and every other geometry type give an empty list.
    """
    if geom is None or geom.is_empty:
        return []

    kind = geom.geom_type
    if kind == "LineString":
        parts = [geom]
    elif kind == "MultiLineString":
        parts = [part for part in geom.geoms if not (part is None or part.is_empty)]
    else:
        return []
    return [np.asarray(part.coords, dtype=float) for part in parts]

# Line drawing with a controlled paint order (sorted by value).
def add_lines_colored(ax, gdf, value_col, cmap, vmin, vmax, linewidth, alpha, ascending=True):
    """Add the line geometries of *gdf* to *ax* as one LineCollection coloured by *value_col*.

    Rows are added in sorted order of the value (stable sort), so with ascending=True
    the highest values are added last and end up on top. Rows with a non-finite value
    are dropped. Each value is scaled linearly between vmin and vmax and clipped to
    [0, 1]; when vmax is not greater than vmin every line gets the colormap midpoint.

    Returns the LineCollection, or None when nothing was drawn.
    """
    colormap = plt.get_cmap(cmap)

    # finite rows only, ordered by value
    subset = gdf[[value_col, "geometry"]].copy()
    finite_mask = np.isfinite(subset[value_col].to_numpy(dtype=float))
    ordered = subset[finite_mask].sort_values(value_col, ascending=ascending, kind="mergesort")

    has_range = vmax > vmin
    segments, rgba = [], []
    for geom, val in zip(ordered.geometry.values, ordered[value_col].to_numpy(dtype=float)):
        frac = (val - vmin) / (vmax - vmin) if has_range else 0.5
        red, green, blue, _ = colormap(float(np.clip(frac, 0.0, 1.0)))
        drawable = [s for s in _geom_to_segments(geom) if s.shape[0] >= 2]
        segments.extend(drawable)
        rgba.extend([(red, green, blue, alpha)] * len(drawable))

    if not segments:
        return None

    collection = LineCollection(
        segments,
        colors=rgba,
        linewidths=linewidth,
        capstyle="round",
        joinstyle="round",
    )
    ax.add_collection(collection)
    return collection

def parse_edge_id(eid: str):
    """Split a "u_v_key" identifier into a tuple of three ints.

    Only the first three underscore-separated fields are used. Raises ValueError when
    fewer than three fields are present or a field is not an integer.
    """
    fields = str(eid).split("_")
    if len(fields) >= 3:
        u, v, key = (int(tok) for tok in fields[:3])
        return u, v, key
    raise ValueError(f"Bad edge_id (expected u_v_key): {eid}")

def trimmed_colormap(base_cmap: str, lo: float, hi: float, name="turbo_trimmed"):
    """Build a colormap from 256 colours sampled evenly between *lo* and *hi* of *base_cmap*."""
    sample_points = np.linspace(lo, hi, 256)
    sampled_colors = plt.get_cmap(base_cmap)(sample_points)
    return LinearSegmentedColormap.from_list(name, sampled_colors)


# ========================= LOAD GT =========================
# The JSON maps "u_v_key" edge ids to a value; each entry becomes one (u, v, key, gt) row.
with open(GT_JSON, "r", encoding="utf-8") as f:
    gt_map = json.load(f)

gt_rows = [(*parse_edge_id(eid), float(val)) for eid, val in gt_map.items()]
df_gt = pd.DataFrame(gt_rows, columns=["u", "v", "key", "gt"])

# ========================= LOAD EDGES =========================
# Reuse `edges` when an earlier cell already left a GeoDataFrame under that name (the
# read was commented out here, presumably to avoid re-reading the GeoJSON). Otherwise
# load it, so this cell also runs on a fresh kernel or after `edges` was rebound to
# something else (the valid-days histogram cell assigns a plain list of bin edges to
# the same name).
try:
    _have_edges = isinstance(edges, gpd.GeoDataFrame)
except NameError:
    _have_edges = False
if not _have_edges:
    edges = gpd.read_file(EDGES_GEOJSON)
edges = ensure_27700(edges)

# Without explicit u/v/key columns, derive them from the first id-like column present.
need = {"u", "v", "key"}
if not need.issubset(set(edges.columns)):
    cand = None
    for c in ["edge_id", "eid", "id", "osmid", "fid"]:
        if c in edges.columns:
            cand = c
            break
    if cand is None:
        raise ValueError("Edges GeoJSON must contain columns u,v,key OR an edge_id-like column.")
    parsed = edges[cand].astype(str).apply(parse_edge_id)
    edges["u"] = parsed.apply(lambda x: x[0])
    edges["v"] = parsed.apply(lambda x: x[1])
    edges["key"] = parsed.apply(lambda x: x[2])

# Keep only the edges that have a ground-truth value.
g2 = edges.merge(df_gt, on=["u", "v", "key"], how="inner")
if len(g2) == 0:
    raise RuntimeError("No edges matched between GeoJSON and GT JSON (check id consistency).")

# Restrict the matched edges to the plot window.
xmin, ymin, xmax, ymax = ENGLAND_BBOX_27700
g2 = g2.cx[xmin:xmax, ymin:ymax].copy()

# Colour-scale limits: the VCLIP_Q quantiles of the finite values.
vals_all = g2["gt"].to_numpy(dtype=float)
finite_vals = vals_all[np.isfinite(vals_all)]
qlo, qhi = np.quantile(finite_vals, VCLIP_Q)
vmin, vmax = float(qlo), float(qhi)

# Basemap is read once; 3+ band images are reduced to one grey band with a weighted
# sum of the first three bands.
basemap_img, basemap_extent = read_basemap_crop_27700(BASEMAP_TIF, ENGLAND_BBOX_27700, BASEMAP_MAX_PIX)
is_multiband = basemap_img.ndim == 3 and basemap_img.shape[2] >= 3
if is_multiband:
    band_weights = [0.299, 0.587, 0.114]
    basemap_img = np.dot(basemap_img[..., :3], band_weights)

# Trimmed colormap object
CMAP_OBJ = trimmed_colormap(BASE_CMAP, *CMAP_TRIM, name="turbo_trimmed")

def plot_all(gdf, out_path, title):
    """Draw every edge of *gdf*, coloured by its "gt" value, over the basemap and save to *out_path*."""
    fig, ax = plt.subplots(figsize=FIGSIZE)

    # greyscale basemap, with the axes limited to exactly its extent
    ax.imshow(basemap_img, extent=basemap_extent, cmap="gray", vmin=0, vmax=255, alpha=TILES_ALPHA)
    left, right, bottom, top = basemap_extent
    ax.set_xlim(left, right)
    ax.set_ylim(bottom, top)

    # ascending order: low values are added first, high values last (on top)
    line_style = dict(cmap=CMAP_OBJ, vmin=vmin, vmax=vmax, linewidth=LINEWIDTH, alpha=ALPHA)
    add_lines_colored(ax=ax, gdf=gdf, value_col="gt", ascending=True, **line_style)

    ax.set_axis_off()
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)
    print(f"Saved: {out_path} (n={len(gdf)})")

from matplotlib.ticker import FuncFormatter

def thousands_formatter(x, pos):
    """Tick formatter: whole thousands with a "k" suffix from 1000 upwards, else the integer part.

    Both branches truncate toward zero via int(); *pos* is accepted for the
    FuncFormatter call signature and not used.
    """
    return f"{int(x/1000)}k" if x >= 1000 else f"{int(x)}"

def plot_standalone_colorbar(out_path, cmap_obj, vmin, vmax, label="AADT"):
    """Save a figure containing only a vertical colour bar for *cmap_obj* scaled vmin..vmax.

    Tick labels are formatted with thousands_formatter.
    """
    fig = plt.figure(figsize=(1.1, 4.0))
    cax = fig.add_axes([0.35, 0.05, 0.25, 0.9])  # [left, bottom, width, height]

    mappable = plt.cm.ScalarMappable(cmap=cmap_obj)
    mappable.set_clim(vmin, vmax)

    cbar = fig.colorbar(mappable, cax=cax)
    cbar.set_label(label)
    cbar.ax.yaxis.set_major_formatter(FuncFormatter(thousands_formatter))

    fig.savefig(out_path)
    plt.close(fig)
    print(f"Saved: {out_path}")

# Render the all-edges map, then the matching standalone colour bar.
plot_all(gdf=g2, out_path=OUT_ALL, title="Ground truth traffic volume (AADT) — All edges")
plot_standalone_colorbar(
    out_path=OUT_LEGEND,
    cmap_obj=CMAP_OBJ,
    vmin=vmin,
    vmax=vmax,
    label="Ground truth traffic volume (AADT)",
)

Saved: descriptives\gt_edges_all.pdf (n=5088)
Saved: descriptives\gt_edges_colorbar_legend.svg


In [21]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ===================== INPUT =====================
# JSON file with per-edge metadata; read by the LOAD section.
INPUT_JSON = "GT_AADT_8years_meta.json"

# ===================== COLORS (same 4) =====================
C_BLUE, C_LGREEN = "#1f78b4", "#b2df8a"      # motorway, motorway_link
C_ORANGE, C_LORANGE = "#ff7f00", "#fdb462"   # trunk, trunk_link

# Plotting order of the highway types; both lookup tables are keyed in this order.
TYPE_ORDER = ["motorway", "motorway_link", "trunk", "trunk_link"]
TYPE_LABEL = dict(zip(TYPE_ORDER, ["Motorway", "Motorway link", "Trunk", "Trunk link"]))
TYPE_COLOR = dict(zip(TYPE_ORDER, [C_BLUE, C_LGREEN, C_ORANGE, C_LORANGE]))

# ===================== LOAD =====================
# Read the per-edge metadata and flatten it into one row per edge.
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    meta = json.load(f)

rows = []
for edge_id, rec in meta.items():
    gt, sd = float(rec["gt"]), float(rec["sd_daily"])
    years_used = rec.get("years_used", [])
    n_days_total = int(rec.get("n_days_total", 0))

    # Largest per-year sensor count, or 0 when none is recorded.
    n_sensors_py = rec.get("n_sensors_per_year", {})
    if len(n_sensors_py):
        n_sensors = int(max(n_sensors_py.values()))
    else:
        n_sensors = 0

    rows.append(dict(
        edge=edge_id,
        highway_type=rec["highway_type"],
        gt=gt,
        sd_daily=sd,
        # coefficient of variation; NaN unless gt is strictly positive
        cv_ratio=(sd / gt) if gt > 0 else np.nan,
        n_sensors=n_sensors,
        n_years=len(years_used),
        n_days_total=n_days_total,
    ))

# Keep only the highway types that have a label/colour defined.
df = pd.DataFrame(rows)
df = df.loc[df["highway_type"].isin(list(TYPE_LABEL))].copy()

# ===================== BOX PLOT (outline-only, colored lines) =====================
from matplotlib.ticker import FuncFormatter

# One array of gt values per highway type, in TYPE_ORDER, with matching labels/colours/means.
data   = [df.loc[df["highway_type"] == t, "gt"].dropna().values for t in TYPE_ORDER]
labels = [TYPE_LABEL[t] for t in TYPE_ORDER]
colors = [TYPE_COLOR[t] for t in TYPE_ORDER]
means  = [np.mean(arr) if len(arr) else np.nan for arr in data]

fig, ax = plt.subplots(figsize=(4, 4))

# Tick labels are set right after the call instead of through boxplot(labels=...):
# that keyword was renamed to tick_labels in Matplotlib 3.9 and the old name emits a
# MatplotlibDeprecationWarning. set_xticklabels works on both older and newer versions.
bp = ax.boxplot(
    data,
    patch_artist=True,   # needed to control box face/edge
    showfliers=True,
    whis=1.5
)
ax.set_xticklabels(labels)

# ---- Outline-only boxes: no fill, colored borders ----
for patch, col in zip(bp["boxes"], colors):
    patch.set_facecolor("none")
    patch.set_edgecolor(col)
    patch.set_linewidth(1.8)

# ---- Medians: colored ----
for median, col in zip(bp["medians"], colors):
    median.set_color(col)
    median.set_linewidth(1.8)

# ---- Whiskers & caps: colored (2 of each per box, stored consecutively) ----
for i, col in enumerate(colors):
    for part in ("whiskers", "caps"):
        for line in bp[part][2 * i: 2 * i + 2]:
            line.set_color(col)
            line.set_linewidth(1.4)

# ---- Fliers: hollow markers in the box colour ----
for flier, col in zip(bp["fliers"], colors):
    flier.set_markeredgecolor(col)
    flier.set_markerfacecolor("none")
    flier.set_alpha(0.4)

# ---- Add dotted mean line (colored to match type) ----
for i, (box, m, col) in enumerate(zip(bp["boxes"], means, colors), start=1):
    if not np.isfinite(m):
        continue
    # lower/upper edge of the drawn box, read back from the patch outline
    verts = box.get_path().vertices
    q1 = float(np.min(verts[:, 1]))
    q3 = float(np.max(verts[:, 1]))

    # longer dotted segment when the mean lies inside the box, shorter otherwise
    half_width = 0.28 if q1 <= m <= q3 else 0.12

    ax.hlines(
        y=m,
        xmin=i - half_width,
        xmax=i + half_width,
        colors=col,
        linestyles="dotted",
        linewidth=1.6,
        zorder=3
    )

ax.set_ylabel("Ground truth traffic volume (AADT)")
# ax.set_title("AADT distribution by highway type")
ax.grid(axis="y", alpha=0.25)
plt.xticks(rotation=45, ha="right")

# y ticks: whole thousands with a "k" suffix from 1000 upwards, integer part below
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda x, pos: f"{int(x/1000)}k" if x >= 1000 else f"{int(x)}")
)
plt.tight_layout()
plt.savefig("boxplot_gt_by_highway_type.pdf")
plt.savefig("boxplot_gt_by_highway_type.png", dpi=300)
plt.close()

print("Saved: boxplot_gt_by_highway_type.pdf and .png")

Saved: boxplot_gt_by_highway_type.pdf and .png


  bp = ax.boxplot(
