In [None]:
#!/usr/bin/env python3
# build_GT_multi_year_with_uncertainty.py
#
# Goal:
#   Build multi-year edge-level GT (AADT-like total volume) and an uncertainty
#   meta file using per-sensor, per-year summaries:
#     - years: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024] (see YEARS)
#     - per sensor & year: from <SUMMARY_PATH>_<year>.json we use
#         * the CAR_TYPE band: { "mean", "sd", "num_valid_days" }
#   For each edge:
#     1) For each year:
#          - select sensors on that edge
#          - filter by num_valid_days >= MIN_DAYS
#          - remove sensors with mean <= 0 (zero GT never participates)
#          - aggregate sensor-level "mean" to an edge-level yearly GT using
#            SENSOR_AGG ("mean" or "median").
#     2) Across years:
#          - aggregate yearly GT values using YEAR_AGG ("mean" or "median")
#          - remove edges with final GT == 0
#     3) Compute a pooled daily SD over *all* sensors and years:
#          - treat each sensor-year as (mean μ_sy, sd s_sy, n_sy days)
#          - weighted overall mean μ* using n_sy
#          - pooled variance:
#              Var* = [ Σ (n_sy - 1) s_sy^2 + Σ n_sy (μ_sy - μ*)^2 ] / (Σ n_sy - 1)
#          - SD* = sqrt(Var*)
#          - CV* = SD* / GT_final
#
# Outputs:
#   - OUT_GT_JSON   (GT_car_pm_8years.json)       (edge_id -> GT float)
#   - OUT_META_JSON (GT_car_pm_8years_meta.json)  (edge_id -> dict with GT, per-year GT,
#                                         counts, SD, CV, etc.)
#
# Notes:
#   - All filtering by highway type and sensor-day count is done BEFORE
#     any aggregation.
#   - Sensor-year entries with mean <= 0 are ignored (GT=0 never used).

import json
import numpy as np
import geopandas as gpd
from pathlib import Path

# -----------------------------
# Config
# -----------------------------
# Calendar years whose per-sensor summaries are combined (2017..2024 inclusive).
YEARS = list(range(2017, 2025))

# Inputs: the road-network edge table and the edge -> sensor-id mapping.
GEOJSON_PATH = "../highway_network/uk_driving_edges_simplified.geojson"
EDGE_TO_SENSOR = "edge_to_sensor.json"

# Outputs: the edge -> GT mapping and the per-edge meta/uncertainty record.
OUT_GT_JSON = "GT_car_pm_8years.json"
OUT_META_JSON = "GT_car_pm_8years_meta.json"

# A sensor-year is used only when it has at least this many valid days.
MIN_DAYS = 200

# Key of the per-sensor entry whose "mean", "sd" and "num_valid_days" are read.
CAR_TYPE = "0 - 520 cm"

# Prefix of the yearly summary files; "_<year>.json" is appended when loading.
SUMMARY_PATH = "weekday_am_pm/evening_peak_weekdays"

# Edges whose highway type is not in this set are dropped.
ALLOWED = {
    "motorway",
    "trunk",
    "motorway_link",
    "trunk_link",
}

# Aggregation modes, each either "mean" or "median":
#   SENSOR_AGG combines several sensors on one edge within a single year;
#   YEAR_AGG combines the resulting yearly values of one edge.
SENSOR_AGG = "median"
YEAR_AGG = "median"


# -----------------------------
# Small helper for aggregation
# -----------------------------
def aggregate(vals, mode: str):
    """Aggregate numeric values with the arithmetic mean or the median.

    Args:
        vals: Any iterable of numbers (list, tuple, generator, NumPy array),
            or None.
        mode: Either "mean" or "median".

    Returns:
        The aggregate as a built-in float, or None when ``vals`` is None or
        yields no values.

    Raises:
        ValueError: If ``mode`` is neither "mean" nor "median". The mode is
            only checked once the input is known to be non-empty.
    """
    if vals is None:
        return None

    # Materialise before testing for emptiness: truth-testing a NumPy array is
    # ambiguous (raises) and a generator is truthy even when it yields nothing.
    arr = np.asarray(list(vals), dtype=float)
    if arr.size == 0:
        return None

    if mode == "mean":
        return float(np.mean(arr))
    if mode == "median":
        return float(np.median(arr))
    raise ValueError(f"Unknown aggregation mode: {mode}")


# -----------------------------
# Load network
# -----------------------------
print("Loading network GeoJSON...")
# Read the edge table from disk. Without this call `gdf` is undefined on a
# fresh run and the column assignment below fails with NameError.
gdf = gpd.read_file(GEOJSON_PATH)

# Edge identifier "<u>_<v>_<key>"; the keys of edge_to_sensor.json are looked
# up against these ids further down.
gdf["edge_id"] = (
    gdf["u"].astype(str) + "_" +
    gdf["v"].astype(str) + "_" +
    gdf["key"].astype(str)
)
# Lower-case the highway type so it can be compared with the ALLOWED set.
# NOTE(review): if a "highway" cell holds a list of types, its string form
# will not match any ALLOWED entry -- confirm against the GeoJSON.
gdf["highway"] = gdf["highway"].astype(str).str.lower()
edge_to_highway = dict(zip(gdf["edge_id"], gdf["highway"]))


# -----------------------------
# Load edge-to-sensor mapping
# -----------------------------
print("Loading edge_to_sensor mapping...")
# Mapping edge_id -> iterable of sensor ids, parsed from the JSON file.
edge_to_sensor = json.loads(Path(EDGE_TO_SENSOR).read_text())

# Number of edges present in the mapping before any filtering.
n_total_edges_map = len(edge_to_sensor)

# Audit counters, one per reason an edge can be dropped; all start at zero.
n_missing_in_network = n_excluded_highway = 0
n_no_valid_sensor = n_zero_gt_removed = 0

# -----------------------------
# Load all year summaries
# -----------------------------
# year -> parsed contents of that year's summary file
summary_by_year = {}

for yr in YEARS:
    path = f"{SUMMARY_PATH}_{yr}.json"
    try:
        with open(path, "r") as fh:
            loaded = json.load(fh)
    except FileNotFoundError:
        # Every configured year is required; stop on the first missing file.
        raise RuntimeError(f"Missing file: {path}")
    summary_by_year[yr] = loaded
    print(f"Loaded summary for {yr} ({len(loaded)} sensors)")


# -----------------------------
# Main build: GT + meta
# -----------------------------
def _valid_sensor_stats(sensor_ids, year_summary, band_key, min_days):
    """Collect (mean, sd, n_days) for every usable sensor of one edge and year.

    A sensor is used only when its record contains the ``band_key`` entry with
    at least ``min_days`` valid days, a strictly positive mean and a non-null
    sd. Sensor ids are converted to ``str`` before the lookup.
    """
    usable = []
    for sid in sensor_ids:
        rec = year_summary.get(str(sid))
        if not rec:
            continue

        band = rec.get(band_key)
        if not band:
            continue

        # A missing or null day count is treated as zero valid days instead
        # of failing on the comparison below.
        n_days = band.get("num_valid_days") or 0
        if n_days < min_days:
            continue

        mean = band.get("mean")
        sd = band.get("sd")
        # Non-positive means never take part in the GT; sd is needed for the
        # pooled variance, so entries without it are skipped as well.
        if mean is None or mean <= 0 or sd is None:
            continue

        usable.append((float(mean), float(sd), int(n_days)))
    return usable


def _pooled_daily_sd(stats):
    """Return the pooled SD over (mean, sd, n_days) groups, or None.

    Implements
        Var* = [ sum (n - 1) s^2  +  sum n (mu - mu*)^2 ] / (sum n - 1)
    where mu* is the n-weighted mean of the group means. None is returned
    when fewer than two days are available in total or the numerator is not
    a non-negative number (e.g. NaN).
    """
    n_total = sum(n for _, _, n in stats)
    if n_total <= 1:
        return None

    # Explicit accumulation loops keep the floating-point summation order fixed.
    weighted_sum = 0.0
    for mu, _, n in stats:
        weighted_sum += n * mu
    mu_star = weighted_sum / n_total

    num_var = 0.0
    for mu, sd, n in stats:
        # The within-group term needs at least two days ...
        if n > 1:
            num_var += (n - 1) * sd ** 2
        # ... but the between-group term applies to every group with data, so
        # a single-day group still contributes its offset from mu*.
        if n > 0:
            num_var += n * (mu - mu_star) ** 2

    if not num_var >= 0:
        return None
    return float(np.sqrt(num_var / (n_total - 1)))


gt_out = {}    # edge_id -> final GT
meta_out = {}  # edge_id -> meta dict

for edge_id, sensor_ids in edge_to_sensor.items():

    # 1) The edge must exist in the network.
    hwy = edge_to_highway.get(edge_id)
    if hwy is None:
        n_missing_in_network += 1
        continue

    # 2) Highway-type filter.
    if hwy not in ALLOWED:
        n_excluded_highway += 1
        continue

    gt_per_year = {}         # year -> GT of that year
    n_sensors_per_year = {}  # year -> number of sensors used
    n_days_per_year = {}     # year -> total valid days across used sensors
    sensor_year_stats = []   # (mean, sd, n_days) over all used sensor-years

    # 3) Year-level GTs, plus the sensor-year stats for the pooled variance.
    for yr in YEARS:
        stats_list = _valid_sensor_stats(
            sensor_ids, summary_by_year[yr], CAR_TYPE, MIN_DAYS
        )
        if not stats_list:
            # No valid sensor for this edge in this year.
            continue

        gt_year = aggregate([m for m, _, _ in stats_list], SENSOR_AGG)

        # Should not trigger because only positive means are kept.
        if gt_year is None or gt_year <= 0:
            continue

        gt_per_year[yr] = float(gt_year)
        n_sensors_per_year[yr] = len(stats_list)
        n_days_per_year[yr] = int(sum(nd for _, _, nd in stats_list))
        sensor_year_stats.extend(stats_list)

    # Edges without a single valid year are dropped.
    if not gt_per_year:
        n_no_valid_sensor += 1
        continue

    # 4) Final GT = YEAR_AGG across all available years.
    final_gt = aggregate(list(gt_per_year.values()), YEAR_AGG)
    if final_gt is None or final_gt == 0.0:
        n_zero_gt_removed += 1
        continue

    # 5) Pooled daily SD across all used sensor-years, and CV relative to GT.
    n_days_total = int(sum(nd for _, _, nd in sensor_year_stats))
    sd_daily = _pooled_daily_sd(sensor_year_stats)
    cv_daily = None
    if sd_daily is not None and final_gt > 0:
        cv_daily = float(sd_daily / final_gt)

    # 6) Store outputs.
    gt_out[edge_id] = float(final_gt)

    meta_out[edge_id] = {
        "gt": float(final_gt),
        "years_used": sorted(int(y) for y in gt_per_year),
        "gt_per_year": {str(y): float(v) for y, v in gt_per_year.items()},
        "n_sensors_per_year": {str(y): int(n) for y, n in n_sensors_per_year.items()},
        "n_days_per_year": {str(y): int(n) for y, n in n_days_per_year.items()},
        "n_days_total": int(n_days_total),
        "sd_daily": sd_daily,
        "cv_daily": cv_daily,
        "sensor_agg": SENSOR_AGG,
        "year_agg": YEAR_AGG,
        "min_days": MIN_DAYS,
        "highway_type": hwy,
    }


# -----------------------------
# Save result
# -----------------------------
# Write the GT mapping first, then the meta mapping, both as indented JSON.
for out_path, payload in ((OUT_GT_JSON, gt_out), (OUT_META_JSON, meta_out)):
    with open(out_path, "w") as fh:
        json.dump(payload, fh, indent=2)

# -----------------------------
# Report
# -----------------------------
# Build the whole report first, then emit it with a single print call.
report_lines = [
    "\n==== Multi-year GT build report ====",
    f"Years included: {YEARS}",
    f"SENSOR_AGG (per-year sensor→edge): {SENSOR_AGG}",
    f"YEAR_AGG   (across years):         {YEAR_AGG}",
    f"MIN_DAYS per sensor per year:      {MIN_DAYS}",
    f"Total edges in edge_to_sensor:     {n_total_edges_map}",
    f" - Missing in GeoJSON network:     {n_missing_in_network}",
    f" - Excluded by highway:            {n_excluded_highway}",
    f" - No valid sensor across years:   {n_no_valid_sensor}",
    f" - GT == 0 removed:                {n_zero_gt_removed}",
    f"Final saved edges:                 {len(gt_out)}",
    f"Output GT   -> {OUT_GT_JSON}",
    f"Output meta -> {OUT_META_JSON}",
]
print("\n".join(report_lines))

Loading network GeoJSON...
Loading edge_to_sensor mapping...
Loaded summary for 2017 (8913 sensors)
Loaded summary for 2018 (8763 sensors)
Loaded summary for 2019 (8794 sensors)
Loaded summary for 2020 (9061 sensors)
Loaded summary for 2021 (8930 sensors)
Loaded summary for 2022 (9083 sensors)
Loaded summary for 2023 (8862 sensors)
Loaded summary for 2024 (8695 sensors)

==== Multi-year GT build report ====
Years included: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
SENSOR_AGG (per-year sensor→edge): median
YEAR_AGG   (across years):         median
MIN_DAYS per sensor per year:      200
Total edges in edge_to_sensor:     5782
 - Missing in GeoJSON network:     0
 - Excluded by highway:            47
 - No valid sensor across years:   868
 - GT == 0 removed:                0
Final saved edges:                 4867
Output GT   -> GT_total_pm_8years.json
Output meta -> GT_total_pm_8years_meta.json


In [None]:
import json
from pathlib import Path

# -------------------------
# File paths
# -------------------------
DIR = Path(".")

# The three GT files to compare, all resolved relative to DIR.
base_file = DIR.joinpath("GT_AADT_8years.json")
am_file = DIR.joinpath("GT_total_am_8years.json")
pm_file = DIR.joinpath("GT_total_pm_8years.json")


def _key_set(json_path):
    """Return the set of top-level keys of the JSON object at ``json_path``."""
    with open(json_path, "r") as fh:
        return set(json.load(fh).keys())


# Top-level key sets of the three files, loaded in the order base, AM, PM.
base = _key_set(base_file)
am = _key_set(am_file)
pm = _key_set(pm_file)

# -------------------------
# Compare
# -------------------------
# Keys that occur in the AM / PM file but not in the base file, sorted.
missing_am = sorted(am.difference(base))
missing_pm = sorted(pm.difference(base))

# One header per file, followed by one key per line (nothing if there are none).
for header, extra_keys in (
    ("=== AM keys not in base ===", missing_am),
    ("\n=== PM keys not in base ===", missing_pm),
):
    print(header)
    for key in extra_keys:
        print(key)

=== AM keys not in base ===

=== PM keys not in base ===
