In [1]:
#!/usr/bin/env python3
"""
Summarise *best* validation metrics per spatial-CV region from logs.

Assumptions
-----------
- You have 9 log files:
    logs/spatial_cv_1.log ... logs/spatial_cv_9.log
- Validation lines look like:
    Epoch X Step Y Validation Eval: RMSE: a, MAE: b, MGEH: c, R2: d (optional stage/lr)

Output
------
- Prints a table (one row per region) with:
    best_val_RMSE (min), best_val_MAE (min), best_val_MGEH (min), best_val_R2 (max)
  plus the sequential validation-eval index (1-based count of validation
  lines in the log, not the training step) at which each best occurs.
- Also saves CSV: logs/spatial_cv_region_best_metrics.csv
"""

import re
from pathlib import Path
import pandas as pd

LOG_DIR = Path("logs")
LOG_TEMPLATE = "spatial_cv_{i}.log"   # i = 1..9
OUT_CSV = LOG_DIR / "spatial_cv_region_best_metrics.csv"

# Log file number i -> ONS region code that log i is reported under.
REGION_CODE_BY_I = {
    1: "E12000001",
    2: "E12000002",
    3: "E12000003",
    4: "E12000004",
    5: "E12000005",
    6: "E12000006",
    7: "E12000007",
    8: "E12000008",
    9: "E12000009",
}

# Standard ONS English region names for E12000001..E12000009
REGION_NAME_BY_CODE = {
    "E12000001": "North East",
    "E12000002": "North West",
    "E12000003": "Yorkshire and the Humber",
    "E12000004": "East Midlands",
    "E12000005": "West Midlands",
    "E12000006": "East of England",
    "E12000007": "London",
    "E12000008": "South East",
    "E12000009": "South West",
}

# Float pattern: 1.23, -4., .5, 1e-3, -2.1E+05
# Two alternatives: "digits with an optional fraction" (this is what admits a
# trailing dot such as "-4.") and "leading-dot fraction" (".5"). The earlier
# form \d*\.?\d+ demanded a digit after the dot, so a value written as "4."
# followed by "," made the whole validation line fail to match.
FLOAT = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

# One validation line: "Epoch X Step Y Validation Eval: RMSE: a, MAE: b, MGEH: c, R2: d"
# optionally followed by "(stage s/S, lr=v)". Matching is case-insensitive.
pattern_val = re.compile(
    rf"Epoch\s+(?P<epoch>\d+)\s+Step\s+(?P<step>\d+)\s+Validation\s+Eval:\s*"
    rf"RMSE:\s*(?P<rmse>{FLOAT}),\s*MAE:\s*(?P<mae>{FLOAT}),\s*MGEH:\s*(?P<mgeh>{FLOAT}),\s*R2:\s*(?P<r2>{FLOAT})"
    rf"(?:\s*\(stage\s+(?P<stage_cur>\d+)\s*/\s*(?P<stage_total>\d+)\s*,\s*lr\s*=\s*(?P<lr>{FLOAT})\s*\))?",
    re.IGNORECASE,
)

def parse_val_records(text: str):
    """Collect every validation-eval line found in ``text`` into a DataFrame.

    Each match of the module-level ``pattern_val`` becomes one row holding a
    1-based ``eval_idx`` (order of appearance in ``text``), the parsed
    ``epoch``/``step``, the four metrics, and the optional ``stage``/``lr``
    (``None`` when the trailing "(stage a/b, lr=...)" part is absent).
    When nothing matches, the returned frame is empty.
    """
    def _optional(cast, raw):
        # Optional regex groups are None when they did not participate.
        return None if raw is None else cast(raw)

    records = []
    for position, match in enumerate(pattern_val.finditer(text), start=1):
        fields = match.groupdict()
        records.append({
            "eval_idx": position,  # sequential validation-eval counter
            "epoch": int(fields["epoch"]),
            "step": int(fields["step"]),
            "rmse": float(fields["rmse"]),
            "mae": float(fields["mae"]),
            "mgeh": float(fields["mgeh"]),
            "r2": float(fields["r2"]),
            "stage": _optional(int, fields["stage_cur"]),
            "lr": _optional(float, fields["lr"]),
        })
    return pd.DataFrame(records)

def best_min(df: pd.DataFrame, col: str):
    """Return ``(value, eval_idx)`` for the smallest non-NaN entry of ``df[col]``.

    Parameters
    ----------
    df : frame with at least the columns ``col`` and ``"eval_idx"``.
    col : name of the metric column to minimise.

    Returns
    -------
    ``(float, int)`` for the best row, or ``(None, None)`` when ``df`` is empty
    or the column contains no non-NaN value.
    """
    if df.empty:
        return None, None
    values = df[col].astype(float).dropna()
    if values.empty:
        # With only NaNs there is no row label for idxmin to return, and the
        # .loc lookup below would fail; report "no best" instead.
        return None, None
    i = values.idxmin()
    return float(values.loc[i]), int(df.loc[i, "eval_idx"])

def best_max(df: pd.DataFrame, col: str):
    """Return ``(value, eval_idx)`` for the largest non-NaN entry of ``df[col]``.

    Parameters
    ----------
    df : frame with at least the columns ``col`` and ``"eval_idx"``.
    col : name of the metric column to maximise.

    Returns
    -------
    ``(float, int)`` for the best row, or ``(None, None)`` when ``df`` is empty
    or the column contains no non-NaN value.
    """
    if df.empty:
        return None, None
    values = df[col].astype(float).dropna()
    if values.empty:
        # With only NaNs there is no row label for idxmax to return, and the
        # .loc lookup below would fail; report "no best" instead.
        return None, None
    i = values.idxmax()
    return float(values.loc[i]), int(df.loc[i, "eval_idx"])

# (label used in the output column names, parsed column, selector)
_VAL_METRICS = (
    ("RMSE", "rmse", best_min),
    ("MAE", "mae", best_min),
    ("MGEH", "mgeh", best_min),
    ("R2", "r2", best_max),
)

summaries = []

for fold in range(1, 10):
    region_code = REGION_CODE_BY_I[fold]
    region_name = REGION_NAME_BY_CODE.get(region_code, region_code)
    fold_log = LOG_DIR / LOG_TEMPLATE.format(i=fold)
    summary = {"region": region_name}

    if fold_log.exists():
        fold_evals = parse_val_records(fold_log.read_text(encoding="utf-8", errors="ignore"))
        # Each metric's best is picked independently, so the "_eval" indices may differ.
        for label, column, pick_best in _VAL_METRICS:
            summary[f"best_val_{label}"], summary[f"best_val_{label}_eval"] = pick_best(fold_evals, column)
        summary["n_val_evals"] = int(fold_evals.shape[0])
    else:
        # Keep a placeholder row so the region still shows up in the table.
        print(f"[WARN] Missing log: {fold_log}")
        for label, _, _ in _VAL_METRICS:
            summary[f"best_val_{label}"] = None
            summary[f"best_val_{label}_eval"] = None

    summaries.append(summary)

out = pd.DataFrame(summaries)
out = out.sort_values("region").reset_index(drop=True)

# Widen pandas' display limits only for this print so every column is shown.
with pd.option_context("display.max_columns", 200, "display.width", 200):
    print(out)

OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUT_CSV, index=False)
print(f"\nSaved: {OUT_CSV}")

                     region  best_val_RMSE  best_val_RMSE_eval  best_val_MAE  best_val_MAE_eval  best_val_MGEH  best_val_MGEH_eval  best_val_R2  best_val_R2_eval  n_val_evals
0             East Midlands    9807.255859                  35   6966.020996                 35      51.606220                  35     0.650520                35           55
1           East of England   10803.422852                  13   7620.530762                 33      55.697933                  33     0.728260                13           53
2                    London   13482.264648                  31   9559.274414                 32      61.599014                  32     0.714243                31           52
3                North East    7944.389648                  12   6212.567871                 12      55.935581                  12     0.474198                12           32
4                North West   12035.644531                  23   8820.658203                 34      58.136616               

In [17]:
#!/usr/bin/env python3
import re
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, BoundaryNorm
import matplotlib.patches as mpatches
from matplotlib.gridspec import GridSpec

# =========================
# Inputs
# =========================
SHP = Path(r"../data/node_features/boundaries/Regions_December_2022_Boundaries_EN_BFC_V2/RGN_DEC_2022_EN_BFC_V2.shp")
LOG_DIR = Path("logs")
LOG_TPL = "spatial_cv_{}.log"  # 1..9

# Reference MGEH drawn as a line on the legend and the bar chart.
RANDOM_CV_AVG_MGEH = 54.3

OUT_DIR = Path("figures")
OUT_DIR.mkdir(parents=True, exist_ok=True)
# NOTE: the *_PDF names are historical; only the map is written as .pdf,
# the bar chart and the legend are written as .svg.
OUT_MAP_PDF   = OUT_DIR / "spatial_cv_mgeh_region_map.pdf"
OUT_BAR_PDF   = OUT_DIR / "spatial_cv_mgeh_region_bar.svg"
OUT_LEG_PDF   = OUT_DIR / "spatial_cv_mgeh_legend.svg"

# =========================
# Region mapping (1..9)
# =========================
# Position in this list (1-based) is the log file number for that region.
REGION_CODES = [
    "E12000001", "E12000002", "E12000003", "E12000004", "E12000005",
    "E12000006", "E12000007", "E12000008", "E12000009"
]
REGION_NAMES = {
    "E12000001": "North East",
    "E12000002": "North West",
    "E12000003": "Yorkshire and The Humber",
    "E12000004": "East Midlands",
    "E12000005": "West Midlands",
    "E12000006": "East of England",
    "E12000007": "London",
    "E12000008": "South East",
    "E12000009": "South West",
}

# =========================
# Parse best validation metrics per region
# =========================
# Float pattern accepting 1.23, -4., .5, 1e-3, -2.1E+05. The earlier form
# \d*\.?\d+ required a digit after the dot, so a value written as "4."
# followed by "," made the whole validation line fail to match.
FLOAT = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
pat_val = re.compile(
    rf"Validation\s+Eval:\s*RMSE:\s*(?P<rmse>{FLOAT}),\s*MAE:\s*(?P<mae>{FLOAT}),\s*MGEH:\s*(?P<mgeh>{FLOAT}),\s*R2:\s*(?P<r2>{FLOAT})",
    re.IGNORECASE
)

rows = []
for fold_no, code in enumerate(REGION_CODES, start=1):
    log_file = LOG_DIR / LOG_TPL.format(fold_no)
    if not log_file.exists():
        raise FileNotFoundError(f"Missing log: {log_file}")

    log_text = log_file.read_text(encoding="utf-8", errors="ignore")

    # One (rmse, mae, mgeh, r2) tuple per validation eval, in log order.
    evals = [
        tuple(float(hit.groupdict()[key]) for key in ("rmse", "mae", "mgeh", "r2"))
        for hit in pat_val.finditer(log_text)
    ]
    if not evals:
        raise RuntimeError(f"No Validation Eval lines found in {log_file}")

    # best = the eval with the lowest MGEH; the other three metrics are read
    # at that same eval rather than optimised independently.
    lowest_mgeh_at = int(np.nanargmin([entry[2] for entry in evals]))
    sel_rmse, sel_mae, sel_mgeh, sel_r2 = evals[lowest_mgeh_at]
    rows.append({
        "RGN_CODE": code,
        "Region": REGION_NAMES.get(code, code),
        "best_val_MGEH": float(sel_mgeh),
        "best_val_MAE": float(sel_mae),
        "best_val_RMSE": float(sel_rmse),
        "best_val_R2": float(sel_r2),
    })

df = pd.DataFrame(rows)

# =========================
# Load regions shapefile + join
# =========================
gdf = gpd.read_file(SHP)

# First choice: a column whose name contains "RGN" and ends in "CD".
code_cols = [
    name for name in gdf.columns
    if "RGN" in name.upper() and name.upper().endswith("CD")
]
if len(code_cols) == 0:
    # Fallback: the first column holding at least one E1200000x-style value.
    for name in gdf.columns:
        looks_like_codes = gdf[name].astype(str).str.startswith("E1200000").any()
        if looks_like_codes:
            code_cols = [name]
            break
if len(code_cols) == 0:
    raise ValueError("Could not find a region code column in the shapefile (expected something like RGN22CD).")

code_col = code_cols[0]
gdf[code_col] = gdf[code_col].astype(str)

# Inner join: only regions present in both the shapefile and the metrics table survive.
g = gdf.merge(df, left_on=code_col, right_on="RGN_CODE", how="inner")
if g.empty:
    raise RuntimeError("Join produced 0 rows. Check region code column and mappings.")

# =========================
# Discrete bins + palette
# =========================
# Six colours, one per interval between consecutive edges below:
# 48-50, 50-52, 52-56, 56-58, 58-59, 59-62.
# The final edge (62) only caps the displayed extent of the top "59+" bin.
colors = ["#ffffb2", "#fed976", "#feb24c", "#fd8d3c", "#f03b20", "#bd0026"]
bounds = [48, 50, 52, 56, 58, 59, 62]

cmap = ListedColormap(colors, name="mgeh_bins")
norm = BoundaryNorm(boundaries=bounds, ncolors=cmap.N)

def mgeh_bin_labels():
    """Return the six display labels for the MGEH colour bins, lowest bin first."""
    labels = ("0–50", "50–52", "52–56", "56–58", "58–59", "59+")
    return list(labels)

def color_for_value(v):
    """Return the hex colour of the bin that ``v`` falls into.

    The bin is found from the module-level ``bounds`` edges; values below the
    first edge use the first colour and values at or above the last edge use
    the last colour of the module-level ``colors`` list.
    """
    last = len(colors) - 1
    bin_no = np.digitize([v], bounds)[0] - 1
    # Clamp so out-of-range values reuse the end colours instead of failing.
    if bin_no > last:
        bin_no = last
    if bin_no < 0:
        bin_no = 0
    return colors[bin_no]

# =========================
# (A) Save legend-only figure (written to OUT_LEG_PDF, an .svg path)
# =========================
fig = plt.figure(figsize=(0.5, 3))
ax = fig.add_subplot(111)

ax.set_xlim(0, 1)
# Span every bin edge. Stopping at bounds[-2] (59) left the top "59+" bin
# with zero height, so its colour never appeared in the legend.
ax.set_ylim(bounds[0], bounds[-1])
ax.set_xticks([])
ax.set_yticks([])

# stacked rectangles: one per colour, from its lower edge to its upper edge
# (the last one runs from 59 up to the display cap bounds[-1])
for (y0, y1, col) in zip(bounds[:-1], bounds[1:], colors):
    ax.add_patch(mpatches.Rectangle((0.20, y0), 0.38, y1 - y0, facecolor=col, edgecolor="none"))

# labels (intentionally disabled; kept for reference)
# labels = mgeh_bin_labels()
# y_mids = [(lo + hi) / 2 for lo, hi in zip(bounds[:-1], bounds[1:])]
# for lab, ym in zip(labels, y_mids):
#     ax.text(0.62, ym, lab, va="center", ha="left", fontsize=8)

# reference line
ax.axhline(RANDOM_CV_AVG_MGEH, color="gray", lw=1.0)
# ax.text(0.20, RANDOM_CV_AVG_MGEH, "Random CV avg", va="bottom", ha="left", fontsize=8, color="gray")

# Hide the axes frame so only the colour stack and reference line remain.
for sp in ax.spines.values():
    sp.set_visible(False)

fig.savefig(OUT_LEG_PDF, bbox_inches="tight", transparent=True)
plt.close(fig)

# =========================
# (B) Save map-only figure (written to OUT_MAP_PDF)
# =========================
fig, ax_map = plt.subplots(figsize=(10, 10.0))

# Choropleth of the per-region MGEH using the discrete bins defined above;
# thin white outlines separate neighbouring regions.
choropleth_style = dict(
    column="best_val_MGEH",
    cmap=cmap,
    norm=norm,
    linewidth=0.6,
    edgecolor="white",
)
g.plot(ax=ax_map, **choropleth_style)
ax_map.set_axis_off()

fig.savefig(OUT_MAP_PDF, bbox_inches="tight", transparent=True)
plt.close(fig)

# =========================
# (C) Save bar-only figure (written to OUT_BAR_PDF, an .svg path)
# =========================
df_bar = df.sort_values("best_val_MGEH", ascending=False).reset_index(drop=True)
bar_values = df_bar["best_val_MGEH"].to_numpy()
ypos = np.arange(len(df_bar))
bar_colors = list(map(color_for_value, bar_values))

fig, ax_bar = plt.subplots(figsize=(4.5, 4))

# Horizontal bars, highest MGEH at the top (the y axis is inverted below).
ax_bar.barh(ypos, bar_values, color=bar_colors, edgecolor="none")
ax_bar.set_yticks(ypos)
ax_bar.set_yticklabels(df_bar["Region"].tolist())
ax_bar.invert_yaxis()

ax_bar.set_xlim(40, 70)
ax_bar.axvline(RANDOM_CV_AVG_MGEH, linestyle=":", lw=2, color='gray')

# remove top/right spines
for side in ("top", "right"):
    ax_bar.spines[side].set_visible(False)

# no title
ax_bar.set_xlabel("Best test MGEH")

# annotate values just right of each bar end, capped so text stays inside the axis
for row_y, value in zip(ypos, bar_values):
    ax_bar.text(min(value + 0.3, 69.5), row_y, f"{value:.1f}", va="center", ha="left", fontsize=9)

fig.tight_layout()
fig.savefig(OUT_BAR_PDF, bbox_inches="tight")
plt.close(fig)

print("Saved:")
for saved_path in (OUT_LEG_PDF, OUT_MAP_PDF, OUT_BAR_PDF):
    print(" -", saved_path)
print(df[["Region", "best_val_MGEH", "best_val_MAE", "best_val_RMSE", "best_val_R2"]])

Saved:
 - figures\spatial_cv_mgeh_legend.svg
 - figures\spatial_cv_mgeh_region_map.pdf
 - figures\spatial_cv_mgeh_region_bar.svg
                     Region  best_val_MGEH  best_val_MAE  best_val_RMSE  \
0                North East      55.935581   6212.567871    7944.389648   
1                North West      58.136616   8820.658203   12115.383789   
2  Yorkshire and The Humber      48.166660   6091.789551    8075.288574   
3             East Midlands      51.606220   6966.020996    9807.255859   
4             West Midlands      57.186520   8125.119141   10786.467773   
5           East of England      55.697933   7620.530762   10865.658203   
6                    London      61.599014   9559.274414   13509.865234   
7                South East      57.820438   8368.140625   11110.619141   
8                South West      61.694195   7290.414551    9104.548828   

   best_val_R2  
0     0.474198  
1     0.654137  
2     0.767783  
3     0.650520  
4     0.676905  
5     0.725120  
6

In [None]:
import numpy as np
# Scratch check: np.std uses the population form (ddof=0) by default,
# so [1, 2, 3] gives sqrt(2/3) ~= 0.8165.
np.std([1,2,3])

0.816496580927726

In [2]:
#!/usr/bin/env python3
"""
Summarise best TRAIN and VALIDATION metrics per spatial-CV region
and compute spatial-CV mean/std for DeepDemand.

Outputs
-------
1) logs/spatial_cv_region_best_metrics_test.csv
2) logs/spatial_cv_region_best_metrics_train.csv
3) logs/DeepDemand_spatial_cv_summary.csv
"""

import re
from pathlib import Path
import pandas as pd
import numpy as np

LOG_DIR = Path("logs")
LOG_TEMPLATE = "spatial_cv_{i}.log"
OUT_TEST = LOG_DIR / "spatial_cv_region_best_metrics_test.csv"
OUT_TRAIN = LOG_DIR / "spatial_cv_region_best_metrics_train.csv"
OUT_SUMMARY = LOG_DIR / "DeepDemand_spatial_cv_summary.csv"

MODEL_NAME = "DeepDemand"

# Log file number i -> ONS region code that log i is reported under.
REGION_CODE_BY_I = {
    1: "E12000001",
    2: "E12000002",
    3: "E12000003",
    4: "E12000004",
    5: "E12000005",
    6: "E12000006",
    7: "E12000007",
    8: "E12000008",
    9: "E12000009",
}

REGION_NAME_BY_CODE = {
    "E12000001": "North East",
    "E12000002": "North West",
    "E12000003": "Yorkshire and the Humber",
    "E12000004": "East Midlands",
    "E12000005": "West Midlands",
    "E12000006": "East of England",
    "E12000007": "London",
    "E12000008": "South East",
    "E12000009": "South West",
}

# Float pattern accepting 1.23, -4., .5, 1e-3, -2.1E+05. The earlier form
# \d*\.?\d+ required a digit after the dot, so a value written as "4."
# followed by "," made the whole eval line fail to match.
FLOAT = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

# Validation pattern
pattern_val = re.compile(
    rf"Epoch\s+(?P<epoch>\d+)\s+Step\s+(?P<step>\d+)\s+Validation\s+Eval:\s*"
    rf"RMSE:\s*(?P<rmse>{FLOAT}),\s*MAE:\s*(?P<mae>{FLOAT}),\s*MGEH:\s*(?P<mgeh>{FLOAT}),\s*R2:\s*(?P<r2>{FLOAT})",
    re.IGNORECASE,
)

# Training pattern (same layout, "Train Eval:" instead of "Validation Eval:")
pattern_train = re.compile(
    rf"Epoch\s+(?P<epoch>\d+)\s+Step\s+(?P<step>\d+)\s+Train\s+Eval:\s*"
    rf"RMSE:\s*(?P<rmse>{FLOAT}),\s*MAE:\s*(?P<mae>{FLOAT}),\s*MGEH:\s*(?P<mgeh>{FLOAT}),\s*R2:\s*(?P<r2>{FLOAT})",
    re.IGNORECASE,
)

def parse_records(text: str, pattern):
    """Turn every match of ``pattern`` in ``text`` into one DataFrame row.

    ``pattern`` is a compiled regex exposing the named groups ``epoch``,
    ``step``, ``rmse``, ``mae``, ``mgeh`` and ``r2``. Rows carry a 1-based
    ``eval_idx`` giving the order of appearance. With no matches the
    returned frame is empty.
    """
    int_fields = ("epoch", "step")
    float_fields = ("rmse", "mae", "mgeh", "r2")

    records = []
    for position, match in enumerate(pattern.finditer(text), start=1):
        fields = match.groupdict()
        record = {"eval_idx": position}
        for name in int_fields:
            record[name] = int(fields[name])
        for name in float_fields:
            record[name] = float(fields[name])
        records.append(record)
    return pd.DataFrame(records)

def best_min(df, col):
    """Return ``(value, eval_idx)`` for the smallest non-NaN entry of ``df[col]``.

    ``df`` must carry the columns ``col`` and ``"eval_idx"``. Returns
    ``(None, None)`` when ``df`` is empty or the column has no non-NaN value.
    """
    if df.empty:
        return None, None
    values = df[col].astype(float).dropna()
    if values.empty:
        # With only NaNs there is no row label for idxmin to return, and the
        # .loc lookup below would fail; report "no best" instead.
        return None, None
    i = values.idxmin()
    return float(values.loc[i]), int(df.loc[i, "eval_idx"])

def best_max(df, col):
    """Return ``(value, eval_idx)`` for the largest non-NaN entry of ``df[col]``.

    ``df`` must carry the columns ``col`` and ``"eval_idx"``. Returns
    ``(None, None)`` when ``df`` is empty or the column has no non-NaN value.
    """
    if df.empty:
        return None, None
    values = df[col].astype(float).dropna()
    if values.empty:
        # With only NaNs there is no row label for idxmax to return, and the
        # .loc lookup below would fail; report "no best" instead.
        return None, None
    i = values.idxmax()
    return float(values.loc[i]), int(df.loc[i, "eval_idx"])

METRIC_COLUMNS = ["RMSE", "MAE", "MGEH", "R2"]
SUMMARY_COLUMNS = ["region"] + METRIC_COLUMNS


def _best_metrics(region, records):
    """Best value of each metric over one fold's eval records.

    Each metric is optimised independently (min for RMSE/MAE/MGEH, max for
    R2), so the four values may come from different evals. Values are
    ``None`` when ``records`` has no usable rows.
    """
    return {
        "region": region,
        "RMSE": best_min(records, "rmse")[0],
        "MAE": best_min(records, "mae")[0],
        "MGEH": best_min(records, "mgeh")[0],
        "R2": best_max(records, "r2")[0],
    }


test_summaries = []
train_summaries = []

for i in range(1, 10):
    code = REGION_CODE_BY_I[i]
    region = REGION_NAME_BY_CODE.get(code, code)
    log_path = LOG_DIR / LOG_TEMPLATE.format(i=i)

    if not log_path.exists():
        print(f"[WARN] Missing log: {log_path}")
        continue

    text = log_path.read_text(encoding="utf-8", errors="ignore")

    # ---- TEST (validation fold) ----
    test_summaries.append(_best_metrics(region, parse_records(text, pattern_val)))

    # ---- TRAIN ----
    train_summaries.append(_best_metrics(region, parse_records(text, pattern_train)))

if not test_summaries:
    print("[WARN] No spatial-CV logs were parsed; summary statistics will be NaN.")

# Convert to DataFrames. Passing the column list keeps the schema (and the
# sort key "region") present even when no log could be parsed, instead of
# failing with KeyError on an empty frame.
df_test = pd.DataFrame(test_summaries, columns=SUMMARY_COLUMNS).sort_values("region").reset_index(drop=True)
df_train = pd.DataFrame(train_summaries, columns=SUMMARY_COLUMNS).sort_values("region").reset_index(drop=True)

# The CSVs are written next to the logs; make sure that directory exists.
LOG_DIR.mkdir(parents=True, exist_ok=True)
df_test.to_csv(OUT_TEST, index=False)
df_train.to_csv(OUT_TRAIN, index=False)

print(f"Saved: {OUT_TEST}")
print(f"Saved: {OUT_TRAIN}")

# --------------------------
# Spatial CV summary (mean ± std)
# --------------------------

summary_rows = []

for split_name, df in [("Train", df_train), ("Test", df_test)]:
    for metric in METRIC_COLUMNS:
        # A column made only of None (no eval lines for this split) is object
        # dtype; coerce so mean/std yield NaN rather than depending on
        # object-dtype reduction behaviour.
        values = pd.to_numeric(df[metric], errors="coerce")
        mean_val = values.mean()
        std_val  = values.std(ddof=1)  # sample std across the folds

        summary_rows.append({
            "model": MODEL_NAME,
            "split": split_name,
            "metric": metric,
            "mean": mean_val,
            "std": std_val,
            "mean_std": f"{mean_val:.4f} ± {std_val:.4f}"
        })

df_summary = pd.DataFrame(summary_rows)
df_summary.to_csv(OUT_SUMMARY, index=False)

print(f"Saved: {OUT_SUMMARY}")

Saved: logs\spatial_cv_region_best_metrics_test.csv
Saved: logs\spatial_cv_region_best_metrics_train.csv
Saved: logs\DeepDemand_spatial_cv_summary.csv
