In [1]:
#!/usr/bin/env python3
"""
Summarise *best* validation metrics per standard 5-fold CV from logs.

Assumptions
-----------
- You have 5 log files:
    logs/cv_0.log ... logs/cv_4.log
- Validation lines look like:
    Epoch X Step Y Validation Eval: RMSE: a, MAE: b, MGEH: c, R2: d (optional stage/lr)

Output
------
- Prints:
  (1) per-fold best metrics:
      best_val_RMSE (min), best_val_MAE (min), best_val_MGEH (min), best_val_R2 (max)
      and the eval index where each best occurs
  (2) mean±sd of those best metrics across folds
- Saves CSV: logs/cv_fold_best_metrics.csv
"""

import re
from pathlib import Path
import numpy as np
import pandas as pd

LOG_DIR = Path("logs")
LOG_TEMPLATE = "cv_{k}.log"   # k = 0..4
OUT_CSV = LOG_DIR / "cv_fold_best_metrics.csv"

# Float literal pattern: 1.23, -4., .5, 1e-3, -2.1E+05
# The mantissa is either "digits with an optional fractional part"
# (4, 4., 4.25) or "a bare fraction" (.5). The previous form
# (\d*\.?\d+) required a digit after the dot, so a value written as "4."
# made the whole validation line fail to match.
FLOAT = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

# One validation-eval log line. Named groups: epoch, step, rmse, mae, mgeh, r2,
# plus the optional trailing "(stage a/b, lr=x)" block (stage_cur, stage_total, lr).
pattern_val = re.compile(
    rf"Epoch\s+(?P<epoch>\d+)\s+Step\s+(?P<step>\d+)\s+Validation\s+Eval:\s*"
    rf"RMSE:\s*(?P<rmse>{FLOAT}),\s*MAE:\s*(?P<mae>{FLOAT}),\s*MGEH:\s*(?P<mgeh>{FLOAT}),\s*R2:\s*(?P<r2>{FLOAT})"
    rf"(?:\s*\(stage\s+(?P<stage_cur>\d+)\s*/\s*(?P<stage_total>\d+)\s*,\s*lr\s*=\s*(?P<lr>{FLOAT})\s*\))?",
    re.IGNORECASE,
)

def parse_val_records(text: str) -> pd.DataFrame:
    """Collect every validation-eval line of *text* into a DataFrame.

    Each match of ``pattern_val`` becomes one row with the columns
    ``eval_idx``, ``epoch``, ``step``, ``rmse``, ``mae``, ``mgeh``, ``r2``,
    ``stage`` and ``lr``. ``eval_idx`` is a 1-based counter over the matches
    in order of appearance. ``stage`` and ``lr`` are ``None`` when the
    optional ``(stage a/b, lr=x)`` suffix is absent.

    Returns an empty DataFrame when nothing matches.
    """

    def _row(position, match):
        fields = match.groupdict()
        stage_raw = fields["stage_cur"]
        lr_raw = fields["lr"]
        return {
            "eval_idx": position,  # sequential validation-eval counter
            "epoch": int(fields["epoch"]),
            "step": int(fields["step"]),
            "rmse": float(fields["rmse"]),
            "mae": float(fields["mae"]),
            "mgeh": float(fields["mgeh"]),
            "r2": float(fields["r2"]),
            "stage": None if stage_raw is None else int(stage_raw),
            "lr": None if lr_raw is None else float(lr_raw),
        }

    matches = enumerate(pattern_val.finditer(text), start=1)
    return pd.DataFrame([_row(position, match) for position, match in matches])

def best_min(df: pd.DataFrame, col: str):
    """Return ``(smallest value of col, eval_idx of that row)``.

    The column is compared as float. Returns ``(None, None)`` for an empty
    frame.
    """
    if df.empty:
        return None, None
    as_float = df[col].astype(float)
    winner = as_float.idxmin()  # index label of the first minimal entry
    value = float(df.at[winner, col])
    where = int(df.at[winner, "eval_idx"])
    return value, where

def best_max(df: pd.DataFrame, col: str):
    """Return ``(largest value of col, eval_idx of that row)``.

    The column is compared as float. Returns ``(None, None)`` for an empty
    frame.
    """
    if df.empty:
        return None, None
    as_float = df[col].astype(float)
    winner = as_float.idxmax()  # index label of the first maximal entry
    value = float(df.at[winner, col])
    where = int(df.at[winner, "eval_idx"])
    return value, where

def mean_sd(vals):
    """Return ``(mean, sample sd, n)`` over the usable entries of *vals*.

    Entries that are ``None`` or non-finite are discarded first. With no
    usable entries the result is ``(nan, nan, 0)``; with exactly one, the
    standard deviation is reported as ``0.0``. Otherwise the sd uses
    ``ddof=1``.
    """
    usable = np.asarray(
        [v for v in vals if v is not None and np.isfinite(v)],
        dtype=float,
    )
    count = int(usable.size)
    if count == 0:
        return np.nan, np.nan, 0
    spread = usable.std(ddof=1) if count > 1 else 0.0
    return float(usable.mean()), float(spread), count

def _val_fold_row(fold, rmse=(None, None), mae=(None, None),
                  mgeh=(None, None), r2=(None, None), n_evals=0):
    """Build one per-fold summary row.

    Each metric argument is a ``(best value, eval index)`` pair; the defaults
    produce the placeholder row used when a fold has no usable log data.
    """
    return {
        "fold": fold,
        "best_val_RMSE": rmse[0], "best_val_RMSE_eval": rmse[1],
        "best_val_MAE": mae[0],   "best_val_MAE_eval": mae[1],
        "best_val_MGEH": mgeh[0], "best_val_MGEH_eval": mgeh[1],
        "best_val_R2": r2[0],     "best_val_R2_eval": r2[1],
        "n_val_evals": n_evals,
    }


# One summary row per fold; folds with a missing or unparseable log get a
# placeholder row so the table always lists every fold.
summaries = []
for k in range(5):
    log_path = LOG_DIR / LOG_TEMPLATE.format(k=k)

    if log_path.exists():
        text = log_path.read_text(encoding="utf-8", errors="ignore")
        dfv = parse_val_records(text)
        if dfv.empty:
            print(f"[WARN] No Validation Eval lines found in {log_path}")
            row = _val_fold_row(k)
        else:
            # Error metrics: lower is better. R2: higher is better.
            row = _val_fold_row(
                k,
                rmse=best_min(dfv, "rmse"),
                mae=best_min(dfv, "mae"),
                mgeh=best_min(dfv, "mgeh"),
                r2=best_max(dfv, "r2"),
                n_evals=int(dfv.shape[0]),
            )
    else:
        print(f"[WARN] Missing log: {log_path}")
        row = _val_fold_row(k)

    summaries.append(row)

per_fold = pd.DataFrame(summaries)
per_fold = per_fold.sort_values("fold").reset_index(drop=True)

# Across-fold statistics computed from the per-fold best values.
rmse_mu, rmse_sd, n_rmse = mean_sd(per_fold["best_val_RMSE"].tolist())
mae_mu,  mae_sd,  n_mae  = mean_sd(per_fold["best_val_MAE"].tolist())
mgeh_mu, mgeh_sd, n_mgeh = mean_sd(per_fold["best_val_MGEH"].tolist())
r2_mu,   r2_sd,   n_r2   = mean_sd(per_fold["best_val_R2"].tolist())

summary_record = {
    "fold": "MEAN±SD",
    "best_val_RMSE": f"{rmse_mu:.6g} ± {rmse_sd:.6g}",
    "best_val_MAE":  f"{mae_mu:.6g} ± {mae_sd:.6g}",
    "best_val_MGEH": f"{mgeh_mu:.6g} ± {mgeh_sd:.6g}",
    "best_val_R2":   f"{r2_mu:.6g} ± {r2_sd:.6g}",
    # A fold counts only if it contributed to every metric.
    "n_folds_used": min(n_rmse, n_mae, n_mgeh, n_r2),
}
summary = pd.DataFrame([summary_record])

# Show both tables without pandas truncating the columns.
with pd.option_context("display.max_columns", 200, "display.width", 220):
    print(per_fold)
    print("\nAcross-fold mean±sd of per-fold best validation metrics:")
    print(summary)

# Persist only the per-fold table (the summary row is not written).
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
per_fold.to_csv(OUT_CSV, index=False)
print(f"\nSaved: {OUT_CSV}")

   fold  best_val_RMSE  best_val_RMSE_eval  best_val_MAE  best_val_MAE_eval  best_val_MGEH  best_val_MGEH_eval  best_val_R2  best_val_R2_eval  n_val_evals
0     0   10164.191406                  28   7410.648438                 28      55.295536                  28     0.703502                25           48
1     1    9635.133789                  34   7167.251953                 63      52.688358                  58     0.748526                34           78
2     2   10259.435547                  14   7615.572266                 14      55.455898                  36     0.701388                 6           56
3     3   10127.718750                  32   7483.710938                 26      54.263935                  26     0.711685                45           46
4     4    9810.294922                  22   7354.085938                 22      53.826347                  31     0.722578                25           51

Across-fold mean±sd of per-fold best validation metrics:
      fold  

In [3]:
#!/usr/bin/env python3
"""
Summarise *best* TRAIN metrics per standard 5-fold CV from logs,
and report mean ± SD across folds (based on the per-fold best values).

Assumptions
-----------
- Logs: logs/cv_0.log ... logs/cv_4.log
- Train lines look like:
    Epoch X Step Y Train Eval: RMSE: a, MAE: b, MGEH: c, R2: d

Output
------
- Table: one row per fold with best Train metrics + eval_idx where best occurs
- Summary: mean ± sd of best metrics across folds (RMSE/MAE/MGEH: lower is better; R2: higher is better)
- CSV: logs/cv_fold_best_train_metrics.csv
"""

import re
from pathlib import Path
import pandas as pd
import numpy as np

# -------------------------
# Config
# -------------------------
LOG_DIR = Path("logs")
LOG_TEMPLATE = "cv_{k}.log"  # k=0..4
OUT_CSV = LOG_DIR / "cv_fold_best_train_metrics.csv"

# Float literal pattern: 1.23, -4., .5, 1e-3, -2.1E+05
# The mantissa is either "digits with an optional fractional part"
# (4, 4., 4.25) or "a bare fraction" (.5). The previous form
# (\d*\.?\d+) required a digit after the dot, so a value written as "4."
# made the whole train line fail to match.
FLOAT = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

# One train-eval log line. Named groups: epoch, step, rmse, mae, mgeh, r2.
pattern_train = re.compile(
    rf"Epoch\s+(?P<epoch>\d+)\s+Step\s+(?P<step>\d+)\s+Train\s+Eval:\s*"
    rf"RMSE:\s*(?P<rmse>{FLOAT}),\s*MAE:\s*(?P<mae>{FLOAT}),\s*MGEH:\s*(?P<mgeh>{FLOAT}),\s*R2:\s*(?P<r2>{FLOAT})",
    re.IGNORECASE,
)

def parse_train_records(text: str) -> pd.DataFrame:
    """Collect every train-eval line of *text* into a DataFrame.

    Each match of ``pattern_train`` becomes one row with the columns
    ``eval_idx``, ``epoch``, ``step``, ``rmse``, ``mae``, ``mgeh`` and ``r2``.
    ``eval_idx`` is a 1-based counter over the matches in order of
    appearance. Returns an empty DataFrame when nothing matches.
    """
    records = []
    for position, match in enumerate(pattern_train.finditer(text), start=1):
        fields = match.groupdict()
        records.append({
            "eval_idx": position,
            "epoch": int(fields["epoch"]),
            "step": int(fields["step"]),
            "rmse": float(fields["rmse"]),
            "mae": float(fields["mae"]),
            "mgeh": float(fields["mgeh"]),
            "r2": float(fields["r2"]),
        })
    return pd.DataFrame(records)

def best_min(df: pd.DataFrame, col: str):
    """Return the minimum of *col* (as float) and the ``eval_idx`` of its row.

    An empty frame yields ``(None, None)``.
    """
    if not df.empty:
        label = df[col].astype(float).idxmin()  # first row holding the minimum
        return float(df.at[label, col]), int(df.at[label, "eval_idx"])
    return None, None

def best_max(df: pd.DataFrame, col: str):
    """Return the maximum of *col* (as float) and the ``eval_idx`` of its row.

    An empty frame yields ``(None, None)``.
    """
    if not df.empty:
        label = df[col].astype(float).idxmax()  # first row holding the maximum
        return float(df.at[label, col]), int(df.at[label, "eval_idx"])
    return None, None

def mean_sd(series: pd.Series):
    """Return ``(mean, sample sd)`` of the non-missing values in *series*.

    Missing entries are dropped and the rest cast to float. With nothing
    left the result is ``(None, None)``; with a single value the sd is
    reported as ``0.0``. Otherwise the sd uses ``ddof=1``.
    """
    present = series.dropna().astype(float)
    count = len(present)
    if count == 0:
        return None, None
    if count > 1:
        return float(present.mean()), float(present.std(ddof=1))
    return float(present.iloc[0]), 0.0

# -------------------------
# Summarise folds
# -------------------------
def _train_fold_row(fold, rmse=(None, None), mae=(None, None),
                    mgeh=(None, None), r2=(None, None), n_evals=0):
    """Build one per-fold summary row.

    Each metric argument is a ``(best value, eval index)`` pair; the defaults
    produce the placeholder row used when a fold has no usable log data.
    """
    return dict(
        fold=fold,
        best_train_RMSE=rmse[0], best_train_RMSE_eval=rmse[1],
        best_train_MAE=mae[0],   best_train_MAE_eval=mae[1],
        best_train_MGEH=mgeh[0], best_train_MGEH_eval=mgeh[1],
        best_train_R2=r2[0],     best_train_R2_eval=r2[1],
        n_train_evals=n_evals,
    )


def _mean_pm_sd(mean, sd):
    """Format ``mean ± sd`` with 6 significant digits, or None when no mean exists."""
    if mean is None:
        return None
    return f"{mean:.6g} ± {sd:.6g}"


# One row per fold; folds with a missing or unparseable log get a
# placeholder row so the table always lists every fold.
rows = []
for k in range(5):
    p = LOG_DIR / LOG_TEMPLATE.format(k=k)

    if p.exists():
        txt = p.read_text(encoding="utf-8", errors="ignore")
        dft = parse_train_records(txt)
        if dft.empty:
            print(f"[WARN] No Train Eval lines found in: {p}")
            fold_row = _train_fold_row(k)
        else:
            # Error metrics: lower is better. R2: higher is better.
            fold_row = _train_fold_row(
                k,
                rmse=best_min(dft, "rmse"),
                mae=best_min(dft, "mae"),
                mgeh=best_min(dft, "mgeh"),
                r2=best_max(dft, "r2"),
                n_evals=int(dft.shape[0]),
            )
    else:
        print(f"[WARN] Missing log: {p}")
        fold_row = _train_fold_row(k)

    rows.append(fold_row)

out = pd.DataFrame(rows)
out = out.sort_values("fold").reset_index(drop=True)

# -------------------------
# Mean ± SD across folds (based on per-fold best values)
# -------------------------
rmse_mean, rmse_sd = mean_sd(out["best_train_RMSE"])
mae_mean,  mae_sd  = mean_sd(out["best_train_MAE"])
mgeh_mean, mgeh_sd = mean_sd(out["best_train_MGEH"])
r2_mean,   r2_sd   = mean_sd(out["best_train_R2"])

# Extra table row: formatted mean ± sd per metric, no eval indices, and the
# total number of train evals summed over all folds.
summary_row = dict(
    fold="MEAN±SD",
    best_train_RMSE=_mean_pm_sd(rmse_mean, rmse_sd),
    best_train_MAE=_mean_pm_sd(mae_mean, mae_sd),
    best_train_MGEH=_mean_pm_sd(mgeh_mean, mgeh_sd),
    best_train_R2=_mean_pm_sd(r2_mean, r2_sd),
    best_train_RMSE_eval=None,
    best_train_MAE_eval=None,
    best_train_MGEH_eval=None,
    best_train_R2_eval=None,
    n_train_evals=int(out["n_train_evals"].sum()),
)

out_with_summary = pd.concat([out, pd.DataFrame([summary_row])], ignore_index=True)

# Show the table without pandas truncating the columns.
with pd.option_context("display.max_columns", 200, "display.width", 220):
    print(out_with_summary)

# The CSV contains the per-fold rows followed by the summary row.
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
out_with_summary.to_csv(OUT_CSV, index=False)
print(f"\nSaved: {OUT_CSV}")

print("\nMean ± SD of per-fold best metrics:")
print("  RMSE: NA" if rmse_mean is None else f"  RMSE: {rmse_mean:.6g} ± {rmse_sd:.6g}")
print("  MAE : NA" if mae_mean is None else f"  MAE : {mae_mean:.6g} ± {mae_sd:.6g}")
print("  MGEH: NA" if mgeh_mean is None else f"  MGEH: {mgeh_mean:.6g} ± {mgeh_sd:.6g}")
print("  R2  : NA" if r2_mean is None else f"  R2  : {r2_mean:.6g} ± {r2_sd:.6g}")

      fold    best_train_RMSE best_train_RMSE_eval     best_train_MAE best_train_MAE_eval     best_train_MGEH best_train_MGEH_eval          best_train_R2 best_train_R2_eval  n_train_evals
0        0        9402.484375                   39        6887.747559                  39           50.377548                   39               0.744142                 11             48
1        1        9432.814453                   39        6884.439941                  39           49.904266                   39               0.734815                 39             78
2        2        9465.344727                   13        6723.525879                  40            49.33709                   23               0.740983                 22             56
3        3        9567.884766                   12        6874.165527                  34           50.597298                   34               0.751773                 12             46
4        4        9605.004883                   36        68