In [1]:
import json
import re
from pathlib import Path
from collections import defaultdict

import pandas as pd
import numpy as np




In [2]:
def safe_load_json(path):
    try:
        with open(path, "r") as f:
            return json.load(f)
    except Exception:
        return None


PARAM_PATTERNS = {
    # Dataset params
    "a1": r"a1([0-9\.]+)",
    "a10": r"a10([0-9\.]+)",
    "rho": r"rho([0-9\.]+)",
    "gamma": r"gamma([0-9\.]+)",
    "noise": r"noise([0-9\.]+)",
    "n": r"n([0-9]+)",
    "d": r"d([0-9]+)",
    "L": r"L([0-9]+)",
    "w": r"w([0-9]+)",

    # Model params
    "dm": r"dm([0-9]+)",
    "h": r"h([0-9]+)",
    "ly": r"ly([0-9]+)",

    # Training
    "bs": r"bs([0-9]+)",
    "ep": r"ep([0-9]+)",
    "lr": r"lr([0-9\.e-]+)",

    # Experiment
    "nperm": r"nperm([0-9]+)",
    "taumax": r"taumax([0-9]+)",
}


def extract_model_name(run_dir: Path):
    """
    Assumes structure:
    runs/<dataset>/<model_dir>/...
    where model_dir starts with model name
    """
    for part in run_dir.parts:
        if part.startswith(("transformer", "lstm", "tcn")):
            return part.split("_")[0]
    return None



def extract_params_from_path(path: Path):
    params = {}
    text = "_".join(path.parts)

    for key, pattern in PARAM_PATTERNS.items():
        m = re.search(pattern, text)
        if m:
            params[key] = m.group(1)

    return params



In [21]:
# notebooks/utils_run_parsing.py
from pathlib import Path
import re
def tokenize(name: str):
    """Split dir name into atomic tokens."""
    return name.replace("/", "").split("_")

def get_dataset_params(run_dir: Path):
    """
    Extract dataset name and params from first-level folder.
    """
    dataset_dir = run_dir.parts[-4]  # runs/<DATASET>/...
    tokens = tokenize(dataset_dir)

    out = {
        "dataset": None,
        "a1": None,
        "rho": None,
        "gamma": None,
        "noise": None,
        "num_samples": None,
        "num_series": None,
        "seq_len": None,
    }

    # dataset name = first token
    out["dataset"] = tokens[0]

    for t in tokens[1:]:
        if t.startswith("a1"):
            out["a1"] = float(t[2:])
        elif t.startswith("rho"):
            out["rho"] = float(t[3:])
        elif t.startswith("gamma"):
            out["gamma"] = float(t[5:])
        elif t.startswith("noise"):
            out["noise"] = float(t[5:])
        elif t.startswith("n"):
            out["num_samples"] = int(t[1:])
        elif t.startswith("d"):
            out["num_series"] = int(t[1:])
        elif t.startswith("L"):
            out["seq_len"] = int(t[1:])

    return out


def get_model_params(run_dir: Path):
    """
    Extract model name + params from model-level dir.
    """
    model_dir = run_dir.parts[-3]
    tokens = tokenize(model_dir)

    out = {
        "model_name": tokens[0],
        "d_model": None,
        "hidden": None,
        "layers": None,
    }

    for t in tokens[1:]:
        if t.startswith("dm"):
            out["d_model"] = int(t[2:])
        elif t.startswith("h"):
            out["hidden"] = int(t[1:])
        elif t.startswith("ly"):
            out["layers"] = int(t[2:])

    return out


def get_training_params(run_dir: Path):
    train_dir = run_dir.parts[-2]
    tokens = tokenize(train_dir)

    out = {
        "batch_size": None,
        "epochs": None,
        "lr": None,
    }

    for t in tokens:
        if t.startswith("bs"):
            out["batch_size"] = int(t[2:])
        elif t.startswith("ep"):
            out["epochs"] = int(t[2:])
        elif t.startswith("lr"):
            out["lr"] = float(t[2:])

    return out


def get_pairwise_xai_params(run_dir: Path):
    exp_dir = run_dir.parts[-1]
    tokens = tokenize(exp_dir)

    out = {
        "interaction_method": None,
        "num_permutations": None,
        "tau_max": None,
        "baseline": None,
    }

    for t in tokens:
        if "interaction_method" in t:
            out["interaction_method"] = t.split(".")[-1]
        elif "num_permutations" in t:
            out["num_permutations"] = int(re.findall(r"\d+", t)[0])
        elif t.startswith("taumax"):
            out["tau_max"] = int(t.replace("taumax", ""))
        elif "baseline" in t:
            out["baseline"] = t.replace("baseline", "")

    return out





In [22]:
def get_dataset_name(run_dir: Path):
    try:
        return run_dir.parts[2]   # runs/<dataset>/...
    except Exception:
        return "unknown"


In [27]:
RUNS_DIR = Path("../runs")

records = []

for hist_path in RUNS_DIR.rglob("history.json"):
    run_dir = hist_path.parent

    history = safe_load_json(hist_path)
    if not history or not isinstance(history, list):
        continue

    final = history[-1]  # last epoch only

    dataset = get_dataset_name(run_dir)
    params = extract_params_from_path(run_dir)
    model_name = extract_model_name(run_dir)

    record = {
        "dataset": dataset,
        # "dataset_params": get_dataset_params(run_dir),
        "model_name": model_name,
        "runs_dir": run_dir,
        "model_params": get_model_params(run_dir),
        # "pointwise_xai": get_pointwise_xai_params(run_dir),
        "pairwise_xai": get_pairwise_xai_params(run_dir),
        "epoch": final.get("epoch"),
        "precision": final.get("precision"),
        "recall": final.get("recall"),
        "f1": final.get("f1"),
        "auroc": final.get("auroc"),
        "auprc": final.get("auprc"),
        "train_loss": final.get("train_loss"),
        "val_loss": final.get("val_loss"),
    }

    record.update(params)
    records.append(record)


df = pd.DataFrame(records)
df.columns



# Ensure numeric columns are numeric
NUM_COLS = [
    "precision", "recall", "f1",
    "auroc", "auprc", "train_loss", "val_loss"
]
for c in NUM_COLS:
    df[c] = pd.to_numeric(df[c], errors="coerce")

df.sort_values(by=["dataset", "auprc"], ascending=[True, False])
df.to_csv("classifier_metrics.tsv", index=False, sep="\t")


Index([], dtype='object')

KeyError: 'precision'

KeyError: 'dataset'

In [11]:
tables = {}

for dataset, ddf in df.groupby("dataset"):
    # print(dataset)
    # Put params first, metrics last
    metric_cols = NUM_COLS
    param_cols = [c for c in ddf.columns if c not in metric_cols + ["dataset", "run_path"]]

    ordered = ddf[param_cols + metric_cols + ["run_path"]]
    tables[dataset] = ordered.sort_values(by=metric_cols, ascending=False)

