Rank and summarize the training performance of the MLP approach over a grid of hyperparameters.

In [1]:
from pathlib import Path
import yaml

import numpy as np
import pandas as pd

In [2]:
# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
# Sweep location, resolved relative to the current working directory.
ROOT = Path("..")
RUNS_DIR = ROOT / "multirun" / "2026-02-05" / "11-31-57"  # holds one sub-folder per run_id
RUN_IDS = range(252)

# Output column name -> key looked up in the parsed overrides.
# Every entry currently maps a key to itself.
HP_KEYS = {
    key: key
    for key in (
        "batch_size",
        "mlp.hidden_layers_size",
        "mlp.add_layer_norm",
        "trainer_module.optim_str",
        "trainer_module.learning_rate",
    )
}

# Locations of the per-run files, relative to each run folder.
METRICS_PATH = "lightning_logs/csv_logs/version_0/metrics.csv"
OVERRIDES_PATH = ".hydra/overrides.yaml"

# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
def parse_overrides_yaml(path):
    """
    Parse a Hydra overrides.yaml file into a flat dict.

    The file is expected to contain a YAML list of ``key=value`` strings.
    Leading ``+`` characters are stripped from each entry, entries without
    an ``=`` are skipped, and each value is parsed as YAML so that lists,
    booleans and numbers come back as native Python objects.

    Args:
        path: Path of the overrides file to read.

    Returns:
        dict mapping each override key to its parsed value. Empty when the
        file contains no overrides (including a completely empty file).
    """
    with open(path, "r", encoding="utf-8") as f:
        # safe_load returns None for an empty document; treat it as "no overrides"
        # instead of failing when iterating below.
        overrides = yaml.safe_load(f) or []

    params = {}
    for item in overrides:
        item = item.lstrip("+")  # remove leading +
        key, sep, raw_value = item.partition("=")
        if not sep:
            # no "=" in the entry, so there is no value to record
            continue
        try:
            params[key] = yaml.safe_load(raw_value)  # parse lists, bools, numbers
        except yaml.YAMLError:
            # The value is not valid YAML on its own; keep the raw text so one
            # odd override does not abort the whole analysis.
            params[key] = raw_value

    return params


def compute_val_overfit(metrics_df):
    """
    Summarize the "val" column of a metrics table (presumably the validation
    loss logged during training).

    NaN entries are dropped first. The minimum is located at its first
    occurrence, and the maximum is taken over the entries strictly after it.

    Returns:
        (min_val, max_val_after_min)
        - (nan, nan) when the column has no non-NaN entry
        - max_val_after_min equals min_val when the minimum is the last entry
    """
    recorded = metrics_df["val"].dropna().reset_index(drop=True)

    # Guard: nothing was logged in this column.
    if len(recorded) == 0:
        return np.nan, np.nan

    # After reset_index the label returned by idxmin is also the position.
    best_pos = recorded.idxmin()
    best_value = recorded.min()

    # Entries after the minimum (the minimum itself is excluded).
    tail = recorded.iloc[best_pos + 1 :]
    if tail.empty:
        return best_value, best_value

    return best_value, tail.max()


# ------------------------------------------------------------------
# Collect per-run results
# ------------------------------------------------------------------
rows = []

for run_id in RUN_IDS:
    run_dir = RUNS_DIR / str(run_id)
    metrics_file = run_dir / METRICS_PATH
    overrides_file = run_dir / OVERRIDES_PATH

    # A run missing either file is left out of the table.
    if not (metrics_file.exists() and overrides_file.exists()):
        continue

    # Metrics logged for this run.
    metrics = pd.read_csv(metrics_file)
    min_loss_epoch = metrics["loss_epoch"].min()
    min_val, max_val_after_min = compute_val_overfit(metrics)

    # Hyperparameters the run was launched with.
    overrides = parse_overrides_yaml(overrides_file)

    row = {
        "run_id": run_id,
        "min_loss_epoch": min_loss_epoch,
        "min_val": min_val,
        "max_val_after_min": max_val_after_min,
    }
    # A hyperparameter absent from the overrides is recorded as None.
    for out_key, in_key in HP_KEYS.items():
        row[out_key] = overrides.get(in_key)

    rows.append(row)

# One row per run that had both files.
df_runs = pd.DataFrame(rows)


In [3]:
# Runs ordered by ascending min_val, renumbered from 0.
ranking = df_runs.sort_values(by="min_val").reset_index(drop=True)

# Show the ten runs with the lowest min_val.
ranking.head(10)


Unnamed: 0,run_id,min_loss_epoch,min_val,max_val_after_min,batch_size,mlp.hidden_layers_size,mlp.add_layer_norm,trainer_module.optim_str,trainer_module.learning_rate
0,25,0.862226,0.896556,0.899537,16000,"[128, 256, 256, 256, 128, 64]",True,adamw,0.0001
1,176,0.862777,0.896615,0.898729,64000,"[128, 256, 256, 128, 64]",False,adamw,0.0005
2,172,0.859104,0.896648,0.902251,64000,"[128, 256, 256, 128, 64]",False,adam,0.001
3,156,0.848653,0.896667,0.909242,32000,"[256, 512, 512, 512, 256, 128]",False,adamw,0.0005
4,220,0.863081,0.896673,0.899595,64000,"[256, 512, 512, 512, 256, 128]",True,adam,0.0001
5,20,0.862362,0.896703,0.899813,16000,"[128, 256, 256, 256, 128, 64]",True,adam,0.0001
6,56,0.851493,0.89672,0.907851,16000,"[256, 512, 512, 256, 128]",False,adamw,0.0005
7,111,0.85771,0.89672,0.900969,32000,"[128, 256, 256, 256, 128, 64]",False,adam,0.0005
8,188,0.853609,0.896739,0.906832,64000,"[128, 256, 256, 256, 128, 64]",True,adamw,0.005
9,116,0.855557,0.896767,0.902814,32000,"[128, 256, 256, 256, 128, 64]",False,adamw,0.0005


In [5]:
# Lists are unhashable and cannot serve as group keys, so render them as text;
# any non-list value is kept unchanged.
df_runs["mlp.hidden_layers_size"] = df_runs["mlp.hidden_layers_size"].apply(
    lambda size: size if not isinstance(size, list) else str(size)
)

In [6]:
metrics_of_interest = ["min_loss_epoch", "min_val", "max_val_after_min"]

# For each hyperparameter: mean and variance of every metric per value,
# ordered by ascending mean of min_val.
summaries = {
    hp: (
        df_runs.groupby(hp)[metrics_of_interest]
        .agg(["mean", "var"])
        .sort_values(("min_val", "mean"))
    )
    for hp in HP_KEYS
}


In [7]:
# Mean and variance of each metric per batch size, ordered by ascending mean min_val.
summaries["batch_size"]

Unnamed: 0_level_0,min_loss_epoch,min_loss_epoch,min_val,min_val,max_val_after_min,max_val_after_min
Unnamed: 0_level_1,mean,var,mean,var,mean,var
batch_size,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
64000,0.857727,4.3e-05,0.898,7.047478e-07,0.904162,2.2e-05
128000,0.864845,5.9e-05,0.898377,3.301628e-06,0.90115,4e-06
32000,0.857112,0.000157,0.899117,5.045844e-05,0.906528,6.4e-05
16000,0.855912,0.000173,0.899397,5.007637e-05,0.911995,0.000675


In [8]:
# Mean and variance of each metric per hidden-layer configuration, ordered by ascending mean min_val.
summaries["mlp.hidden_layers_size"]

Unnamed: 0_level_0,min_loss_epoch,min_loss_epoch,min_val,min_val,max_val_after_min,max_val_after_min
Unnamed: 0_level_1,mean,var,mean,var,mean,var
mlp.hidden_layers_size,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
"[128, 256, 256, 128, 64]",0.860077,3.9e-05,0.898084,1.192503e-06,0.905193,0.000424
"[128, 256, 256, 256, 128, 64]",0.856181,4.4e-05,0.898093,8.032099e-07,0.90528,2.3e-05
"[256, 512, 512, 256, 128]",0.858287,0.000183,0.899458,6.694767e-05,0.906062,6.9e-05
"[256, 512, 512, 512, 256, 128]",0.854077,0.000229,0.899775,6.596856e-05,0.912905,0.000426


In [9]:
# Mean and variance of each metric per value of mlp.add_layer_norm, ordered by ascending mean min_val.
summaries["mlp.add_layer_norm"]

Unnamed: 0_level_0,min_loss_epoch,min_loss_epoch,min_val,min_val,max_val_after_min,max_val_after_min
Unnamed: 0_level_1,mean,var,mean,var,mean,var
mlp.add_layer_norm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
True,0.855893,3.2e-05,0.898058,5.228937e-07,0.904801,2.2e-05
False,0.858787,0.000218,0.899623,6.530922e-05,0.909873,0.000486


In [10]:
# Mean and variance of each metric per optimizer name, ordered by ascending mean min_val.
summaries["trainer_module.optim_str"]

Unnamed: 0_level_0,min_loss_epoch,min_loss_epoch,min_val,min_val,max_val_after_min,max_val_after_min
Unnamed: 0_level_1,mean,var,mean,var,mean,var
trainer_module.optim_str,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
adam,0.856586,5.8e-05,0.89814,1e-06,0.907867,0.000428
adamw,0.858014,0.00019,0.899502,6.4e-05,0.906637,7.4e-05


In [11]:
# Mean and variance of each metric per learning rate, ordered by ascending mean min_val.
summaries["trainer_module.learning_rate"]

Unnamed: 0_level_0,min_loss_epoch,min_loss_epoch,min_val,min_val,max_val_after_min,max_val_after_min
Unnamed: 0_level_1,mean,var,mean,var,mean,var
trainer_module.learning_rate,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0.0001,0.864707,2e-05,0.897465,4.635184e-07,0.89919,1e-06
0.0005,0.85737,3.3e-05,0.897621,9.870676e-07,0.903323,1.4e-05
0.001,0.852087,3.4e-05,0.897868,4.523603e-07,0.907587,3e-05
0.005,0.850889,1.6e-05,0.898817,4.40677e-07,0.910051,1.8e-05
0.01,0.861269,0.000386,0.902359,0.0001463127,0.916373,0.00105
