In [None]:
from pathlib import Path
import json
import re
import pandas as pd


# The evaluation tarball (received by email) was unpacked into this directory.
root = Path("../tmp/eval-backups-02-v2")
assert root.exists()

# Map participant name (directory name with "eval02-" removed) -> submission directory.
submissions = {}
for entry in root.glob("eval02-*"):
    if entry.is_dir():
        submissions[entry.name.replace("eval02-", "")] = entry
submissions


In [None]:
from transformers import AutoTokenizer
from functools import lru_cache
from tqdm.auto import tqdm

# Hugging Face model id; it is the default `model_name` of get_tokenizer() below.
MODEL_ID = "CohereLabs/aya-expanse-8b"

# Model names (as parsed from the stats file names) that are skipped when
# collecting statistics.
EXCLUSIONS = {
    "aya-expanse-8b-bnb-4bit", # dupe: same as aya-expanse-8b-bnb-4bit-fp4
}

@lru_cache(maxsize=8)
def get_tokenizer(model_name=MODEL_ID):
    """Load the tokenizer for ``model_name`` via ``AutoTokenizer.from_pretrained``.

    The result is memoized with ``lru_cache`` (at most 8 entries), so repeated
    calls with the same arguments reuse the tokenizer that was already loaded.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return tokenizer

def get_output_token_count(stat_file):
    """Return the number of tokens in the output file that belongs to ``stat_file``.

    ``stat_file`` is a ``pathlib.Path`` to a ``<output>.stats.json`` file. If the
    JSON already has an ``"output_token_count"`` entry, that value is returned
    as-is. Otherwise the sibling ``<output>`` file is read, every non-blank line
    is stripped and tokenized with ``get_tokenizer()``, the per-line token counts
    are summed, and the total is written back into the stats file so later calls
    can skip the tokenization.

    Raises:
        AssertionError: if ``stat_file`` is not named ``*.stats.json`` or the
            matching output file does not exist.
    """
    # Read explicitly as UTF-8 instead of relying on the locale's default encoding.
    stats = json.loads(stat_file.read_text(encoding="utf-8"))

    if "output_token_count" in stats:
        return stats["output_token_count"]

    # Strip the suffix only at the end of the name. Without this guard a file
    # that does not end in ".stats.json" would be treated as its own output
    # file, tokenized, and then overwritten below.
    suffix = ".stats.json"
    assert stat_file.name.endswith(suffix), f"not a stats file {stat_file}"
    out_file = stat_file.with_name(stat_file.name[:-len(suffix)])
    assert out_file.exists(), f"no out file {out_file}"
    tokenizer = get_tokenizer()

    # The output holds multilingual text, so decode it as UTF-8 explicitly.
    # The list is materialized up front so that tqdm knows the total.
    with out_file.open(encoding="utf-8") as fh:
        texts = [line.strip() for line in fh if line.strip()]

    tok_count = 0
    for text in tqdm(texts, desc=f"tokenizing {out_file.stem}", unit=" lines"):
        # For a single string the tokenizer returns a flat list of ids, so its
        # length is the token count; no tensor conversion is needed for counting.
        tok_count += len(tokenizer(text)["input_ids"])

    # Cache the result next to the other statistics.
    stats["output_token_count"] = tok_count
    stat_file.write_text(json.dumps(stats, indent=2), encoding="utf-8")

    return tok_count

# Stat file stems look like
#   wmt25.ces-deu.deu.aya-expanse-8b-bnb-4bit-fp4.out.batch256.run1.stats
# i.e. (testname).(src)-(tgt).(tgt).(model).out.batch(batch).run(run).stats
STAT_NAME_RE = re.compile(
    r"(?P<testname>[^.]+)\.(?P<src>[^-]+)-(?P<tgt>[^.]+)\.(?P=tgt)\.(?P<model>.+)"
    r"\.out\.batch(?P<batch>\d+)\.run(?P<run>\d+)\.stats"
)


def _mean_warmup_time(sub_dir, model):
    """Return the mean ``wall_time_sec`` over the warmup runs of ``model`` in ``sub_dir``.

    Warmup tests were run for ces-deu and batch size 1 only, so only those stats
    files are looked up. Prints a warning when fewer than 3 warmup runs are found
    and raises ``Exception`` when there are none.
    """
    warmup_glob = f"ces-deu/warmup.ces-deu.deu.{model}.out.batch1.run*.stats.json"
    warmup_files = list(sub_dir.glob(warmup_glob))
    if not warmup_files:
        raise Exception(f"no warmup stats found {sub_dir}/{warmup_glob}")
    warmups = [
        json.loads(wf.read_text(encoding="utf-8"))["wall_time_sec"]
        for wf in warmup_files
    ]
    if len(warmups) < 3:
        print(f"Warning: only {len(warmups)} warmup stats found {warmup_files}")
    return sum(warmups) / len(warmups)


all_stats = []
# (participant, model) -> mean warmup time. The warmup time does not depend on
# the language pair, batch size or run, so it is computed once per model instead
# of re-reading the warmup files for every stat file.
warmup_times = {}
for sub_name, sub_dir in submissions.items():
    stat_files = list(sub_dir.glob("*/*.stats.json"))
    assert stat_files, f"no stat files in {sub_dir}"
    for stat_file in stat_files:
        name = stat_file.stem
        # fullmatch: the whole stem must follow the naming scheme, so stray
        # files with extra trailing components are not silently mis-parsed.
        m = STAT_NAME_RE.fullmatch(name)
        assert m, f"cannot parse {name}"
        d = m.groupdict()
        if d['model'] in EXCLUSIONS:
            print(f"Skipping excluded model {d['model']}")
            continue
        if d['testname'] != "wmt25":
            print(f"Skipping testname {d['testname']}")
            continue

        d['batch'] = int(d['batch'])
        d['run'] = int(d['run'])
        try:
            stats = json.loads(stat_file.read_text(encoding="utf-8"))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {stat_file}: {e}\nSkipping this file.")
            continue

        d["time"] = stats["wall_time_sec"]
        d["participant"] = sub_name

        # Run the cheap validations before the (potentially slow) tokenization.
        # The reported wall time must agree with the recorded timestamps.
        t2 = stats['end_timestamp'] - stats['start_timestamp']
        if abs(t2 - d["time"]) > 1e-3:
            raise Exception(f"time mismatch {d['time']} vs {t2}")

        warmup_key = (sub_name, d['model'])
        if warmup_key not in warmup_times:
            warmup_times[warmup_key] = _mean_warmup_time(sub_dir, d['model'])

        # Keys are added in a fixed order; it determines the DataFrame column order.
        d["output_token_count"] = get_output_token_count(stat_file)
        d["warmup_time"] = warmup_times[warmup_key]

        d['output_rate_all'] = d['output_token_count'] / d['time']
        net_time = d['time'] - d['warmup_time']
        if net_time > 0:
            d['output_rate_excl_warmup'] = d['output_token_count'] / net_time
        else:
            # A run that is not longer than the mean warmup has no meaningful
            # net rate; record NaN instead of dividing by zero or reporting a
            # negative throughput.
            print(f"Warning: time {d['time']} <= warmup_time {d['warmup_time']} for {name}; "
                  "setting output_rate_excl_warmup to NaN")
            d['output_rate_excl_warmup'] = float("nan")

        all_stats.append(d)

df = pd.DataFrame(all_stats)
df

In [None]:
# ! pip install matplotlib seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="whitegrid")

# Keep only the rows whose source is "ces" and whose target is "deu".
is_ces_deu = (df["src"] == "ces") & (df["tgt"] == "deu")
df_filtered = df[is_ces_deu]

def plot(df_mean, name="Output Rate", figsize=(9,5)):
    """Plot mean output rate against batch size, one figure per ``testname``.

    Args:
        df_mean: DataFrame with the columns ``testname``, ``participant``,
            ``model``, ``batch``, ``output_rate_mean`` and ``output_rate_std``.
        name: Human-readable metric name; used for the y-axis label and, in
            slug form, for the PDF file name.
        figsize: Figure size passed to matplotlib.

    Each figure draws one error-bar line (mean ± std) per (participant, model)
    on a log-scaled x axis with ticks at 2^0 .. 2^10, is saved as a PDF in the
    current working directory, shown, and then closed.

    With a single ``testname`` the file is ``<slug>.pdf``. With several test
    names the test name is appended (``<slug>-<testname>.pdf``) so that the
    figures do not overwrite each other.
    """
    # Build the file-name slug: lower-case, " _()" mapped to "-", and every run
    # of dashes collapsed to one (dashes at either end are dropped).
    name_simpl = name.lower().strip()
    for char in " _()":
        name_simpl = name_simpl.replace(char, "-")
    name_simpl = "-".join(part for part in name_simpl.split("-") if part)

    groups = list(df_mean.groupby("testname"))
    multiple_tests = len(groups) > 1

    for testname, df_t in groups:
        fig, ax = plt.subplots(figsize=figsize)
        for (participant, model), df_m in df_t.groupby(["participant", "model"]):
            df_m_sorted = df_m.sort_values("batch")
            ax.errorbar(
                df_m_sorted["batch"],
                df_m_sorted["output_rate_mean"],
                yerr=df_m_sorted["output_rate_std"],
                marker='o',
                capsize=3,
                linestyle='-',
                label=f"{participant}:{model}",
            )
        ax.set_title(f"{testname} - Output rate vs batch size (mean ± std)")
        ax.set_xlabel("Batch size")
        ax.set_ylabel(f"{name} (tokens/s)")
        ax.set_xscale("log")
        ax.legend(fontsize="small", bbox_to_anchor=(1.02, 1), loc="upper left")
        ax.grid(alpha=0.3)
        # x-axis ticks as 2^n
        ticks = [2**i for i in range(0, 11)]
        ax.set_xticks(ticks)
        ax.set_xticklabels([str(t) for t in ticks])
        ax.set_xlim(min(ticks), max(ticks))

        fig.tight_layout()

        # Save as PDF; disambiguate by test name only when it is needed, so the
        # single-test file name stays unchanged.
        if multiple_tests:
            pdf_path = f"{name_simpl}-{testname}.pdf"
        else:
            pdf_path = f"{name_simpl}.pdf"
        fig.savefig(pdf_path)
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


# Aggregate every configuration over its runs (mean, std and number of runs),
# then draw one plot per metric.
group_cols = ['testname', 'src', 'tgt', 'model', 'batch', 'participant']
sort_cols = ["testname", "src", "tgt", "batch", "participant", "model"]
metric_titles = {
    "output_rate_all": "Output Rate",
    "output_rate_excl_warmup": "Output Rate Excluding Warmup",
}
for field, name in metric_titles.items():
    # Named aggregations: output column -> (input column, reducer).
    aggregations = {
        "output_rate_mean": (field, 'mean'),
        "output_rate_std": (field, 'std'),
        "output_rate_n_runs": (field, 'size'),
    }
    grouped = df_filtered.groupby(group_cols, as_index=False)
    df_mean = grouped.agg(**aggregations)
    df_mean = df_mean.sort_values(sort_cols).reset_index(drop=True)

    plot(df_mean, name=name)