Command:
```shell
lrzcpy /dss/dssfs02/lwp-dss-0001/pn76je/pn76je-dss-0000/vllm/logs-2026-01-13
```

In [None]:
import pandas as pd

In [None]:
import os
import re

# Root directory holding one sub-directory per benchmark run; each run's
# output is read from "<folder>/<run-dir>/0_out.logs".
folder = "/home/tomasruiz/code/vllm-scripts/slurm/results/logs-2026-01-13/"


def find_params(dirname: str) -> dict:
    """
    Build one result row for a benchmark run directory.

    The directory name has the format:
        "<somename>-(sd|nosd)-c<concurrency>-t<temp>[-tp<tensor-parallel-size>]"
    e.g.
        "vllm-throughput-nosd-c16-t0.0-tp4"

    The trailing "-tp<N>" segment is optional; when it is absent the
    returned "tp" value is None.

    Args:
        dirname: Name of a run directory located directly inside ``folder``.

    Returns:
        dict with the keys "directory", "sd" (bool), "concurrency" (int),
        "temp" (float) and "tp" (int or None), followed by every metric
        returned by ``parse_benchmark_results`` for the run's "0_out.logs".

    Raises:
        ValueError: If the name has fewer than three "-"-separated segments
            (besides the optional tp suffix) or a numeric segment cannot be
            converted.
        OSError: If "0_out.logs" cannot be opened.
    """
    parts = dirname.split("-")
    # Only the LAST segment may carry the tensor-parallel size. A substring
    # test on the whole name would also fire on an unrelated "tp" inside
    # <somename> and shift every field by one position.
    tp = parts.pop() if parts[-1].startswith("tp") else None
    *_, sd, concurrency, temp = parts

    with open(os.path.join(folder, dirname, "0_out.logs"), "r") as f:
        text = f.read()
    parsed = parse_benchmark_results(text)
    return {
        "directory": dirname,
        "sd": sd == "sd",
        "concurrency": int(concurrency.replace("c", "")),
        "temp": float(temp.replace("t", "")),
        # Names without a tp suffix used to crash here on None.replace(...).
        "tp": int(tp.replace("tp", "")) if tp is not None else None,
        **parsed,
    }


def parse_benchmark_results(text: str) -> dict:
    """
    Parse benchmark results from vLLM serving benchmark output.

    Each metric is looked up as a "<label>: <number>" line; the first
    occurrence in ``text`` wins. Both decimal ("12.34") and integer ("12")
    values are accepted.

    Args:
        text: Full text of the benchmark log.

    Returns:
        dict: Flat dictionary with parsed metrics. The keys are always
        present, in this order: "benchmark_duration_s", then
        "mean_*", "median_*" and "p99_*" for each of "ttft", "tpot" and
        "itl", and finally "total_token_throughput". A value is a float,
        or None when the metric does not appear in ``text``.
    """
    # (result key, label exactly as printed in the benchmark report)
    metric_labels = [("benchmark_duration_s", "Benchmark duration (s)")]
    for metric in ("TTFT", "TPOT", "ITL"):
        for stat in ("Mean", "Median", "P99"):
            metric_labels.append(
                (f"{stat.lower()}_{metric.lower()}", f"{stat} {metric} (ms)")
            )
    metric_labels.append(
        ("total_token_throughput", "Total token throughput (tok/s)")
    )

    results = {}
    for key, label in metric_labels:
        # The fractional part is optional so integer-formatted values are not
        # silently dropped.
        match = re.search(re.escape(label) + r":\s+(\d+(?:\.\d+)?)", text)
        # Every key is always emitted (None when absent) so downstream column
        # lookups never fail on a missing metric.
        results[key] = float(match.group(1)) if match else None

    return results


# Turn every entry of `folder` whose name contains "throughput" into one row
# of parsed directory-name parameters plus benchmark metrics.
dirs = os.listdir(folder)
throughput_dirs = [d for d in dirs if "throughput" in d]
rows = [find_params(d) for d in throughput_dirs]
# Wide format (one column per metric), restricted to the runs with tp == 1.
tp_wide = pd.DataFrame(rows).query("tp == 1")
tp_wide.head(2)

In [None]:
# Reshape to long format: one row per (run, metric) pair. With melt's default
# column names the metric name lands in "variable" and its number in "value".
# Swap the active `value_vars` line to look at medians or P99s instead of means.
tp_long = tp_wide.melt(
    id_vars=["directory", "sd", "concurrency", "temp"],
    value_vars=["mean_tpot", "mean_itl", "mean_ttft"],
    # value_vars=["median_tpot", "median_itl", "median_ttft"],
    # value_vars=["p99_tpot", "p99_itl", "p99_ttft"],
)
tp_long.head(2)

In [None]:
import seaborn as sns

sns.set_context("talk")

# One line plot per metric (facet column), comparing sd=True against sd=False
# over concurrency, using only the temp == 0.0 runs. The y-axis is not shared
# between facets (sharey=False).
fg = sns.relplot(
    tp_long.query("temp == 0.0"),
    x="concurrency",
    y="value",
    hue="sd",
    style="sd",
    col="variable",
    col_wrap=3,
    kind="line",
    marker="o",
    facet_kws={"sharey": False},
)

# Put the x ticks exactly on the concurrency values present in the data; this
# variable is also reused by the benchmark-duration plot.
concurrency_ticks = tp_long["concurrency"].unique()
for ax in fg.axes.flat:
    ax.grid(True, alpha=0.5)
    ax.set_ylim(0, None)  # Anchor the y-axis at zero, leave the top automatic
    ax.set_xscale("log")
    ax.set_xticks(concurrency_ticks, labels=concurrency_ticks)
    ax.minorticks_off()  # Hide minor ticks

fg.set_ylabels("Time (ms)")
fg.set_xlabels("Concurrency")
fg.set_titles("Metric: {col_name}")

# Set legend location above the plots and make it horizontal
fg.legend.set_title("Speculative Decoding")
sns.move_legend(fg, "upper center", ncol=2, bbox_to_anchor=(0.45, 1.2))
# fg.figure.set_size_inches(w=15, h=7)

In [None]:
# Total benchmark duration versus concurrency for the temp == 0.0 runs, one
# line per sd setting, drawn on log-log axes.
axs = sns.lineplot(
    tp_wide.query("temp == 0.0"),
    x="concurrency",
    y="benchmark_duration_s",
    hue="sd",
    style="sd",
    marker="o",
)
axs.grid(True, alpha=0.5)
axs.set_xscale("log")
axs.set_yscale("log")
axs.set_xlabel("Concurrency")
# Hand-picked y ticks, labelled with plain numbers on the log axis.
yticks = [10, 20, 50, 100, 200, 500]
axs.set_yticks(yticks, labels=yticks)
# `concurrency_ticks` is defined in the cell that draws the latency facets.
axs.set_xticks(concurrency_ticks, labels=concurrency_ticks)
axs.minorticks_off()  # Hide minor ticks
axs.set_title("Benchmark Duration")
axs.set_ylabel("Time (s)")
axs.legend(title="Speculative Decoding")

In [None]:
# Total token throughput per concurrency level for the temp == 0.0 runs, with
# one bar per sd setting.
axs = sns.barplot(
    tp_wide.query("temp == 0.0"),
    x="concurrency",
    y="total_token_throughput",
    hue="sd",
)
axs.set_title("Total Token Throughput")
axs.set_ylabel("Tokens/s")
axs.set_xlabel("Concurrency")
axs.legend(title="Speculative Decoding")
axs.grid(True, alpha=0.5, axis="y")  # Horizontal grid lines only

In [None]:
# Throughput of sd=True relative to sd=False at each concurrency level
# (temp == 0.0 runs only). After the pivot the columns are the boolean `sd`
# values, so True / False select the two throughput series.
tp_ratios = (
    tp_wide.query("temp == 0.0")
    .pivot(index="concurrency", columns="sd", values="total_token_throughput")
    .assign(ratio=lambda by_sd: by_sd[True] / by_sd[False])
)
tp_ratios

In [None]:
# Bar chart of the sd / no-sd throughput ratio per concurrency level.
# NOTE(review): "concurrency" is the index of `tp_ratios` (set by the pivot),
# not a column — confirm the installed seaborn resolves index levels by name,
# otherwise pass `tp_ratios.reset_index()`.
axs = sns.barplot(
    tp_ratios,
    x="concurrency",
    y="ratio",
)
axs.set_ylim(0, None)
# Reference line at ratio 1, where both settings have equal throughput.
axs.axhline(1, color="k", linestyle="--")
axs.set_xlabel("Concurrency")
axs.set_ylabel("Ratio")
axs.set_title("Total Token Throughput Ratio")
axs.grid(True, alpha=0.5, axis="y")