Command:
```shell
lrzcpy /dss/dssfs02/lwp-dss-0001/pn76je/pn76je-dss-0000/vllm/logs-2026-01-20
```

In [None]:
import pandas as pd
import seaborn as sns

# Select seaborn's "talk" plotting context for the figures drawn below.
sns.set_context("talk")

In [None]:
import os
import re
import ast

# Directory that holds one sub-directory per benchmark run; find_params()
# reads "<folder>/<run-directory>/0_out.logs" from here.
# NOTE(review): the copy command at the top of this notebook references
# "logs-2026-01-20" while this path ends in "logs-2026-01-21" — confirm which
# date is intended.
folder = "/home/tomasruiz/code/vllm-scripts/slurm/results/logs-2026-01-21/"


def find_params(dirname: str) -> list:
    """
    Parse one benchmark run directory into a list of result rows.

    directory name has the format:
        "<somename>-(sd|nosd)-t(<temp>)[-tp(<tensor-parallel-size>)]"
    e.g.
        "vllm-throughput-nosd-t0.0-tp4"
        "vllm-throughput-nosd-t0.0"   (no "-tp<N>" suffix, so tp is None)

    The run's log file "<folder>/<dirname>/0_out.logs" is read and handed to
    parse_benchmark_results(); the temperature and tensor-parallel size taken
    from the directory name are attached to every parsed result.

    Args:
        dirname: Name of the run directory inside ``folder``.

    Returns:
        list: List of dictionaries, one per benchmark result

    Raises:
        ValueError: If the temperature component cannot be converted to a
            float (only raised when the log contains at least one result).
    """
    *leading, last = dirname.split("-")
    # Only treat the final component as the tensor-parallel size when it
    # really is "tp<N>". A plain substring test on the whole name would also
    # fire for names that merely contain "tp", and a name without the suffix
    # ends with the temperature component instead.
    tp_match = re.fullmatch(r"tp(\d+)", last)
    if tp_match:
        tp = int(tp_match.group(1))
        temp = leading[-1] if leading else ""
    else:
        tp = None
        temp = last

    with open(os.path.join(folder, dirname, "0_out.logs"), "r") as f:
        text = f.read()

    # Create one row per benchmark result.
    # The SD method is not taken from the directory name; it is read from the
    # vLLM arguments inside the log by parse_benchmark_results().
    return [
        {
            "directory": dirname,
            "temp": float(temp.replace("t", "")),
            "tp": tp,
            **parsed,
        }
        for parsed in parse_benchmark_results(text)
    ]


# Regex capture group used for each converter. Floats may be printed without
# a fractional part (e.g. "12"), so the decimal part is optional.
_NUMBER_PATTERNS = {
    int: r"(\d+)",
    float: r"(\d+(?:\.\d+)?)",
}

# (result key, label printed in the benchmark section, converter), listed in
# the order in which the keys appear in every result dictionary.
_BENCHMARK_METRICS = (
    ("concurrency", "Maximum request concurrency:", int),
    ("benchmark_duration_s", "Benchmark duration (s):", float),
    ("n_reqs", "Successful requests:", int),
    ("req_throughput", "Request throughput (req/s):", float),
    ("acceptance_rate", "Acceptance rate (%):", float),
    ("acceptance_length", "Acceptance length:", float),
    # TTFT (Time to First Token)
    ("mean_ttft", "Mean TTFT (ms):", float),
    ("median_ttft", "Median TTFT (ms):", float),
    ("p99_ttft", "P99 TTFT (ms):", float),
    # TPOT (Time per Output Token)
    ("mean_tpot", "Mean TPOT (ms):", float),
    ("median_tpot", "Median TPOT (ms):", float),
    ("p99_tpot", "P99 TPOT (ms):", float),
    # ITL (Inter-token Latency)
    ("mean_itl", "Mean ITL (ms):", float),
    ("median_itl", "Median ITL (ms):", float),
    ("p99_itl", "P99 ITL (ms):", float),
    ("total_token_throughput", "Total token throughput (tok/s):", float),
)


def _search_metric(section: str, label: str, cast):
    """Return the number that follows ``label`` in ``section``, or None."""
    pattern = re.escape(label) + r"\s+" + _NUMBER_PATTERNS[cast]
    match = re.search(pattern, section)
    return cast(match.group(1)) if match else None


def parse_benchmark_results(text: str):
    """
    Parse benchmark results from vLLM serving benchmark output.
    Handles multiple benchmark result sections in the same log file.

    The speculative-decoding settings and the dataset path are read once from
    the whole text and copied into every result. Each result dictionary has
    the same set of keys; a metric that is not found in a section is None.

    Args:
        text: Full content of one benchmark log file.

    Returns:
        list: List of dictionaries, each containing parsed metrics for one
        benchmark run. Empty if the text contains no result section.

    Raises:
        KeyError: If a 'speculative_config' dict is present but lacks
            'method' or 'num_speculative_tokens'.
    """
    # Defaults describe a run without speculative decoding. "dataset" is
    # always present so that downstream column access never fails.
    sd_results = {
        "sd_method": "None",
        "num_spec_toks": 0,
        "draft_model": "None",
        "dataset": None,
    }

    # Parse sd method from the full text (appears in header, not in benchmark sections)
    # e.g. 'speculative_config': {'method': 'draft_model', ...
    method_match = re.search(r"'speculative_config': (\{.*?\})", text, re.DOTALL)
    if method_match:
        sd_data = ast.literal_eval(method_match.group(1))
        sd_results["sd_method"] = sd_data["method"]
        sd_results["num_spec_toks"] = sd_data["num_speculative_tokens"]
        # Not every speculative method comes with a draft model, so a missing
        # "model" entry keeps the "None" placeholder instead of raising.
        sd_results["draft_model"] = sd_data.get("model", "None")

    # match dataset_path='likaixin/InstructCoder'
    dataset_match = re.search(r"dataset_path='(.*?)'", text)
    if dataset_match:
        sd_results["dataset"] = dataset_match.group(1)

    # Split text by benchmark result sections
    sections = re.split(r"============ Serving Benchmark Result ============", text)

    all_results = []
    for section in sections[1:]:  # Skip the first split (content before first section)
        # Use the header-level settings for all benchmark results
        results = {**sd_results}
        for key, label, cast in _BENCHMARK_METRICS:
            results[key] = _search_metric(section, label, cast)
        # Every section yields a row, even when none of its metrics matched.
        all_results.append(results)

    return all_results


# Collect one row per benchmark result from every "throughput" run directory.
dirs = os.listdir(folder)
throughput_dirs = [d for d in dirs if "throughput" in d]
all_rows = [find_params(d) for d in throughput_dirs]
rows = [row for sublist in all_rows for row in sublist]
tp_wide_allk = pd.DataFrame(rows)
# Shorten the dataset to its last path component ("org/name" -> "name").
# Taking the last element rather than index 1 also keeps names that contain
# no "/" instead of turning them into NaN.
tp_wide_allk["dataset"] = tp_wide_allk["dataset"].str.split("/").str[-1]
# filter: keep single-GPU runs (tp == 1) of one dataset.
# Alternative dataset for this analysis: "mt-bench"
dataset = "InstructCoder"
tp_wide_allk = tp_wide_allk.query("tp == 1 and dataset == @dataset")
tp_wide_allk.head(2)

# Analysis of Best Configs

In [None]:
def mk_ratios_long(wide):
    """
    Compute the throughput speedup of each speculative-decoding run.

    Rows with sd_method == 'None' are the baseline. Every other row is joined
    to the baseline row(s) with the same concurrency, and the "ratio" column
    is its total token throughput divided by the baseline throughput.

    Args:
        wide: DataFrame with the columns sd_method, draft_model,
            num_spec_toks, concurrency and total_token_throughput.

    Returns:
        DataFrame with the columns concurrency,
        total_token_throughput_baseline, sd_method, draft_model,
        num_spec_toks, total_token_throughput and ratio.

    Raises:
        AssertionError: If two non-baseline rows share the same
            (sd_method, draft_model, num_spec_toks, concurrency).
    """
    join_cols = ["concurrency"]
    metric = "total_token_throughput"
    config_cols = ["sd_method", "draft_model", "num_spec_toks"]

    is_baseline = wide["sd_method"] == "None"
    reference = wide.loc[is_baseline, join_cols + [metric]]
    candidates = wide.loc[~is_baseline, config_cols + join_cols + [metric]]

    # Each configuration may appear at most once per concurrency level;
    # otherwise the per-configuration ratio would be ambiguous.
    identifying_part = candidates.drop(columns=[metric])
    assert not identifying_part.duplicated().any()

    merged = reference.merge(candidates, on=join_cols, suffixes=["_baseline", ""])
    merged["ratio"] = merged[metric] / merged[metric + "_baseline"]
    return merged


def rename_sd_method(df):
    """
    Return a copy of ``df`` whose "sd_method" values are display labels.

    Known method identifiers are replaced by their human-readable label.
    Any other value is passed through unchanged, so rows with a method that
    has no label are not turned into NaN (and thereby dropped from plots).

    Args:
        df: DataFrame with an "sd_method" column.

    Returns:
        New DataFrame; the input is not modified.
    """
    labels = {
        "draft_model": "Draft Model",
        "eagle3": "Eagle3",
        "None": "None",
    }
    return df.assign(sd_method=df["sd_method"].map(lambda m: labels.get(m, m)))


# Throughput ratio of every non-baseline configuration relative to the
# baseline run at the same concurrency (computed by mk_ratios_long).
ratios = mk_ratios_long(tp_wide_allk)
ratios

In [None]:
import os

# Create the output directory for the saved figures; no error if it already
# exists. Plain Python instead of an IPython shell escape, so the cell also
# works outside a notebook and on systems without `mkdir -p`.
os.makedirs("imgs", exist_ok=True)

In [None]:
# Concurrency levels present in the data; reused as x tick positions and labels.
concurrency_ticks = tp_wide_allk["concurrency"].unique()

# Speedup over the baseline for the draft-model method: one facet per draft
# model, one line per number of speculative tokens.
draft_model_ratios = ratios.query("sd_method == 'draft_model'")
axs = sns.relplot(
    data=draft_model_ratios,
    kind="line",
    x="concurrency",
    y="ratio",
    hue="num_spec_toks",
    col="draft_model",
    marker="o",
)
axs.set_titles("Draft Model: {col_name}")

# The calls below are kept in this exact order: matplotlib axis state
# (limits, scale, ticks) depends on the order in which it is set.
for ax in axs.axes.flat:
    ax.grid(True, alpha=0.5, axis="y")
    ax.set_ylim(0, None)
    ax.set_xscale("log")
    ax.set_xticks(concurrency_ticks, labels=concurrency_ticks)
    ax.minorticks_off()
    ax.set_ylabel("Speedup Ratio")
    ax.set_xlabel("Batch Size")
    # Reference line: a ratio of 1 means the same throughput as the baseline.
    ax.axhline(1, color="k", linestyle="--")

# Horizontal legend above the facets.
legend_options = {
    "ncol": 3,
    "bbox_to_anchor": (0.5, 1.2),
    "title": "Number of Speculative Tokens",
}
sns.move_legend(axs, "upper center", **legend_options)
axs.figure.tight_layout()
axs.figure.savefig("imgs/draft_model_ratios.png", dpi=300, bbox_inches="tight")
# best config in low concurrency is num_spec_toks=4, draft_model=Qwen3-1.7B

In [None]:
# Rows of the draft-model configuration chosen as "best" from the plot above.
best_draft_model = tp_wide_allk.query(
    "sd_method == 'draft_model' "
    "and num_spec_toks == 4 "
    "and draft_model == 'Qwen/Qwen3-1.7B'"
)
# Fail early if the filter matched nothing.
assert len(best_draft_model) > 0

In [None]:
# Speedup over the baseline for the eagle3 method, one line per number of
# speculative tokens.
eagle3_ratios = ratios.query("sd_method == 'eagle3'")
axs = sns.relplot(
    data=eagle3_ratios,
    kind="line",
    x="concurrency",
    y="ratio",
    hue="num_spec_toks",
    marker="o",
    aspect=1.3,  # make figure wider
)

# The calls below are kept in this exact order: matplotlib axis state
# (limits, scale, ticks) depends on the order in which it is set.
for ax in axs.axes.flat:
    ax.grid(True, alpha=0.5, axis="y")
    ax.set_ylim(0, None)
    ax.set_xscale("log")
    ax.set_xticks(concurrency_ticks, labels=concurrency_ticks)
    ax.minorticks_off()
    ax.set_ylabel("Speedup Ratio")
    ax.set_xlabel("Batch Size")
    # Reference line: a ratio of 1 means the same throughput as the baseline.
    ax.axhline(1, color="k", linestyle="--")

# Horizontal legend above the plot.
legend_options = {
    "ncol": 3,
    "bbox_to_anchor": (0.45, 1.2),
    "title": "Number of Speculative Tokens",
}
sns.move_legend(axs, "upper center", **legend_options)
# best config in low concurrency is num_spec_toks=4

In [None]:
# Rows of the eagle3 configuration chosen as "best" from the plot above.
eagle3_filter = "sd_method == 'eagle3' and num_spec_toks == 4"
best_eagle3 = tp_wide_allk.query(eagle3_filter)
# Fail early if the filter matched nothing.
assert len(best_eagle3) > 0

In [None]:
# Stack the two selected configurations with the runs that use no
# speculative decoding; this is the table the comparison plots read.
baseline = tp_wide_allk.query("sd_method == 'None'")
selected_runs = [best_draft_model, best_eagle3, baseline]
tp_wide = pd.concat(selected_runs)
tp_wide.head(2)

In [None]:
def mk_long(wide):
    """
    Reshape the wide results table into long form for faceted plotting.

    The three median latency columns are unpivoted into a "variable" column
    (the metric name) and a "value" column, while the run-identifying columns
    are repeated on every row.

    Args:
        wide: DataFrame with the identifier columns listed below and the
            columns median_tpot, median_itl and median_ttft.

    Returns:
        Long-form DataFrame with the identifier columns plus "variable" and
        "value".
    """
    identifier_columns = [
        "directory",
        "sd_method",
        "draft_model",
        "num_spec_toks",
        "concurrency",
        "temp",
    ]
    # Other metric families available in the wide table:
    #   ["mean_tpot", "mean_itl", "mean_ttft"]
    #   ["p99_tpot", "p99_itl", "p99_ttft"]
    latency_columns = ["median_tpot", "median_itl", "median_ttft"]
    return wide.melt(id_vars=identifier_columns, value_vars=latency_columns)


# Long-form version of the selected runs (one row per run and latency metric),
# used by the faceted latency plot.
tp_long = mk_long(tp_wide)
tp_long.head(2)

In [None]:
import seaborn as sns


sns.set_context("talk")

# One facet per latency metric, one line per speculative-decoding method,
# restricted to the temperature-0 runs. The y axes are independent because
# the metrics live on different scales.
fg = sns.relplot(
    rename_sd_method(tp_long.query("temp == 0.0")),
    x="concurrency",
    y="value",
    hue="sd_method",
    style="sd_method",
    col="variable",
    col_wrap=3,
    kind="line",
    marker="o",
    facet_kws={"sharey": False},
)

# Select the log-scaled facet by metric name instead of by position, so the
# TTFT facet is still the one that gets the log axis if the metric list in
# mk_long() is reordered or changed. axes_dict maps each "variable" value to
# its Axes.
for metric, ax in fg.axes_dict.items():
    ax.grid(True, alpha=0.5)
    ax.set_xscale("log")
    if metric.endswith("ttft"):
        ax.set_yscale("log")
    else:
        ax.set_ylim(0, None)
    ax.set_xticks(concurrency_ticks, labels=concurrency_ticks)
    ax.minorticks_off()  # Hide minor ticks


fg.set_ylabels("Time (ms)")
fg.set_xlabels("Batch Size")
fg.set_titles("Metric: {col_name}")

# Set legend location above the plots and make it horizontal
fg.legend.set_title("Speculative Decoding")
sns.move_legend(fg, "upper center", ncol=3, bbox_to_anchor=(0.45, 1.2))
fg.figure.tight_layout()
fg.figure.savefig("imgs/ttft_itl_tpot.png", dpi=300, bbox_inches="tight")

In [None]:
# Request throughput against batch size on log-log axes, one line per method
# (temperature-0 runs only). Note that `fig` holds the Axes returned by
# sns.lineplot.
greedy_runs = tp_wide.query("temp == 0.0")
fig = sns.lineplot(
    data=greedy_runs,
    x="concurrency",
    y="req_throughput",
    hue="sd_method",
    style="sd_method",
    marker="o",
)
fig.grid(True, alpha=0.5)
# Scales are set before the ticks so the explicit tick positions are kept.
fig.set_xscale("log")
fig.set_yscale("log")
fig.set_xlabel("Batch Size")
# yticks = [10, 20, 50, 100, 200, 500]
# axs.set_yticks(yticks, labels=yticks)
fig.set_xticks(concurrency_ticks, labels=concurrency_ticks)
fig.minorticks_off()  # Hide minor ticks
fig.set_ylabel("Request Throughput (req/s)")
fig.legend(title="Speculative Decoding")

In [None]:
# SHORT COMPARISON
# Baseline vs. draft model only (eagle3 excluded), batch sizes up to 64,
# temperature-0 runs.
short_runs = tp_wide.query(
    "temp == 0.0 and sd_method != 'eagle3' and concurrency <= 64"
)
fig = sns.barplot(
    data=rename_sd_method(short_runs),
    x="concurrency",
    y="total_token_throughput",
    hue="sd_method",
)
# The unit lives in the title, so the y label is left empty.
fig.set_title("Token Throughput (tok/s)")
fig.set_ylabel("")
fig.set_xlabel("Batch Size")
fig.legend(title="Speculative Decoding")
fig.grid(True, alpha=0.5, axis="y")
fig.figure.tight_layout()
short_plot_path = "imgs/total_token_throughput_short.png"
fig.figure.savefig(short_plot_path, dpi=300, bbox_inches="tight")

In [None]:
# FULL COMPARISON
# All selected methods at every batch size, temperature-0 runs.
full_runs = tp_wide.query("temp == 0.0")
fig = sns.barplot(
    data=rename_sd_method(full_runs),
    x="concurrency",
    y="total_token_throughput",
    hue="sd_method",
)
# The unit lives in the title, so the y label is left empty.
fig.set_title("Total Token Throughput (tok/s)")
fig.set_ylabel("")
fig.set_xlabel("Batch Size")
fig.legend(title="Speculative Decoding")
fig.grid(True, alpha=0.5, axis="y")
fig.figure.tight_layout()
full_plot_path = "imgs/total_token_throughput_full.png"
fig.figure.savefig(full_plot_path, dpi=300, bbox_inches="tight")

In [None]:
# Speedup of the two selected configurations over the baseline, per batch size.
speedups = rename_sd_method(mk_ratios_long(tp_wide))
fig = sns.barplot(
    data=speedups,
    x="concurrency",
    y="ratio",
    hue="sd_method",
)
fig.set_ylim(0, None)
# Reference line: a ratio of 1 means the same throughput as the baseline.
fig.axhline(1, color="k", linestyle="--")
fig.set_xlabel("Batch Size")
fig.set_ylabel("Speedup Ratio")
fig.grid(True, alpha=0.5, axis="y")

# Two-column legend placed above the axes.
legend_options = {
    "loc": "upper center",
    "bbox_to_anchor": (0.5, 1.4),
    "ncol": 2,
    "title": "Method",
}
fig.legend(**legend_options)

# AL and AR Analysis

In [None]:
# Acceptance length against the number of speculative tokens for every
# speculative-decoding run: line style encodes the method, colour the draft
# model.
axs = sns.relplot(
    rename_sd_method(tp_wide_allk.query("sd_method != 'None'")),
    x="num_spec_toks",
    y="acceptance_length",
    style="sd_method",
    hue="draft_model",
    kind="line",
    marker="o",
    aspect=1.3,
)
# NOTE(review): this relplot has no `col=` facet variable, so the
# "{col_name}" title template presumably has no effect here — confirm and
# drop the call if so.
axs.set_titles("#SpecTokens = {col_name}")
for ax in axs.axes.flat:
    ax.grid(True, alpha=0.5, axis="y")
    ax.set_ylim(0, None)
    ax.set_ylabel("Acceptance Length")
    ax.set_xlabel("Number of Speculative Tokens")
# Place the legend to the right of the plot, in a single column.
sns.move_legend(
    axs,
    "right",
    ncol=1,
    bbox_to_anchor=(1.02, 0.5),
    title="Method",
)