In [None]:
import pandas as pd
import seaborn as sns

# Select seaborn's "talk" plotting context before any figure is drawn.
sns.set_context("talk")

In [None]:
from pathlib import Path
import os
import re
import ast

# Which experiment to analyse. Exactly one assignment is active; swap the
# commented line in to switch. The value selects the loader and the plot limits
# further down.
# analysis = "qwen"
analysis = "llama70b"

# Params for Qwen analysis
folder = "./logs-2026-01-23/"
# `dataset` is also used below to pick the x-axis ticks and to name the image
# output folder.
# dataset = "mt-bench"
dataset = "InstructCoder"


def load_qwen_results():
    """Build the Qwen results table from the throughput logs under ``folder``.

    Every entry of ``folder`` whose path contains "throughput" is read through
    its ``0_out.logs`` file and parsed with ``find_params``. The rows of all
    logs are combined into one DataFrame, the dataset names are shortened with
    ``shorten_dataset``, and only the rows whose dataset equals the
    notebook-level ``dataset`` are kept.

    Returns:
        pandas.DataFrame: one row per parsed benchmark section of the selected
        dataset.
    """
    log_root = Path(folder)
    records = []
    for entry in os.listdir(folder):
        run_dir = log_root / entry
        # The filter looks at the whole path, not only the entry name.
        if "throughput" not in str(run_dir):
            continue
        records.extend(find_params((run_dir / "0_out.logs").read_text()))

    frame = shorten_dataset(pd.DataFrame(records))
    selected = frame.query("dataset == @dataset")
    # Save full dataset
    # selected.to_parquet("data-2026-01-23.parquet")
    return selected


def load_llama70b_results():
    """Build the Llama-70B results table from the online-throughput runs.

    One (results directory, server log) pair is visited for the run without
    speculative decoding, followed by a draft-model pair and an eagle3 pair for
    each k in 3..8. The server log is parsed once per pair with
    ``extract_server_params``; every file in the results directory whose name
    contains "bench" is parsed with ``extract_bench_metrics`` and merged with
    those server parameters (server values win on key clashes).

    Returns:
        pandas.DataFrame: one row per client benchmark file, with the dataset
        name shortened by ``shorten_dataset``.
    """
    runs = [
        (
            "../online-throughput/tp-results-nosd/",
            "../online-throughput/tp-results-nosd/nosd-serve.log",
        )
    ]
    for k in range(3, 9):
        runs.append(
            (
                f"../online-throughput/k{k}/tp-results-draft-model/",
                f"../online-throughput/k{k}/tp-results-draft-model/draft-model-serve.log",
            )
        )
        runs.append(
            (
                f"../online-throughput/k{k}/tp-results-eagle3/",
                f"../online-throughput/k{k}/tp-results-eagle3/eagle3-serve.log",
            )
        )

    records = []
    for results_dir, serve_log in runs:
        server_params = extract_server_params(Path(serve_log).read_text())
        for name in os.listdir(results_dir):
            if "bench" not in name:
                continue
            bench_text = (Path(results_dir) / name).read_text()
            records.append(extract_bench_metrics(bench_text) | server_params)

    return shorten_dataset(pd.DataFrame(records))


def extract_bench_metrics(client_logs: str) -> dict:
    """Parse one benchmark client log (or log section) into a flat metrics dict.

    Args:
        client_logs: Text containing the benchmark arguments (``dataset_path=...``,
            ``temperature=...``) and the printed result summary.

    Returns:
        dict with, in this order:
          * ``dataset``: the quoted ``dataset_path`` value, or None when the
            text contains no ``dataset_path='...'``.
          * ``temp``, ``concurrency``, ``benchmark_duration_s``, ``n_reqs``,
            ``req_throughput``, ``acceptance_rate``, ``acceptance_length``:
            present only when the corresponding line is found.
          * ``{mean,median,p99}_{ttft,tpot,itl}`` and
            ``total_token_throughput``: always present, None when not found.

    Raises:
        ValueError: if the captured ``temperature=`` value is not a valid float.
    """
    results = {}

    # match dataset_path='likaixin/InstructCoder'
    # A log without this argument yields None instead of an AttributeError, in
    # line with how every other missing field is tolerated.
    dataset_match = re.search(r"dataset_path='(.*?)'", client_logs)
    results["dataset"] = dataset_match.group(1) if dataset_match else None

    temp_match = re.search(r"temperature=([^,]+),", client_logs)
    if temp_match:
        results["temp"] = float(temp_match.group(1))

    # Summary fields that are only recorded when their line is present:
    # (result key, pattern, converter).
    optional_fields = [
        ("concurrency", r"Maximum request concurrency:\s+(\d+)", int),
        ("benchmark_duration_s", r"Benchmark duration \(s\):\s+(\d+\.\d+)", float),
        ("n_reqs", r"Successful requests:\s+(\d+)", int),
        ("req_throughput", r"Request throughput \(req/s\):\s+(\d+\.\d+)", float),
        ("acceptance_rate", r"Acceptance rate \(%\):\s+(\d+\.\d+)", float),
        ("acceptance_length", r"Acceptance length:\s+(\d+\.\d+)", float),
    ]
    for key, pattern, convert in optional_fields:
        found = re.search(pattern, client_logs)
        if found:
            results[key] = convert(found.group(1))

    # Latency statistics in milliseconds: TTFT (time to first token), TPOT (time
    # per output token) and ITL (inter-token latency). These keys always exist.
    for metric in ("TTFT", "TPOT", "ITL"):
        for stat in ("Mean", "Median", "P99"):
            found = re.search(
                rf"{stat} {metric} \(ms\):\s+(\d+\.\d+)", client_logs
            )
            key = f"{stat.lower()}_{metric.lower()}"
            results[key] = float(found.group(1)) if found else None

    # Parse total token throughput
    throughput_match = re.search(
        r"Total token throughput \(tok/s\):\s+(\d+\.\d+)", client_logs
    )
    results["total_token_throughput"] = (
        float(throughput_match.group(1)) if throughput_match else None
    )
    return results


def shorten_dataset(df):
    """Return a copy of ``df`` whose ``dataset`` column holds the short name.

    The short name is the second "/"-separated component, e.g.
    ``"likaixin/InstructCoder"`` becomes ``"InstructCoder"``. A value without a
    "/" is kept unchanged instead of becoming NaN, which also makes the function
    safe to apply twice. The input frame is not modified.

    Args:
        df: DataFrame with a string ``dataset`` column.

    Returns:
        pandas.DataFrame: new frame with the shortened ``dataset`` column.
    """
    full_names = df["dataset"]
    # .str[1] yields NaN where there is no second component; fall back to the
    # original value in that case.
    short_names = full_names.str.split("/").str[1].fillna(full_names)
    return df.assign(dataset=short_names)


def extract_server_params(server_logs: str) -> dict:
    """Read the speculative-decoding setup and tensor-parallel size from a server log.

    Args:
        server_logs: Full text of the server log.

    Returns:
        dict with ``sd_method``, ``num_spec_toks`` and ``draft_model`` (defaults
        ``"None"``, ``0``, ``"None"`` when no ``'speculative_config'`` dict is
        found) plus ``tp`` when a ``tensor_parallel_size=<int>`` entry is present.

    Raises:
        KeyError: if a speculative config is found but lacks ``method``,
            ``num_speculative_tokens`` or ``model``.
    """
    params = {"sd_method": "None", "num_spec_toks": 0, "draft_model": "None"}

    # e.g. 'speculative_config': {'method': 'draft_model', ...
    # The non-greedy pattern stops at the first closing brace after the key.
    spec_match = re.search(
        r"'speculative_config': (\{.*?\})", server_logs, re.DOTALL
    )
    if spec_match is not None:
        spec_cfg = ast.literal_eval(spec_match.group(1))
        params.update(
            sd_method=spec_cfg["method"],
            num_spec_toks=spec_cfg["num_speculative_tokens"],
            draft_model=spec_cfg["model"],
        )

    # match tensor_parallel_size=1
    tp_match = re.search(r"tensor_parallel_size=(\d+)", server_logs)
    if tp_match is not None:
        params["tp"] = int(tp_match.group(1))
    return params


def find_params(text: str):
    """
    Parse every benchmark result section of a combined vLLM log.

    The log is cut at each occurrence of "Namespace"; the text before the first
    occurrence is ignored. Each remaining section (with the "Namespace" prefix
    restored) is parsed with ``extract_bench_metrics`` and merged with the
    server parameters that ``extract_server_params`` reads from the whole text.

    Returns:
        list: List of dictionaries, each containing parsed metrics for one benchmark run
    """
    marker = "Namespace"
    server_params = extract_server_params(text)

    # str.split always returns at least one piece, so the unpacking is safe;
    # the first piece is the content before the first benchmark section.
    _, *section_tails = text.split(marker)
    return [
        extract_bench_metrics(client_logs=marker + tail) | server_params
        for tail in section_tails
    ]


# Load the wide results table with the loader that matches `analysis`.
_loaders = {"qwen": load_qwen_results, "llama70b": load_llama70b_results}
if analysis not in _loaders:
    raise NotImplementedError()
tp_wide_allk = _loaders[analysis]()

In [None]:
# Preview the first two rows of the loaded results table.
tp_wide_allk.head(2)

In [None]:
# Output directory for the figures saved below, one per analysis/dataset pair.
# NOTE(review): this rebinds `folder`, which load_qwen_results() reads as the
# log directory. After this cell, calling that loader again without re-running
# the config assignment would list the image folder instead; conversely,
# re-running the config cell points the savefig calls at the log directory
# until this cell is run again. Consider a separate name for the image folder.
folder = f"imgs/{analysis}/{dataset}"
os.makedirs(folder, exist_ok=True)

# Analysis of Best Configs

In [None]:
def mk_ratios_long(wide):
    """Compute the throughput speed-up of each speculative run over the baseline.

    Rows with ``sd_method == 'None'`` form the baseline. Every other row is
    matched to the baseline row(s) with the same ``concurrency`` and gets a
    ``ratio`` column: its ``total_token_throughput`` divided by the baseline's.

    Args:
        wide: DataFrame with ``sd_method``, ``draft_model``, ``num_spec_toks``,
            ``concurrency`` and ``total_token_throughput`` columns.

    Returns:
        pandas.DataFrame: the merged rows with ``total_token_throughput_baseline``
        and ``ratio`` columns.

    Raises:
        AssertionError: if two non-baseline rows share the same method, draft
            model, number of speculative tokens and concurrency.
    """
    join_cols = ["concurrency"]
    uses_sd = wide["sd_method"] != "None"
    is_baseline = wide["sd_method"] == "None"

    baseline_runs = wide.loc[is_baseline, [*join_cols, "total_token_throughput"]]
    sd_runs = wide.loc[
        uses_sd,
        ["sd_method", "draft_model", "num_spec_toks", *join_cols, "total_token_throughput"],
    ]

    # Each speculative configuration must occur once per concurrency level.
    config_only = sd_runs.drop(columns=["total_token_throughput"])
    assert not config_only.duplicated().any()

    merged = baseline_runs.merge(sd_runs, on=join_cols, suffixes=["_baseline", ""])
    speedup = merged["total_token_throughput"] / merged["total_token_throughput_baseline"]
    merged["ratio"] = speedup
    return merged


def rename_sd_method(df):
    """Return a copy of ``df`` with display labels in the ``sd_method`` column.

    Known methods are mapped to their plot labels ("draft_model" -> "Draft Model",
    "eagle3" -> "Eagle3", "None" -> "None"). Any other value is kept as it is
    rather than being turned into NaN, so an unexpected method still shows up
    in the plots. The input frame is not modified.

    Args:
        df: DataFrame with an ``sd_method`` column.

    Returns:
        pandas.DataFrame: new frame with the relabelled ``sd_method`` column.
    """
    display_names = {
        "draft_model": "Draft Model",
        "eagle3": "Eagle3",
        "None": "None",
    }
    # An element-wise lookup with a fallback avoids any index alignment, so
    # frames with repeated index labels are handled as well.
    relabelled = df["sd_method"].map(lambda method: display_names.get(method, method))
    return df.assign(sd_method=relabelled)


# Speed-up ratios for every speculative configuration in the full results table
# (all values of num_spec_toks), followed by a two-row preview.
ratios = mk_ratios_long(tp_wide_allk)
ratios.head(2)

In [None]:
# concurrency_ticks = tp_wide_allk["concurrency"].unique()
# Fixed x-axis tick positions per dataset; reused by the later plotting cells.
concurrency_ticks = (
    [1, 2, 4, 8, 16, 32, 64, 128, 256]
    if dataset == "InstructCoder"
    else [1, 2, 4, 8, 16, 32, 64, 80]
)

palette = "viridis"

# Upper y-limit of the speed-up plots. It does not depend on the axes, so it is
# set once here; later cells read it as well.
if analysis == "qwen":
    ymax = 2.5
elif analysis == "llama70b":
    ymax = 4

# One facet per draft model, one line per number of speculative tokens.
axs = sns.relplot(
    ratios.query("sd_method == 'draft_model'"),
    x="concurrency",
    y="ratio",
    col="draft_model",
    hue="num_spec_toks",
    palette=palette,
    kind="line",
    marker="o",
)
axs.set_titles("Draft Model: {col_name}")
for ax in axs.axes.flat:
    ax.grid(True, alpha=0.5, axis="y")
    ax.set_ylim(0, ymax)
    ax.set_xscale("log")
    ax.set_xticks(concurrency_ticks, labels=concurrency_ticks)
    ax.minorticks_off()
    ax.set_ylabel("Speedup Ratio")
    ax.set_xlabel("Batch Size")
    # Reference line: a ratio of 1 means no speed-up over the baseline.
    ax.axhline(1, color="k", linestyle="--")

sns.move_legend(
    axs,
    "upper center",
    ncol=3,
    bbox_to_anchor=(0.5, 1.2),
    title="Number of Speculative Tokens",
)
axs.figure.tight_layout()
axs.figure.savefig(folder + "/draft_model_ratios.png", dpi=300, bbox_inches="tight")
# best config in low concurrency is num_spec_toks=4, draft_model=Qwen3-1.7B

In [None]:
# Hand-picked best draft-model configuration for the selected analysis:
# the draft model name and its number of speculative tokens.
if analysis == "qwen":
    draft_model, best_k = "Qwen/Qwen3-1.7B", 4
elif analysis == "llama70b":
    draft_model, best_k = "meta-llama/Llama-3.2-1B", 7

best_draft_model = tp_wide_allk.query(
    "sd_method == 'draft_model' "
    "and num_spec_toks == @best_k "
    "and draft_model == @draft_model"
)
# Guard against a selection that matches no run at all.
assert len(best_draft_model) > 0

In [None]:
# Speed-up over the baseline for the eagle3 runs: one line per number of
# speculative tokens. `palette`, `ymax`, `concurrency_ticks` and `folder` are
# notebook-level values set in earlier cells.
axs = sns.relplot(
    ratios.query("sd_method == 'eagle3'"),
    x="concurrency",
    y="ratio",
    hue="num_spec_toks",
    palette=palette,
    kind="line",
    marker="o",
    aspect=1.3,  # make figure wider
)
for ax in axs.axes.flat:
    ax.grid(True, alpha=0.5, axis="y")
    ax.set_ylim(0, ymax)
    # Log-scaled x-axis with the explicit tick list as labels.
    ax.set_xscale("log")
    ax.set_xticks(concurrency_ticks, labels=concurrency_ticks)
    ax.minorticks_off()
    ax.set_ylabel("Speedup Ratio")
    ax.set_xlabel("Batch Size")
    # Reference line: a ratio of 1 means no speed-up over the baseline.
    ax.axhline(1, color="k", linestyle="--")

# Place the legend above the plot in three columns.
sns.move_legend(
    axs,
    "upper center",
    ncol=3,
    bbox_to_anchor=(0.45, 1.2),
    title="Number of Speculative Tokens",
)
axs.figure.tight_layout()
axs.figure.savefig(folder + "/eagle3_ratios.png", dpi=300, bbox_inches="tight")
# best config in low concurrency is num_spec_toks=4

In [None]:
# Select the eagle3 runs that use `best_k` speculative tokens.
# NOTE(review): `best_k` is the value picked for the draft-model configuration
# in an earlier cell (qwen: 4, llama70b: 7) — confirm the same value is also the
# intended choice for eagle3.
best_eagle3 = tp_wide_allk.query("sd_method == 'eagle3' and num_spec_toks == @best_k")
# Guard against a selection that matches no run at all.
assert len(best_eagle3) != 0

In [None]:
# Comparison table: the selected draft-model runs, the selected eagle3 runs and
# the runs without speculative decoding. The concat keeps each part's original
# row index, so index labels may repeat.
baseline = tp_wide_allk.query("sd_method == 'None'")
tp_wide = pd.concat([best_draft_model, best_eagle3, baseline])
tp_wide.head(2)

In [None]:
def mk_long(wide):
    """Reshape the wide results into long format over the P99 latency metrics.

    Each input row becomes three rows, one per latency column, with the metric
    name in ``variable`` and its value in ``value``.

    Args:
        wide: DataFrame with the run descriptors ``sd_method``, ``draft_model``,
            ``num_spec_toks``, ``concurrency``, ``temp`` and the columns
            ``p99_ttft``, ``p99_tpot``, ``p99_itl``.

    Returns:
        pandas.DataFrame: the melted frame.
    """
    # Columns identifying a run ("directory" is not used as an identifier).
    run_columns = ["sd_method", "draft_model", "num_spec_toks", "concurrency", "temp"]
    # Alternatives kept for reference:
    #   ["mean_ttft", "mean_tpot", "mean_itl"]
    #   ["median_ttft", "median_tpot", "median_itl"]
    latency_columns = ["p99_ttft", "p99_tpot", "p99_itl"]
    return wide.melt(id_vars=run_columns, value_vars=latency_columns)


# Long-format latency table for the comparison runs, with a two-row preview.
tp_long = mk_long(tp_wide)
tp_long.head(2)

In [None]:
import seaborn as sns
import matplotlib.ticker as mticker

# Same plotting context as selected at the top of the notebook.
sns.set_context("talk")

# Format yticks as 1000 rather than 10^3
fmt = mticker.FuncFormatter(lambda x, _: "{:g}".format(x))

# One facet per latency metric (the `variable` column), one line per method,
# restricted to the runs with temperature 0.
fg = sns.relplot(
    rename_sd_method(tp_long.query("temp == 0.0")),
    x="concurrency",
    y="value",
    hue="sd_method",
    style="sd_method",
    col="variable",
    col_wrap=3,
    kind="line",
    marker="o",
    facet_kws={"sharey": False},
)

for i, ax in enumerate(fg.axes.flat):
    ax.grid(True, alpha=0.5)
    ax.set_xscale("log")
    # NOTE(review): mk_long melts three metrics, so indices 0-2 cover every
    # facet and the else branch below is not taken — confirm whether a linear
    # y-axis was intended for one of the facets.
    if i in [0, 1, 2]:  # first three facets
        ax.set_yscale("log")
        ax.yaxis.set_major_formatter(fmt)
        # ax.yaxis.set_minor_formatter(fmt)
    else:
        ax.set_ylim(0, None)
    ax.set_xticks(concurrency_ticks, labels=concurrency_ticks)
    ax.xaxis.minorticks_off()  # Hide minor ticks

fg.set_ylabels("Time (ms)")
fg.set_xlabels("Batch Size")
fg.set_titles("Metric: {col_name}")

# Set legend location above the plots and make it horizontal
fg.legend.set_title("Speculative Decoding")
sns.move_legend(fg, "upper center", ncol=3, bbox_to_anchor=(0.45, 1.2))
fg.figure.tight_layout()
fg.figure.savefig(folder + "/ttft_itl_tpot.png", dpi=300, bbox_inches="tight")

In [None]:
# Request throughput versus batch size for the comparison runs at temperature 0.
# `fig` holds the object returned by sns.lineplot and is used as a matplotlib
# Axes below (set_xscale, set_xticks, ...). This figure is not saved to disk.
fig = sns.lineplot(
    rename_sd_method(tp_wide.query("temp == 0.0")),
    x="concurrency",
    y="req_throughput",
    hue="sd_method",
    style="sd_method",
    marker="o",
)
fig.grid(True, alpha=0.5)
# Both axes are log-scaled; the explicit tick lists replace the default ticks.
fig.set_xscale("log")
fig.set_yscale("log")
fig.set_xlabel("Batch Size")
yticks = [0.1, 0.5, 1, 2, 4, 8, 16]
fig.set_yticks(yticks, labels=yticks)
fig.set_xticks(concurrency_ticks, labels=concurrency_ticks)
fig.minorticks_off()  # Hide minor ticks
# The unit is given in the title, so the y-axis label is left empty.
fig.set_title("Request Throughput (req/s)")
fig.set_ylabel("")
fig.legend(title="Speculative Decoding")

In [None]:
# SHORT COMPARISON
# Token throughput per batch size at temperature 0, without the eagle3 runs and
# limited to batch sizes up to 80. `fig` is used as a matplotlib Axes below.
fig = sns.barplot(
    rename_sd_method(
        tp_wide.query("temp == 0.0 and sd_method != 'eagle3' and concurrency <= 80")
    ),
    x="concurrency",
    y="total_token_throughput",
    hue="sd_method",
)
# The unit is given in the title, so the y-axis label is left empty.
fig.set_title("Token Throughput (tok/s)")
fig.set_ylabel("")
fig.set_xlabel("Batch Size")
fig.legend(title="Speculative Decoding")
fig.grid(True, alpha=0.5, axis="y")
fig.figure.tight_layout()
fig.figure.savefig(
    folder + "/total_token_throughput_short.png", dpi=300, bbox_inches="tight"
)

In [None]:
# FULL COMPARISON
# Token throughput per batch size at temperature 0 for every method and batch
# size in the comparison table. `fig` is used as a matplotlib Axes below.
fig = sns.barplot(
    rename_sd_method(tp_wide.query("temp == 0.0")),
    x="concurrency",
    y="total_token_throughput",
    hue="sd_method",
)
# The unit is given in the title, so the y-axis label is left empty.
fig.set_title("Total Token Throughput (tok/s)")
fig.set_ylabel("")
fig.set_xlabel("Batch Size")
fig.legend(title="Speculative Decoding")
fig.grid(True, alpha=0.5, axis="y")
fig.figure.tight_layout()
fig.figure.savefig(
    folder + "/total_token_throughput_full.png", dpi=300, bbox_inches="tight"
)

In [None]:
# Speed-up over the baseline for the two selected configurations (draft model
# and eagle3), per batch size. Unlike the throughput bar plots, no temperature
# filter is applied here. `ymax` comes from an earlier plotting cell.
fig = sns.barplot(
    rename_sd_method(mk_ratios_long(tp_wide)),
    x="concurrency",
    y="ratio",
    hue="sd_method",
)
fig.set_ylim(0, ymax)
# Reference line: a ratio of 1 means no speed-up over the baseline.
fig.axhline(1, color="k", linestyle="--")
fig.set_xlabel("Batch Size")
fig.set_ylabel("Speedup Ratio")
fig.grid(True, alpha=0.5, axis="y")

# Put the legend above the axes in two columns.
sns.move_legend(fig, "lower center", ncol=2, bbox_to_anchor=(0.5, 1), title="Method")
fig.figure.tight_layout()
fig.figure.savefig(
    folder + "/draft_model_vs_eagle3_ratios.png", dpi=300, bbox_inches="tight"
)

# Acceptance Length (AL) and Acceptance Rate (AR) Analysis

In [None]:
# Acceptance length versus number of speculative tokens for all speculative
# runs: line style encodes the method, colour encodes the draft model.
axs = sns.relplot(
    rename_sd_method(tp_wide_allk.query("sd_method != 'None'")),
    x="num_spec_toks",
    y="acceptance_length",
    style="sd_method",
    hue="draft_model",
    kind="line",
    marker="o",
    aspect=1.3,
)
# NOTE(review): no `col=` is passed to relplot above, so `{col_name}` has no
# facet variable to refer to — confirm whether this call is still needed.
axs.set_titles("#SpecTokens = {col_name}")

for ax in axs.axes.flat:
    ax.grid(True, alpha=0.5, axis="y")
    ax.set_ylim(0, 6)
    ax.set_ylabel("Acceptance Length")
    ax.set_xlabel("Number of Speculative Tokens")
# NOTE(review): the legend is titled "Method" although it also carries the
# `draft_model` hue — confirm the title.
sns.move_legend(
    axs,
    "right",
    ncol=1,
    bbox_to_anchor=(1.02, 0.5),
    title="Method",
)
axs.figure.savefig(
    folder + "/acceptance_length_vs_num_spec_toks.png", dpi=300, bbox_inches="tight"
)

In [None]:
# Speed-up ratios of the selected draft model with 4 speculative tokens for
# batch sizes up to 64, rounded to two decimals.
# `@draft_model` must not be quoted: inside quotes it is the literal string
# "@draft_model", which matches no row, instead of the notebook variable.
ratios.query(
    "sd_method == 'draft_model' and num_spec_toks == 4 and draft_model == @draft_model and concurrency <= 64"
).round(2)

In [None]:
# Show the row of `ratios` with the largest speed-up ratio.
ratios.iloc[ratios["ratio"].argmax()]
