
# Benchmark Validation

Select a benchmark scenario via `SELECTED_SCENARIO` to compare measured TTFT/ITL values with the analytical estimators.


In [25]:

from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils.config import get_model_config, get_hardware_config
from utils.math_utils import (
    prefill_compute_time,
    prefill_memory_time,
    decode_compute_time,
    decode_memory_time,
)

pd.options.display.max_rows = 20
pd.options.display.max_columns = 20


@dataclass(frozen=True)
class EvalModelBenchmarkConfiguration:
    """Immutable description of one benchmark scenario to validate.

    Bundles the location of the measurement CSV with the lookup keys and the
    running-token limit that the notebook needs to evaluate the analytical
    estimators for that scenario.
    """

    # CSV file holding the measured benchmark rows (loaded with pd.read_csv).
    data_path: Path
    # Key passed to get_model_config to resolve the model description.
    model_name: str
    # The only value the CSV's "hardware" column is allowed to contain.
    csv_hardware_label: str
    # Key passed to get_hardware_config; also stored on every loaded row.
    hardware_key: str
    # Running-token limit: forwarded to decode_memory_time and used to clip
    # the reported running_tokens (concurrency * sequence length).
    running_tokens_cap: float


In [26]:

# Registry of the available benchmark scenarios, keyed by a short scenario name.
EVAL_MODEL_BENCHMARK_CONFIGURATION = dict(
    llama33_70b_fp8_tp2=EvalModelBenchmarkConfiguration(
        data_path=Path("tested_benchmarks/nim_llama33_70b_v1.8.0_2xH100_fp8TP2.csv"),
        model_name="llama33_70B",
        csv_hardware_label="H100_80G",
        hardware_key="H100_80GB_FP8_TP2",
        running_tokens_cap=303_000.0,
    ),
    llama31_8b_fp8_tp1=EvalModelBenchmarkConfiguration(
        data_path=Path("tested_benchmarks/nim_llama3.1_8b_v1.8.0_1xH100_80GB.csv"),
        model_name="llama31_8B",
        csv_hardware_label="H100_80G",
        hardware_key="H100_80GB_FP8_TP1",
        running_tokens_cap=750_000.0,
    ),
)

# Change this key to validate a different scenario; every later cell reads CONFIG.
SELECTED_SCENARIO = "llama33_70b_fp8_tp2"
CONFIG = EVAL_MODEL_BENCHMARK_CONFIGURATION[SELECTED_SCENARIO]
print(f"Scenario: {SELECTED_SCENARIO}")
print(f"Running tokens cap: {CONFIG.running_tokens_cap:,.0f}")


Scenario: llama33_70b_fp8_tp2
Running tokens cap: 303,000


## Load configuration and raw measurements

In [27]:

# Read the measured benchmark rows for the selected scenario.
df_raw = pd.read_csv(CONFIG.data_path)

# The file must contain exactly one hardware label, and it must be the one
# the scenario was configured for.
hardware_values = df_raw["hardware"].unique()
has_expected_hardware = (
    len(hardware_values) == 1 and hardware_values[0] == CONFIG.csv_hardware_label
)
if not has_expected_hardware:
    raise ValueError(
        f"Unexpected hardware labels {hardware_values}. Expected '{CONFIG.csv_hardware_label}'."
    )

# Tag every row with the hardware lookup key and resolve the configs that the
# analytical estimators take as input.
df_raw["hardware_key"] = CONFIG.hardware_key
model_cfg = get_model_config(CONFIG.model_name)
hardware_cfg = get_hardware_config(CONFIG.hardware_key)

print(f"Loaded rows: {len(df_raw)}")
display(df_raw.head())


Loaded rows: 39


Unnamed: 0,version,hardware,hardware count,data_type,input_tokens,output_tokens,concurrency,TTFT_ms,ITL_ms,Throughput_tokens_per_s,hardware_key
0,1.8.0,H100_80G,2,fp8 TP2,5000,500,1,406.77,18.58,51.67,H100_80GB_FP8_TP2
1,1.8.0,H100_80G,2,fp8 TP2,5000,500,5,546.7,21.83,218.48,H100_80GB_FP8_TP2
2,1.8.0,H100_80G,2,fp8 TP2,5000,500,25,688.69,38.75,623.4,H100_80GB_FP8_TP2
3,1.8.0,H100_80G,2,fp8 TP2,5000,500,50,834.51,59.73,814.37,H100_80GB_FP8_TP2
4,1.8.0,H100_80G,2,fp8 TP2,5000,500,100,7996.25,92.91,917.26,H100_80GB_FP8_TP2



## Compute analytical estimates

For each benchmark row we compute the prefill (TTFT) and decode (ITL) estimates, keep both the compute and memory components, take the larger of the two as the bottleneck, and convert the results to milliseconds.


In [28]:

def estimate_row(row):
    """Compute the analytical TTFT/ITL estimates for one benchmark row.

    Reads ``concurrency``, ``input_tokens`` and ``output_tokens`` from ``row``
    and evaluates the prefill/decode estimators with the module-level
    ``model_cfg``, ``hardware_cfg`` and ``CONFIG``.

    Returns a ``pd.Series`` holding, for prefill and decode alike, the compute
    and memory estimates in milliseconds, the larger of the two as the modelled
    value, which of the two is the limit ("compute" wins ties), and the
    running-token count clipped to ``CONFIG.running_tokens_cap``.
    """
    concurrency = float(row["concurrency"])
    prompt_len = float(row["input_tokens"])
    # Decode is evaluated at the full sequence length: prompt plus generated tokens.
    sequence_len = prompt_len + float(row["output_tokens"])

    def _to_ms(duration):
        # Estimator outputs are scaled by 1e3 (presumably seconds -> milliseconds).
        return float(duration * 1e3)

    prefill_compute_ms = _to_ms(
        prefill_compute_time(concurrency, prompt_len, model_cfg, hardware_cfg)
    )
    prefill_memory_ms = _to_ms(
        prefill_memory_time(concurrency, prompt_len, model_cfg, hardware_cfg)
    )
    decode_compute_ms = _to_ms(
        decode_compute_time(concurrency, sequence_len, model_cfg, hardware_cfg)
    )
    decode_memory_ms = _to_ms(
        decode_memory_time(
            concurrency, sequence_len, model_cfg, hardware_cfg, CONFIG.running_tokens_cap
        )
    )

    prefill_limit = "compute" if prefill_compute_ms >= prefill_memory_ms else "memory"
    decode_limit = "compute" if decode_compute_ms >= decode_memory_ms else "memory"

    # Tokens in flight across all concurrent requests, clipped to the configured cap.
    tokens_in_flight = concurrency * sequence_len

    estimates = {
        "prefill_compute_ms": prefill_compute_ms,
        "prefill_memory_ms": prefill_memory_ms,
        "ttft_model_ms": max(prefill_compute_ms, prefill_memory_ms),
        "prefill_limit": prefill_limit,
        "decode_compute_ms": decode_compute_ms,
        "decode_memory_ms": decode_memory_ms,
        "itl_model_ms": max(decode_compute_ms, decode_memory_ms),
        "decode_limit": decode_limit,
        "running_tokens": min(tokens_in_flight, CONFIG.running_tokens_cap),
    }
    return pd.Series(estimates)


# Evaluate the analytical model once per measured row.
row_estimates = df_raw.apply(estimate_row, axis=1)

# Attach the estimates to the measurements and derive the error metrics:
# ratio = model / measured, delta = model - measured (both in ms).
df_eval = (
    df_raw.join(row_estimates)
    .assign(
        ttft_ratio=lambda frame: frame["ttft_model_ms"] / frame["TTFT_ms"],
        itl_ratio=lambda frame: frame["itl_model_ms"] / frame["ITL_ms"],
        ttft_delta_ms=lambda frame: frame["ttft_model_ms"] - frame["TTFT_ms"],
        itl_delta_ms=lambda frame: frame["itl_model_ms"] - frame["ITL_ms"],
    )
    .sort_values(["input_tokens", "concurrency"])
    .reset_index(drop=True)
)

overview_columns = [
    "input_tokens", "concurrency", "running_tokens",
    "TTFT_ms", "ttft_model_ms", "ttft_delta_ms",
    "ITL_ms", "itl_model_ms", "itl_delta_ms", "prefill_limit", "decode_limit",
]
display(df_eval[overview_columns])


Unnamed: 0,input_tokens,concurrency,running_tokens,TTFT_ms,ttft_model_ms,ttft_delta_ms,ITL_ms,itl_model_ms,itl_delta_ms,prefill_limit,decode_limit
0,200,1,400.0,31.22,9.088180,-22.131820,18.80,8.893920,-9.906080,memory,memory
1,200,5,2000.0,88.70,14.986848,-73.713152,18.83,9.212405,-9.617595,compute,memory
2,200,25,10000.0,138.98,74.934240,-64.045760,20.61,10.804832,-9.805168,compute,memory
3,200,50,20000.0,156.28,149.868480,-6.411520,23.33,12.795366,-10.534634,compute,memory
4,200,100,40000.0,176.80,299.736960,122.936960,29.87,16.776433,-13.093567,compute,memory
...,...,...,...,...,...,...,...,...,...,...,...
34,20000,50,303000.0,123969.71,21543.759555,-102425.950445,49.22,68.109004,18.889004,compute,memory
35,20000,100,303000.0,333564.85,43087.519111,-290477.330889,49.81,68.109004,18.299004,compute,memory
36,20000,150,303000.0,543226.40,64631.278666,-478595.121334,50.01,68.109004,18.099004,compute,memory
37,20000,200,303000.0,753417.03,86175.038221,-667241.991779,50.06,68.109004,18.049004,compute,memory


In [29]:
# Save the evaluation table for the selected scenario.
eval_folder = "evaluations_results"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists; otherwise a fresh checkout fails on Restart & Run All.
Path(eval_folder).mkdir(parents=True, exist_ok=True)
df_eval.to_csv(f"{eval_folder}/eval_benchmark-{SELECTED_SCENARIO}.csv")

## Error summary by prompt length

In [30]:

# Mean and median of the measured values, modelled values and model/measured
# ratios, grouped by prompt length.
summary_columns = [
    "TTFT_ms", "ttft_model_ms", "ttft_ratio",
    "ITL_ms", "itl_model_ms", "itl_ratio",
]
summary = df_eval.groupby("input_tokens")[summary_columns].agg(["mean", "median"])
summary


Unnamed: 0_level_0,TTFT_ms,TTFT_ms,ttft_model_ms,ttft_model_ms,ttft_ratio,ttft_ratio,ITL_ms,ITL_ms,itl_model_ms,itl_model_ms,itl_ratio,itl_ratio
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median
input_tokens,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
200,163.13375,166.54,293.379558,224.80272,1.39442,1.327159,30.0375,26.6,16.587333,14.7859,0.539858,0.549159
500,842.567143,181.64,572.477245,377.154878,1.608066,1.277365,27.451429,23.97,38.479156,33.3365,1.265097,1.390759
1000,697.2525,306.425,1488.954408,1143.883028,2.677928,2.129607,33.95875,30.04,39.930156,38.261482,1.059739,1.156333
5000,24444.085,4415.38,8091.356375,6216.150864,1.354099,0.557665,63.865,76.295,49.397945,65.423282,0.760807,0.733082
20000,342615.69375,228767.28,42064.190532,32315.639333,0.274625,0.151478,42.78875,49.515,56.514247,68.109004,1.264458,1.361228



## Plots

The helpers below make it easy to compare measured vs. modelled values for any prompt length.


In [31]:

def plot_metric_vs_concurrency(metric_name, observed_col, model_col, prompt_tokens):
    """Plot measured and modelled values of one metric against concurrency.

    Uses the rows of the module-level ``df_eval`` whose ``input_tokens`` equals
    ``prompt_tokens`` and draws ``observed_col`` and ``model_col`` as two lines.

    Raises:
        ValueError: if ``df_eval`` has no rows for ``prompt_tokens``.
    """
    matches_prompt = df_eval["input_tokens"] == prompt_tokens
    rows = df_eval[matches_prompt].sort_values("concurrency")
    if rows.empty:
        raise ValueError(f"No rows with input_tokens={prompt_tokens}")

    # Long format: one row per (concurrency, series) so each series gets its own line.
    long_form = pd.melt(
        rows,
        id_vars=["concurrency"],
        value_vars=[observed_col, model_col],
        var_name="series",
        value_name="value",
    )

    fig = px.line(
        long_form,
        x="concurrency",
        y="value",
        color="series",
        markers=True,
        title=f"{metric_name} vs concurrency (prompt={prompt_tokens})",
    )
    fig.update_layout(xaxis_title="Concurrency", yaxis_title=f"{metric_name} [ms]")
    fig.show()


# One figure per prompt length: first every TTFT plot, then every ITL plot.
prompt_lengths = sorted(df_eval["input_tokens"].unique())
for observed_col, model_col in (("TTFT_ms", "ttft_model_ms"), ("ITL_ms", "itl_model_ms")):
    for prompt_tokens in prompt_lengths:
        plot_metric_vs_concurrency(observed_col, observed_col, model_col, prompt_tokens)


In [None]:

def _plot_measured_vs_model(frame, observed_col, model_col, limit_col, metric_label):
    """Scatter modelled against measured values and overlay the y = x reference line.

    Args:
        frame: table with the measured/modelled columns plus ``input_tokens``,
            ``concurrency``, ``limit_col`` and ``running_tokens_utilization``.
        observed_col: column with the measured values (x axis).
        model_col: column with the modelled values (y axis).
        limit_col: column naming the limiting resource, shown on hover.
        metric_label: short metric name used in the title and axis labels.

    Returns:
        The figure, after it has been shown.
    """
    fig = px.scatter(
        frame,
        x=observed_col,
        y=model_col,
        color="input_tokens",
        hover_data=["concurrency", limit_col, "running_tokens_utilization"],
        title=f"{metric_label}: measured vs modelled",
    )
    # Points on this diagonal are rows where the model matches the measurement.
    lowest, highest = frame[observed_col].min(), frame[observed_col].max()
    fig.add_trace(
        go.Scatter(x=[lowest, highest], y=[lowest, highest], mode="lines", name="ideal")
    )
    fig.update_layout(
        xaxis_title=f"Measured {metric_label} [ms]",
        yaxis_title=f"Model {metric_label} [ms]",
    )
    fig.show()
    return fig


# px.scatter raises when hover_data names a column that is missing from the
# frame, and df_eval carries no "running_tokens_utilization" column. Derive it
# here from columns that do exist: uncapped running tokens
# (concurrency * (input + output tokens)) divided by the configured cap.
df_scatter = df_eval.assign(
    running_tokens_utilization=(
        df_eval["concurrency"]
        * (df_eval["input_tokens"] + df_eval["output_tokens"])
        / CONFIG.running_tokens_cap
    )
)

fig_ttft = _plot_measured_vs_model(
    df_scatter, "TTFT_ms", "ttft_model_ms", "prefill_limit", "TTFT"
)
fig_itl = _plot_measured_vs_model(
    df_scatter, "ITL_ms", "itl_model_ms", "decode_limit", "ITL"
)



---

Adjust `SELECTED_SCENARIO` or extend `EVAL_MODEL_BENCHMARK_CONFIGURATION` as additional benchmark files become available.
