
# Benchmark Validation

Select a benchmark scenario via `SELECTED_SCENARIO` to compare the measured time to first token (TTFT) and inter-token latency (ITL) with the analytical estimators.


In [19]:
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display

from utils.config import get_model_config, get_hardware_config
from utils.math_utils import (
    total_prefill_time,
    prefill_compute_time,
    prefill_memory_time,
    decode_compute_time,
    decode_memory_time,
    prefill_memory_HBM_wall_time,
)

# Limit DataFrame rendering to 20 rows and 20 columns so cell outputs stay compact.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 20)


@dataclass(frozen=True)
class EvalModelBenchmarkConfiguration:
    """Immutable settings describing one benchmark scenario to validate.

    An instance bundles the location of the measured-benchmark CSV with the
    lookup keys and token caps that the notebook passes to the analytical
    estimators. ``frozen=True`` makes instances read-only after construction.
    """
    # CSV file with the benchmark measurements; read with ``pd.read_csv``.
    data_path: Path
    # Key passed to ``get_model_config`` to obtain the model description.
    model_name: str
    # Value that the CSV's "hardware" column is required to contain (single label).
    csv_hardware_label: str
    # Key passed to ``get_hardware_config`` to obtain the hardware description.
    hardware_key: str
    # Passed as ``running_tokens_cap`` to ``decode_memory_time``; also used to
    # clip the reported ``decode_running_tokens`` column.
    decode_running_tokens_cap: float
    # Passed as ``running_tokens_cap`` to ``total_prefill_time``; also used to
    # clip the reported ``prefill_running_tokens`` column.
    prefill_running_tokens_cap: float

In [20]:

# Folder (relative to the notebook) that holds the measured benchmark CSVs.
_BENCHMARK_DIR = Path("tested_benchmarks")

# Registry of available benchmark scenarios, keyed by a short scenario id.
EVAL_MODEL_BENCHMARK_CONFIGURATION = {
    "llama33_70b_fp8_tp2": EvalModelBenchmarkConfiguration(
        data_path=_BENCHMARK_DIR / "nim_llama33_70b_v1.8.0_2xH100_fp8TP2.csv",
        model_name="llama33_70B",
        csv_hardware_label="H100_80G",
        hardware_key="H100_80GB_FP8_TP2",
        decode_running_tokens_cap=303_000.0,
        prefill_running_tokens_cap=100_000.0,
    ),
    "llama31_8b_fp8_tp1": EvalModelBenchmarkConfiguration(
        data_path=_BENCHMARK_DIR / "nim_llama3.1_8b_v1.8.0_1xH100_80GB.csv",
        model_name="llama31_8B",
        csv_hardware_label="H100_80G",
        hardware_key="H100_80GB_FP8_TP1",
        decode_running_tokens_cap=750_000.0,
        prefill_running_tokens_cap=100_000.0,
    ),
}

# Scenario evaluated by every cell below; must be a key of the registry above.
SELECTED_SCENARIO = "llama31_8b_fp8_tp1"
CONFIG = EVAL_MODEL_BENCHMARK_CONFIGURATION[SELECTED_SCENARIO]

# Echo the resolved model and hardware settings so the run is self-describing.
print(f"Scenario: {SELECTED_SCENARIO}")
print(f"running on model  configuration: {get_model_config(CONFIG.model_name)}")
print(f"running on hardware: {get_hardware_config(CONFIG.hardware_key)}")


Scenario: llama31_8b_fp8_tp1
running on model  configuration: ModelConfig(hidden_size=4096, num_layers=32, expansion_ratio=3.5, model_size=8000000000.0)
running on hardware: HardwareConfig(flops_per_second=3958000000000000.0, memory_bandwidth=3350000000000.0, dtype_bytes=1.0, activation_io_multiplier=12.0, PCIe_bandwidth=120000000000.0, HBM_size=85900000000.0, gpu_count=1)


## Load configuration and raw measurements

In [21]:

# Read the measured benchmark rows for the selected scenario.
df_raw = pd.read_csv(CONFIG.data_path)

# The CSV must carry exactly one hardware label, and it must be the expected one.
hardware_values = df_raw["hardware"].unique()
_single_expected_label = (
    len(hardware_values) == 1 and hardware_values[0] == CONFIG.csv_hardware_label
)
if not _single_expected_label:
    raise ValueError(
        f"Unexpected hardware labels {hardware_values}. Expected '{CONFIG.csv_hardware_label}'."
    )

# Tag every row with the hardware key used to look up the hardware configuration.
df_raw = df_raw.assign(hardware_key=CONFIG.hardware_key)

# Resolve the model / hardware descriptions consumed by the estimators below.
model_cfg = get_model_config(CONFIG.model_name)
hardware_cfg = get_hardware_config(CONFIG.hardware_key)

print(f"Loaded rows: {len(df_raw)}")
display(df_raw.head())


Loaded rows: 40


Unnamed: 0,version,hardware,hardware_count,data_type,input_tokens,output_tokens,concurrency,TTFT_ms,ITL_ms,Throughput_tokens_per_s,hardware_key
0,1.8.0,H100_80G,1,fp8 TP1,200,200,1,10.26,4.63,214.56,H100_80GB_FP8_TP1
1,1.8.0,H100_80G,1,fp8 TP1,200,200,5,16.81,4.58,1076.88,H100_80GB_FP8_TP1
2,1.8.0,H100_80G,1,fp8 TP1,200,200,25,36.34,5.19,4678.88,H100_80GB_FP8_TP1
3,1.8.0,H100_80G,1,fp8 TP1,200,200,50,67.41,5.91,8018.23,H100_80GB_FP8_TP1
4,1.8.0,H100_80G,1,fp8 TP1,200,200,100,119.97,7.57,12214.97,H100_80GB_FP8_TP1



## Compute analytical estimates

For each benchmark row (one prompt length at one concurrency level) we compute the prefill (TTFT) and decode (ITL) bottleneck estimates, keep both the compute and memory components, and convert them to milliseconds.


In [22]:
from utils.parameter_fit import TUNABLE_PARAMETER_NAMES, fit_prefill_decode_parameters

# Fit the tunable estimator parameters against the measured rows. The call
# returns the fitted parameter values and a report that is indexed by section name.
BEST_FIT_PARAMS, FIT_REPORT = fit_prefill_decode_parameters(
    df_raw, model_cfg, hardware_cfg, CONFIG, tunable_params=TUNABLE_PARAMETER_NAMES
)

# Show the report sections in a fixed order: parameters, errors, loss trace.
for report_section in ("parameter_summary", "error_summary", "loss_history"):
    display(FIT_REPORT[report_section])


divide by zero encountered in log


divide by zero encountered in log



Unnamed: 0,function,parameter,baseline,optimized,delta_pct
0,total_prefill_time,running_tokens_cap,100000.0,100000.0,1.455192e-14
1,total_prefill_time,prefill_mult_factor,1.0,0.582828,-41.71722
2,decode_memory_time,running_tokens_cap,750000.0,181199.770575,-75.84003
3,decode_memory_time,decode_time_min,0.0,0.0,


Unnamed: 0,label,loss,ttft_mape_pct,ttft_rmse_ms,itl_mape_pct,itl_rmse_ms
0,baseline,0.697406,72.184411,49996.648383,65.71003,15.077534
1,optimized,0.447189,77.663009,52717.051868,38.984766,10.943172


Unnamed: 0,iteration,loss
0,0,0.697406
1,1,0.688300
2,2,0.679697
3,3,0.671619
4,4,0.664083
...,...,...
38,38,0.447189
39,39,0.447189
40,40,0.447189
41,41,0.447189


In [23]:
# Fitted parameter sets per estimator (empty mapping when the fit has no entry).
# NOTE: estimate_row below does not read these; it uses hand-set values instead.
prefill_param_values, decode_param_values = (
    BEST_FIT_PARAMS.get(estimator_name, {})
    for estimator_name in ("total_prefill_time", "decode_memory_time")
)


def estimate_row(row):
    """Compute the analytical TTFT / ITL estimates for one benchmark row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``"concurrency"``, ``"input_tokens"`` and
        ``"output_tokens"``; each is converted with ``float``.

    Returns
    -------
    pd.Series
        Estimator outputs multiplied by 1e3 (the ``*_ms`` entries) plus the
        prefill / decode running-token counts clipped to the caps in ``CONFIG``.
        ``itl_model_ms`` is the larger of the decode compute and memory terms.
    """

    def to_ms(value):
        # Estimator outputs are scaled by 1e3 for the *_ms columns.
        return float(value * 1e3)

    concurrency = float(row["concurrency"])
    prompt_len = float(row["input_tokens"])
    # Decode is evaluated at the full sequence length: prompt plus generated tokens.
    decode_len = prompt_len + float(row["output_tokens"])

    # --- prefill (TTFT) side ---
    prefill_compute = prefill_compute_time(concurrency, prompt_len, model_cfg, hardware_cfg)
    prefill_memory = prefill_memory_time(concurrency, prompt_len, model_cfg, hardware_cfg)
    prefill_hbm_wall = prefill_memory_HBM_wall_time(
        concurrency, prompt_len, model_cfg, hardware_cfg
    )
    # Hand-set prefill parameters (marked "best fitting" by the notebook author).
    prefill_total = total_prefill_time(
        concurrency,
        prompt_len,
        model_cfg,
        hardware_cfg,
        running_tokens_cap=CONFIG.prefill_running_tokens_cap,
        prefill_mult_factor=1.5,
        offload_mult_factor=1,
    )

    # --- decode (ITL) side ---
    decode_compute = decode_compute_time(concurrency, decode_len, model_cfg, hardware_cfg)
    # Hand-set decode parameters; the cap comes from CONFIG, not from the fit.
    decode_memory = decode_memory_time(
        concurrency,
        decode_len,
        model_cfg,
        hardware_cfg,
        running_tokens_cap=CONFIG.decode_running_tokens_cap,
        decode_time_min=0.005,
        bytes_mult_factor=0.4,
    )

    decode_compute_ms = to_ms(decode_compute)
    decode_memory_ms = to_ms(decode_memory)

    # Running tokens reported in the table are clipped to the configured caps.
    prefill_tokens_capped = min(
        concurrency * prompt_len, float(CONFIG.prefill_running_tokens_cap)
    )
    decode_tokens_capped = min(
        concurrency * decode_len, float(CONFIG.decode_running_tokens_cap)
    )

    estimates = {
        "prefill_compute_ms": to_ms(prefill_compute),
        "prefill_memory_ms": to_ms(prefill_memory),
        "prefill_memory_HBM_wall_ms": to_ms(prefill_hbm_wall),
        "ttft_model_ms": to_ms(prefill_total),
        "prefill_running_tokens": prefill_tokens_capped,
        "decode_compute_ms": decode_compute_ms,
        "decode_memory_ms": decode_memory_ms,
        "itl_model_ms": max(decode_compute_ms, decode_memory_ms),
        "decode_running_tokens": decode_tokens_capped,
    }
    return pd.Series(estimates)



# Attach the per-row estimates, derive absolute and relative errors against the
# measurements, then order rows by prompt length and concurrency.
df_eval = (
    df_raw.join(df_raw.apply(estimate_row, axis=1))
    .assign(
        ttft_delta_ms=lambda frame: frame["ttft_model_ms"] - frame["TTFT_ms"],
        itl_delta_ms=lambda frame: frame["itl_model_ms"] - frame["ITL_ms"],
        ttft_memory_wall_vs_compute_ratio=lambda frame: (
            frame["prefill_memory_HBM_wall_ms"] / frame["ttft_model_ms"]
        ),
        # The relative errors reuse the delta columns created just above.
        ttft_rel_error_pct=lambda frame: (frame["ttft_delta_ms"] / frame["TTFT_ms"]) * 100,
        itl_rel_error_pct=lambda frame: (frame["itl_delta_ms"] / frame["ITL_ms"]) * 100,
    )
    .sort_values(["input_tokens", "concurrency"])
    .reset_index(drop=True)
)

# Columns shown in the summary table: scenario keys, TTFT comparison, ITL comparison.
_summary_columns = [
    "input_tokens",
    "concurrency",
    "decode_running_tokens",
    "TTFT_ms",
    "ttft_model_ms",
    "ttft_delta_ms",
    "prefill_memory_HBM_wall_ms",
    "ttft_memory_wall_vs_compute_ratio",
    "ITL_ms",
    "itl_model_ms",
    "itl_delta_ms",
]
display(df_eval[_summary_columns])


Unnamed: 0,input_tokens,concurrency,decode_running_tokens,TTFT_ms,ttft_model_ms,ttft_delta_ms,prefill_memory_HBM_wall_ms,ttft_memory_wall_vs_compute_ratio,ITL_ms,itl_model_ms,itl_delta_ms
0,200,1,400.0,10.26,0.903185,-9.356815,0.000000,0.000000,4.63,5.717883,1.087883
1,200,5,2000.0,16.81,4.515924,-12.294076,0.000000,0.000000,4.58,5.768841,1.188841
2,200,25,10000.0,36.34,22.579619,-13.760381,0.000000,0.000000,5.19,6.023629,0.833629
3,200,50,20000.0,67.41,45.159238,-22.250762,0.000000,0.000000,5.91,6.342115,0.432115
4,200,100,40000.0,119.97,90.318476,-29.651524,0.000000,0.000000,7.57,6.979085,-0.590915
...,...,...,...,...,...,...,...,...,...,...,...
35,20000,50,750000.0,11071.41,11520.804100,449.394100,3070.733333,0.266538,30.73,29.188195,-1.541805
36,20000,100,750000.0,78187.07,24339.941534,-53847.128466,7439.800000,0.305662,31.08,29.188195,-1.891805
37,20000,150,750000.0,145290.54,37159.078968,-108131.461032,11808.866667,0.317792,31.31,29.188195,-2.121805
38,20000,200,750000.0,211735.71,49978.216402,-161757.493598,16177.933333,0.323700,31.46,29.188195,-2.271805


In [24]:
# Save the evaluation table (measurements, estimates and errors) for the
# selected scenario as a CSV file.
eval_folder = "evaluations_results"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists; exist_ok=True keeps re-runs of this cell idempotent.
Path(eval_folder).mkdir(parents=True, exist_ok=True)
df_eval.to_csv(f"{eval_folder}/eval_benchmark-{SELECTED_SCENARIO}.csv")


## Plots

The helpers below make it easy to compare measured vs. modelled values.


In [25]:
# Relative-error heatmaps over prompt tokens (L) and concurrency (S); every cell
# is a single benchmark row, so no aggregation is performed.
heatmap_metrics = df_eval.loc[
    :, ["input_tokens", "concurrency", "ttft_rel_error_pct", "itl_rel_error_pct"]
].copy()

# Unstacking requires each (input_tokens, concurrency) pair to appear only once.
if heatmap_metrics.duplicated(subset=["input_tokens", "concurrency"]).any():
    raise ValueError("Duplicate input_tokens/concurrency rows detected; heatmap expects unique combinations.")


def _relative_error_grid(error_column):
    """Pivot one error column into a grid: rows = input_tokens, columns = concurrency."""
    grid = heatmap_metrics.set_index(["input_tokens", "concurrency"])[error_column]
    grid = grid.unstack("concurrency")
    return grid.sort_index(axis=0).sort_index(axis=1)


def _relative_error_figure(grid, color_label, title):
    """Build an annotated heatmap of ``grid`` with both axes treated as categories."""
    fig = px.imshow(
        grid,
        labels={"x": "Concurrency (S)", "y": "Prompt tokens (L)", "color": color_label},
        title=title,
        aspect="auto",
        text_auto=".1f",
    )
    # Categorical axes space the tested values evenly instead of numerically.
    fig.update_xaxes(type="category")
    fig.update_yaxes(type="category")
    return fig


ttft_heatmap_data = _relative_error_grid("ttft_rel_error_pct")
fig_ttft_heatmap = _relative_error_figure(
    ttft_heatmap_data,
    color_label="TTFT relative error (%)",
    title="TTFT relative error heatmap",
)
fig_ttft_heatmap.show()

itl_heatmap_data = _relative_error_grid("itl_rel_error_pct")
fig_itl_heatmap = _relative_error_figure(
    itl_heatmap_data,
    color_label="ITL relative error (%)",
    title="ITL relative error heatmap",
)
fig_itl_heatmap.show()


In [26]:

def plot_metric_vs_concurrency(metric_name, observed_col, model_col, prompt_tokens):
    """Plot measured and modelled values against concurrency for one prompt length.

    Parameters
    ----------
    metric_name : str
        Label used in the figure title and the y-axis title.
    observed_col, model_col : str
        ``df_eval`` columns holding the measured and the modelled values.
    prompt_tokens
        Value of ``input_tokens`` selecting the rows to plot.

    Raises
    ------
    ValueError
        If ``df_eval`` has no row with the requested ``input_tokens``.
    """
    rows_for_prompt = df_eval.loc[df_eval["input_tokens"] == prompt_tokens]
    if rows_for_prompt.empty:
        raise ValueError(f"No rows with input_tokens={prompt_tokens}")

    # Long format: one row per (concurrency, series), so each column becomes a line.
    long_form = rows_for_prompt.sort_values("concurrency").melt(
        id_vars=["concurrency"],
        value_vars=[observed_col, model_col],
        var_name="series",
        value_name="value",
    )

    fig = px.line(
        long_form,
        x="concurrency",
        y="value",
        color="series",
        markers=True,
        title=f"{metric_name} vs concurrency (prompt={prompt_tokens})",
    )
    fig.update_layout(xaxis_title="Concurrency", yaxis_title=f"{metric_name} [ms]")
    fig.show()


# One figure per prompt length: all TTFT figures first, then all ITL figures.
_prompt_lengths = sorted(df_eval["input_tokens"].unique())
for _measured_col, _modelled_col in (("TTFT_ms", "ttft_model_ms"), ("ITL_ms", "itl_model_ms")):
    for prompt_tokens in _prompt_lengths:
        plot_metric_vs_concurrency(_measured_col, _measured_col, _modelled_col, prompt_tokens)


In [27]:

# Distinct concurrency levels in ascending order; one figure per level and metric.
_distinct_concurrency = df_eval["concurrency"].unique()
context_batches = sorted(_distinct_concurrency)


def plot_metric_vs_context(metric_name, observed_col, model_col, concurrency):
    """Plot measured and modelled values against prompt length for one concurrency.

    Parameters
    ----------
    metric_name : str
        Label used in the figure title and the y-axis title.
    observed_col, model_col : str
        ``df_eval`` columns holding the measured and the modelled values.
    concurrency
        Value of ``concurrency`` selecting the rows to plot.

    Raises
    ------
    ValueError
        If ``df_eval`` has no row with the requested ``concurrency``.
    """
    rows_for_level = df_eval.loc[df_eval["concurrency"] == concurrency]
    if rows_for_level.empty:
        raise ValueError(f'No rows with concurrency={concurrency}')

    # Long format: one row per (input_tokens, series), so each column becomes a line.
    long_form = rows_for_level.sort_values("input_tokens").melt(
        id_vars=["input_tokens"],
        value_vars=[observed_col, model_col],
        var_name="series",
        value_name="value",
    )

    line_kwargs = dict(x="input_tokens", y="value", color="series", markers=True)
    fig = px.line(
        long_form,
        title=f"{metric_name} vs context length (concurrency={concurrency})",
        **line_kwargs,
    )
    fig.update_layout(xaxis_title="Prompt tokens", yaxis_title=f"{metric_name} [ms]")
    fig.show()


# One figure per concurrency level: all TTFT figures first, then all ITL figures.
for _measured_col, _modelled_col in (("TTFT_ms", "ttft_model_ms"), ("ITL_ms", "itl_model_ms")):
    for concurrency in context_batches:
        plot_metric_vs_context(_measured_col, _measured_col, _modelled_col, concurrency)



---

Adjust `SELECTED_SCENARIO` or extend `EVAL_MODEL_BENCHMARK_CONFIGURATION` as additional benchmark files become available.
