# Benchmark Validation — LLaMA 3.3 70B

Compare the measured time-to-first-token (TTFT) and inter-token latency (ITL) against the analytical estimators for the 2×H100 FP8 benchmark in `tested_benchmarks`.

In [1]:

import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go

from utils.config import get_model_config, get_hardware_config
from utils.math_utils import (
    prefill_compute_time,
    prefill_memory_time,
    decode_compute_time,
    decode_memory_time,
)

# Cap how much of a DataFrame is rendered so outputs stay readable.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 20)


## Load configuration and raw measurements

In [2]:

DATA_PATH = Path("tested_benchmarks/nim_llama33_70b_v1.8.0_2xH100_fp8TP2.csv")
MODEL_NAME = "llama33_70B"
# Maps the `hardware` label used in the CSV to the key passed to
# get_hardware_config.
HARDWARE_LOOKUP = {
    "H100_80G": "H100_80GB_FP8_TP2",
}

df_raw = pd.read_csv(DATA_PATH)
df_raw["hardware_key"] = df_raw["hardware"].map(HARDWARE_LOOKUP)

# Fail fast on hardware labels that have no lookup entry.
unmapped = df_raw["hardware_key"].isna()
if unmapped.any():
    missing = df_raw.loc[unmapped, "hardware"].unique()
    raise ValueError(f"Missing HARDWARE_LOOKUP entries for: {missing}")

# A single hardware_cfg is applied to every row below, so the file must
# describe exactly one hardware configuration (this also rejects an empty file).
hardware_keys = df_raw["hardware_key"].unique()
if len(hardware_keys) != 1:
    raise ValueError(
        f"Expected exactly one hardware configuration in {DATA_PATH}, "
        f"found {len(hardware_keys)}: {list(hardware_keys)}"
    )

model_cfg = get_model_config(MODEL_NAME)
hardware_cfg = get_hardware_config(hardware_keys[0])
display(df_raw.head())


Unnamed: 0,version,hardware,hardware count,data_type,input_tokens,output_tokens,concurrency,TTFT_ms,ITL_ms,Throughput_tokens_per_s,hardware_key
0,1.8.0,H100_80G,2,fp8 TP2,5000,500,1,406.77,18.58,51.67,H100_80GB_FP8_TP2
1,1.8.0,H100_80G,2,fp8 TP2,5000,500,5,546.7,21.83,218.48,H100_80GB_FP8_TP2
2,1.8.0,H100_80G,2,fp8 TP2,5000,500,25,688.69,38.75,623.4,H100_80GB_FP8_TP2
3,1.8.0,H100_80G,2,fp8 TP2,5000,500,50,834.51,59.73,814.37,H100_80GB_FP8_TP2
4,1.8.0,H100_80G,2,fp8 TP2,5000,500,100,7996.25,92.91,917.26,H100_80GB_FP8_TP2


## Compute analytical estimates

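For each scenario we estimate the compute-bound and memory-bound times for prefill (TTFT) and decode (ITL), convert them to milliseconds, and take the larger of the two as the modelled value. Both components are kept so the limiting factor can be inspected.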

In [8]:

def estimate_row(row):
    """Return the analytical TTFT/ITL estimates, in milliseconds, for one row.

    Parameters
    ----------
    row : pandas.Series or mapping
        One benchmark scenario. Must provide ``concurrency`` and
        ``input_tokens``.

    Returns
    -------
    pandas.Series
        ``prefill_compute_ms``, ``prefill_memory_ms``, ``ttft_model_ms``,
        ``decode_compute_ms``, ``decode_memory_ms`` and ``itl_model_ms``.
        Each ``*_model_ms`` value is the larger of its compute and memory
        components.

    Notes
    -----
    Reads the notebook-level ``model_cfg`` and ``hardware_cfg``.
    The helper results are multiplied by 1e3 to obtain milliseconds
    (presumably the helpers return seconds -- TODO confirm in utils.math_utils).
    """
    S = float(row["concurrency"])
    L_prompt = float(row["input_tokens"])

    prefill_compute = prefill_compute_time(S, L_prompt, model_cfg, hardware_cfg)
    prefill_memory = prefill_memory_time(S, L_prompt, model_cfg, hardware_cfg)
    # Decode is evaluated with the prompt length only; the generated tokens
    # are not added to the length passed to the decode estimators.
    # NOTE(review): this ignores any growth during generation -- confirm
    # that is intended.
    decode_compute = decode_compute_time(S, L_prompt, model_cfg, hardware_cfg)
    decode_memory = decode_memory_time(S, L_prompt, model_cfg, hardware_cfg)

    ttft_compute_ms = float(prefill_compute * 1e3)
    ttft_memory_ms = float(prefill_memory * 1e3)
    itl_compute_ms = float(decode_compute * 1e3)
    itl_memory_ms = float(decode_memory * 1e3)

    return pd.Series({
        "prefill_compute_ms": ttft_compute_ms,
        "prefill_memory_ms": ttft_memory_ms,
        # The slower of the two terms is taken as the modelled latency.
        "ttft_model_ms": max(ttft_compute_ms, ttft_memory_ms),
        "decode_compute_ms": itl_compute_ms,
        "decode_memory_ms": itl_memory_ms,
        "itl_model_ms": max(itl_compute_ms, itl_memory_ms),
    })


# Attach the per-row estimates to the measurements, then derive the
# model/measured ratio and the signed error (model minus measured) per metric.
df_eval = (
    df_raw
    .join(df_raw.apply(estimate_row, axis=1))
    .assign(
        ttft_ratio=lambda d: d["ttft_model_ms"] / d["TTFT_ms"],
        itl_ratio=lambda d: d["itl_model_ms"] / d["ITL_ms"],
        ttft_delta_ms=lambda d: d["ttft_model_ms"] - d["TTFT_ms"],
        itl_delta_ms=lambda d: d["itl_model_ms"] - d["ITL_ms"],
    )
)

display(
    df_eval[
        [
            "input_tokens",
            "concurrency",
            "TTFT_ms",
            "ttft_model_ms",
            "ttft_delta_ms",
            "ITL_ms",
            "itl_model_ms",
            "itl_delta_ms",
        ]
    ]
)


Unnamed: 0,input_tokens,concurrency,TTFT_ms,ttft_model_ms,ttft_delta_ms,ITL_ms,itl_model_ms,itl_delta_ms
0,5000,1,406.77,82.882012,-323.887988,18.58,9.793817,-8.786183
1,5000,5,546.70,414.410058,-132.289942,21.83,13.711892,-8.118108
2,5000,25,688.69,2072.050288,1383.360288,38.75,33.302265,-5.447735
3,5000,50,834.51,4144.100576,3309.590576,59.73,57.790232,-1.939768
4,5000,100,7996.25,8288.201152,291.951152,92.91,106.766165,13.856165
...,...,...,...,...,...,...,...,...
34,20000,50,123969.71,21543.759555,-102425.950445,49.22,204.512620,155.292620
35,20000,100,333564.85,43087.519111,-290477.330889,49.81,400.210941,350.400941
36,20000,150,543226.40,64631.278666,-478595.121334,50.01,595.909262,545.899262
37,20000,200,753417.03,86175.038221,-667241.991779,50.06,791.607583,741.547583


## Error summary by prompt length

In [6]:

# Mean and median of the measured values, the modelled values and their
# ratio, aggregated per prompt length.
summary = (
    df_eval[
        [
            "input_tokens",
            "TTFT_ms",
            "ttft_model_ms",
            "ttft_ratio",
            "ITL_ms",
            "itl_model_ms",
            "itl_ratio",
        ]
    ]
    .groupby("input_tokens")
    .agg(["mean", "median"])
)
summary


Unnamed: 0_level_0,TTFT_ms,TTFT_ms,ttft_model_ms,ttft_model_ms,ttft_ratio,ttft_ratio,ITL_ms,ITL_ms,itl_model_ms,itl_model_ms,itl_ratio,itl_ratio
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median
input_tokens,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
200,163.13375,166.54,293.379558,224.80272,1.39442,1.327159,30.0375,26.6,12.76766,11.851452,0.437916,0.447629
500,842.567143,181.64,572.477245,377.154878,1.608066,1.277365,27.451429,23.97,16.338139,13.773515,0.572865,0.574615
1000,697.2525,306.425,1488.954408,1143.883028,2.677928,2.129607,33.95875,30.04,28.046351,23.589243,0.751227,0.777485
5000,24444.085,4415.38,8091.356375,6216.150864,1.354099,0.557665,63.865,76.295,104.439808,82.278198,1.341296,1.05833
20000,342615.69375,228767.28,42064.190532,32315.639333,0.274625,0.151478,42.78875,49.515,390.915271,302.36178,7.949816,6.094911


## Plots

The helpers below make it easy to compare measured vs. modelled values for any prompt length.

In [14]:

def plot_metric_vs_concurrency(metric_name, observed_col, model_col, prompt_tokens):
    """Show measured and modelled values against concurrency for one prompt length.

    Reads the notebook-level ``df_eval``. ``metric_name`` is used in the figure
    title and y-axis label; ``observed_col`` and ``model_col`` name the two
    ``df_eval`` columns drawn as separate lines.

    Raises:
        ValueError: if no row of ``df_eval`` has ``input_tokens == prompt_tokens``.
    """
    matches_prompt = df_eval["input_tokens"] == prompt_tokens
    rows = df_eval.loc[matches_prompt].sort_values("concurrency")
    if rows.empty:
        raise ValueError(f"No rows with input_tokens={prompt_tokens}")

    # Long format: one record per (concurrency, series) so plotly can colour
    # the two series separately.
    long_form = pd.melt(
        rows,
        id_vars=["concurrency"],
        value_vars=[observed_col, model_col],
        var_name="series",
        value_name="value",
    )

    fig = px.line(
        long_form,
        x="concurrency",
        y="value",
        color="series",
        markers=True,
        title=f"{metric_name} vs concurrency (prompt={prompt_tokens})",
    )
    fig.update_layout(
        xaxis_title="Concurrency",
        yaxis_title=f"{metric_name} [ms]",
    )
    fig.show()

# One figure per (metric, prompt length): all TTFT figures first, then ITL.
# The metric label omits the "_ms" suffix because the helper already appends
# " [ms]" to the y-axis title.
for metric_name, observed_col, model_col in (
    ("TTFT", "TTFT_ms", "ttft_model_ms"),
    ("ITL", "ITL_ms", "itl_model_ms"),
):
    for prompt_tokens in sorted(df_eval["input_tokens"].unique()):
        plot_metric_vs_concurrency(metric_name, observed_col, model_col, prompt_tokens)


In [None]:

def _scatter_measured_vs_model(data, measured_col, model_col, limit_col, metric_label):
    """Build a measured-vs-modelled scatter plot with a y = x reference line.

    ``data`` must contain ``measured_col``, ``model_col``, ``limit_col``,
    ``input_tokens`` and ``concurrency``. Returns the plotly figure.
    """
    fig = px.scatter(
        data,
        x=measured_col,
        y=model_col,
        color="input_tokens",
        hover_data=["concurrency", limit_col],
        title=f"{metric_label}: measured vs modelled",
    )
    # Span the reference line over both axes so it stays visible even when
    # the modelled values fall outside the measured range.
    lo = min(data[measured_col].min(), data[model_col].min())
    hi = max(data[measured_col].max(), data[model_col].max())
    fig.add_trace(go.Scatter(x=[lo, hi], y=[lo, hi], mode="lines", name="ideal"))
    fig.update_layout(
        xaxis_title=f"Measured {metric_label} [ms]",
        yaxis_title=f"Model {metric_label} [ms]",
    )
    return fig


# Label which term (compute or memory) sets each modelled value. These hover
# columns are derived here because no earlier cell creates them; a copy is
# used so df_eval itself is not mutated.
df_scatter = df_eval.assign(
    prefill_limit=np.where(
        df_eval["prefill_compute_ms"] >= df_eval["prefill_memory_ms"], "compute", "memory"
    ),
    decode_limit=np.where(
        df_eval["decode_compute_ms"] >= df_eval["decode_memory_ms"], "compute", "memory"
    ),
)

fig_ttft = _scatter_measured_vs_model(
    df_scatter, "TTFT_ms", "ttft_model_ms", "prefill_limit", "TTFT"
)
fig_ttft.show()

fig_itl = _scatter_measured_vs_model(
    df_scatter, "ITL_ms", "itl_model_ms", "decode_limit", "ITL"
)
fig_itl.show()


---

Adjust `DATA_PATH`, `MODEL_NAME`, `HARDWARE_LOOKUP`, or the plotting helpers above to explore additional benchmark files.