
# Step 5: Measure Inference Performance of Quantized ONNX Model on CPU

This notebook benchmarks the inference performance of different ONNX model variants on CPU, including:
- Original FP32 model
- Dynamically quantized INT8 model
- Statically quantized INT8 model

Metrics include:
- Mean and 95th-percentile (P95) latency of single-sample inference (ms)
- Throughput of batched inference (samples/sec)
- Model size on disk (MB)


In [None]:

!pip install onnxruntime matplotlib --quiet


In [None]:

import os
import time
import numpy as np
import matplotlib.pyplot as plt
import onnxruntime as ort


In [None]:

# Display label -> path of the ONNX file to benchmark, in reporting order.
MODELS = dict(
    [
        ("FP32", "ssept.onnx"),
        ("INT8-Dynamic", "ssept_quant_dynamic.onnx"),
        ("INT8-Static", "ssept_quant_static.onnx"),
    ]
)

# Shape of the synthetic inputs: feature width and rows per throughput batch.
# Change INPUT_DIM to match your model's input dimension.
INPUT_DIM, BATCH_SIZE = 768, 128


In [None]:

def benchmark_model(
    model_path,
    input_dim=768,
    batch_size=128,
    *,
    latency_runs=100,
    throughput_runs=20,
    warmup_runs=5,
):
    """Benchmark an ONNX model on CPU using random float32 inputs.

    The model's first input is fed arrays of shape ``(rows, input_dim)``:
    one row at a time for the latency test and ``batch_size`` rows at a time
    for the throughput test.

    Args:
        model_path: Path to the ``.onnx`` file.
        input_dim: Width of each synthetic input row.
        batch_size: Number of rows per call in the throughput test.
        latency_runs: Number of timed single-sample inferences.
        throughput_runs: Number of timed batched inferences.
        warmup_runs: Untimed single-sample inferences executed first so that
            one-off start-up cost of the first calls does not skew the
            latency statistics. Use 0 to disable warm-up.

    Returns:
        ``None`` (after printing a message) if ``model_path`` does not exist;
        otherwise a dict with keys ``latency_ms`` (mean), ``p95_latency_ms``,
        ``throughput`` (samples/sec) and ``model_size_MB`` (file size in
        units of 10**6 bytes).

    Raises:
        ValueError: If ``latency_runs`` or ``throughput_runs`` is less than 1,
            or ``warmup_runs`` is negative.
    """
    if latency_runs < 1 or throughput_runs < 1:
        raise ValueError("latency_runs and throughput_runs must be at least 1")
    if warmup_runs < 0:
        raise ValueError("warmup_runs must not be negative")

    if not os.path.exists(model_path):
        print(f"Model not found: {model_path}")
        return None

    session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])
    input_name = session.get_inputs()[0].name

    def run(rows):
        session.run(None, {input_name: rows})

    def random_rows(count):
        return np.random.rand(count, input_dim).astype("float32")

    # Untimed warm-up: the first calls on a fresh session are not
    # representative of steady-state latency.
    for _ in range(warmup_runs):
        run(random_rows(1))

    # Latency test (single sample). perf_counter is monotonic and
    # high-resolution, unlike time.time(), so sub-millisecond runs are
    # measured reliably. Input generation is kept outside the timed region.
    latencies = []
    for _ in range(latency_runs):
        sample = random_rows(1)
        start = time.perf_counter()
        run(sample)
        latencies.append(time.perf_counter() - start)

    # Throughput test (batch inference), reusing one batch for every call.
    batch_input = random_rows(batch_size)
    if warmup_runs:
        # Warm up the batched input shape as well before timing it.
        run(batch_input)
    start = time.perf_counter()
    for _ in range(throughput_runs):
        run(batch_input)
    total_time = time.perf_counter() - start
    throughput = (batch_size * throughput_runs) / total_time

    return {
        "latency_ms": np.mean(latencies) * 1000,
        "p95_latency_ms": np.percentile(latencies, 95) * 1000,
        "throughput": throughput,
        "model_size_MB": os.path.getsize(model_path) / 1e6,
    }


In [None]:

# Benchmark every configured model. benchmark_model returns None when the
# model file is missing; such entries are skipped so that only real metric
# dicts reach the results table and plot built from `results`.
results = {}
for name, path in MODELS.items():
    print(f"Testing {name} model...")
    metrics = benchmark_model(path, INPUT_DIM, BATCH_SIZE)
    if metrics is not None:
        results[name] = metrics


In [None]:

import pandas as pd

# Metric key -> column header, listed in the order the columns are displayed.
_COLUMN_LABELS = {
    "model_size_MB": "Model Size (MB)",
    "latency_ms": "Mean Latency (ms)",
    "p95_latency_ms": "P95 Latency (ms)",
    "throughput": "Throughput (samples/sec)",
}

# One row per model (transpose of the per-model metric dicts), rounded to
# two decimals, with columns selected and relabelled for presentation.
df = pd.DataFrame(results).T.round(2)
df = df[list(_COLUMN_LABELS)].rename(columns=_COLUMN_LABELS)
df


In [None]:

# The metrics have different units and magnitudes (MB, ms, samples/sec), so
# each one gets its own subplot with an independent y-axis; on a shared axis
# the throughput bars would dwarf the size and latency bars.
axes = df.plot(
    kind="bar",
    subplots=True,
    layout=(1, len(df.columns)),
    sharex=False,
    sharey=False,
    figsize=(12, 6),
    rot=0,
    legend=False,
    title=list(df.columns),  # one title (with its unit) per subplot
)
for ax in axes.ravel():
    ax.grid(True)
plt.suptitle("ONNX Model Inference Performance on CPU")
plt.tight_layout()
plt.show()
