# 4. Inference Performance Evaluation for Quantized SSE-PT Model

In [None]:

!pip install onnx onnxruntime matplotlib --quiet


In [None]:

import os
import numpy as np
import time
import matplotlib.pyplot as plt
import onnxruntime as ort


In [None]:

# Choose which model to evaluate
model_path = "ssept_quant_dynamic.onnx"  # Or use ssept_quant_static.onnx

# Validate with an explicit exception rather than `assert`: assertions are
# removed when Python runs with -O, which would silently skip this check.
if not os.path.isfile(model_path):
    raise FileNotFoundError(f"Quantized ONNX model not found: {model_path}")

# Create inference session (restricted to the CPU execution provider)
session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])

# Only the first graph input is fed by the benchmark cells below.
input_name = session.get_inputs()[0].name
print("Input name:", input_name)


In [None]:

# Synthetic workload for the throughput test: one batch of 128 users, each
# represented by a random float32 feature vector drawn uniformly from [0, 1).
BATCH_SIZE = 128
DIMS = 768  # Replace with your model's input dimension
inputs = np.random.random_sample(size=(BATCH_SIZE, DIMS)).astype(np.float32)


In [None]:

# Single-sample (batch size 1) latency benchmark.
# `latencies` holds per-call durations in SECONDS; the summary statistics
# below are converted to milliseconds.
LATENCY_WARMUP_RUNS = 5
LATENCY_RUNS = 100

# Warm-up: run a few untimed inferences first. The first calls on a fresh
# session are typically slower (one-time setup work), and including them
# would skew both the mean and the tail percentile.
for _ in range(LATENCY_WARMUP_RUNS):
    session.run(None, {input_name: np.random.rand(1, DIMS).astype("float32")})

latencies = []
for _ in range(LATENCY_RUNS):
    # Build the input outside the timed region so only inference is measured.
    sample = np.random.rand(1, DIMS).astype("float32")
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() is wall-clock and can jump or be coarse.
    start = time.perf_counter()
    _ = session.run(None, {input_name: sample})
    latencies.append(time.perf_counter() - start)

mean_latency = np.mean(latencies) * 1000
p95_latency = np.percentile(latencies, 95) * 1000

print(f"Mean latency: {mean_latency:.2f} ms")
print(f"95th percentile latency: {p95_latency:.2f} ms")


In [None]:

# Batched throughput benchmark: run the same batch repeatedly and report the
# number of samples processed per second.
THROUGHPUT_RUNS = 20  # single source of truth for the loop and the formula

# One untimed call with the batch shape so that any first-call overhead for
# this input shape is not counted in the measurement.
session.run(None, {input_name: inputs})

# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
for _ in range(THROUGHPUT_RUNS):
    _ = session.run(None, {input_name: inputs})
total_time = time.perf_counter() - start

# Count samples from the array actually fed to the model, so the figure stays
# correct even if `inputs` is rebuilt with a different batch size.
throughput = (inputs.shape[0] * THROUGHPUT_RUNS) / total_time
print(f"Throughput: {throughput:.2f} samples/sec")


In [None]:

# Histogram of the per-call latencies, converted from seconds to milliseconds.
# Drawn on the current axes, which pyplot creates on demand.
ax = plt.gca()
ax.hist(np.asarray(latencies) * 1000, bins=20)
ax.set_title("Latency Distribution (ms)")
ax.set_xlabel("Latency (ms)")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()
