## Measure inference performance of ONNX model on low-resource edge device

Next, we benchmark several of the previously created ONNX models on our low-resource edge device, measuring single-sample inference latency and throughput.

In [None]:
import os
import time
import numpy as np
import onnxruntime as ort

In [None]:
def benchmark_session(ort_session, num_trials=100, *, default_seq_len=50):
    """Benchmark single-sample inference latency and throughput of a session.

    Feeds random int64 data to the session's first two inputs, performs one
    untimed warm-up run, then times ``num_trials`` runs and prints the
    median, 95th and 99th percentile latencies (in milliseconds) and the
    resulting throughput.

    Args:
        ort_session: Object exposing ``get_inputs()`` (items with ``name``
            and ``shape``) and ``run(output_names, feed_dict)``, e.g. an
            ``onnxruntime.InferenceSession``. It must declare at least two
            inputs: the first is fed a tensor of shape ``(1,)`` and the
            second a tensor of shape ``(1, seq_len)``.
        num_trials: Number of timed inference runs. Must be at least 1.
        default_seq_len: Sequence length to use when the model does not
            declare a fixed integer for the second input's second axis
            (dynamic axis). The default of 50 is an arbitrary choice.

    Raises:
        ValueError: If ``num_trials`` is smaller than 1.
    """
    if num_trials < 1:
        raise ValueError("num_trials must be at least 1")

    # Input info: the second input's declared shape provides the sequence
    # length, typically [batch_size, seq_len].
    model_inputs = ort_session.get_inputs()
    input_names = [i.name for i in model_inputs]
    seq_input_shape = model_inputs[1].shape

    batch_size = 1
    seq_len = seq_input_shape[1]
    # A dynamic axis is reported as a symbolic name or None instead of an
    # int; np.random.randint cannot use that as a size, so fall back.
    if not isinstance(seq_len, (int, np.integer)):
        seq_len = default_seq_len

    dummy_user = np.random.randint(0, 1000, size=(batch_size,), dtype=np.int64)
    dummy_seq = np.random.randint(0, 1000, size=(batch_size, seq_len), dtype=np.int64)

    # Build the feed once so dict construction is not part of the timed region.
    feed = {
        input_names[0]: dummy_user,
        input_names[1]: dummy_seq,
    }

    # Warm-up run, excluded from the measurements.
    ort_session.run(None, feed)

    latencies = []
    for _ in range(num_trials):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start = time.perf_counter()
        ort_session.run(None, feed)
        latencies.append(time.perf_counter() - start)

    print(f"Inference Latency (median): {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"Inference Latency (95th): {np.percentile(latencies, 95) * 1000:.2f} ms")
    print(f"Inference Latency (99th): {np.percentile(latencies, 99) * 1000:.2f} ms")
    print(f"Inference Throughput (single sample): {num_trials / np.sum(latencies):.2f} FPS")


Now, let’s evaluate our “baseline” ONNX model, followed by the dynamically quantized and statically quantized versions of it:

In [None]:
# Each entry pairs the banner printed before a benchmark with the ONNX model
# file to load: the FP32 baseline first, then the model quantized with
# dynamic quantization, then the model quantized with static quantization.
models_to_benchmark = (
    ("\n[Baseline FP32]", "models/SSE_PT10kemb.onnx"),
    ("\n[Dynamic Quantized]", "models/SSE_PT10kemb_quant_dynamic.onnx"),
    ("\n[Static Quantized - Aggressive]", "models/SSE_PT10kemb_quant_static_aggressive.onnx"),
)

for banner, model_path in models_to_benchmark:
    print(banner)
    # Every model is run with the CPU execution provider only.
    ort_session = ort.InferenceSession(model_path, providers=['CPUExecutionProvider'])
    benchmark_session(ort_session)
