### Try a different execution provider

Once a model is in ONNX format, we can use it with many *execution providers*. In ONNX, an execution provider is an interface that lets ONNX models run with special hardware-specific capabilities. Until now, we have been using the `CPUExecutionProvider`, but if we use hardware-specific capabilities, e.g. switch out generic implementations of graph operations for implementations that are optimized for specific hardware, we can execute exactly the same model, much faster.

In [None]:
import os
import numpy as np
import pandas as pd
import onnxruntime as ort
from torch.utils.data import Dataset, DataLoader
from utilities import pad_or_truncate

In [None]:
# Prepare test dataset
import os
import numpy as np
import pandas as pd
import onnxruntime as ort
from torch.utils.data import Dataset, DataLoader
from utilities import pad_or_truncate

# Target length handed to pad_or_truncate for every row's sequence.
SEQ_LEN = 50
class MovieLensTestDataset(Dataset):
    """Dataset over the rows of a CSV file with ``user_id`` and ``sequence`` columns.

    Indexing yields a ``(user_id, sequence)`` pair in which the sequence has
    been passed through ``pad_or_truncate`` with ``SEQ_LEN``.
    """

    def __init__(self, csv_path):
        """Read ``csv_path`` with pandas and keep its rows as a list of dicts."""
        frame = pd.read_csv(csv_path)
        self.df = frame
        self.data = frame.to_dict(orient="records")

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(user_id, sequence)`` for the row at position ``idx``."""
        record = self.data[idx]
        user_id = int(record["user_id"])

        raw_sequence = record["sequence"]
        if isinstance(raw_sequence, str):
            # The CSV stores the sequence as text, so it is turned back into a
            # Python object here.
            # NOTE(review): eval() executes whatever expression the CSV cell
            # contains. If the cells are always plain literals, consider
            # ast.literal_eval; confirm the stored format before switching.
            raw_sequence = eval(raw_sequence)

        return user_id, pad_or_truncate(raw_sequence, SEQ_LEN)

# Directory holding the MovieLens CSVs; overridable through the environment.
movielens_data_dir = os.environ.get("MOVIELENS_DATA_DIR", "/mnt/data")

# Wrap test.csv in the dataset class, then in a loader that serves it in
# file order, 32 rows per batch.
test_dataset = MovieLensTestDataset(
    csv_path=os.path.join(movielens_data_dir, "test.csv"),
)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)


In [None]:
def benchmark_session(ort_session, num_single_runs=100, num_batch_runs=50):
    """Run the test set through an ONNX Runtime session and print timing stats.

    Reads the module-level ``test_loader`` and ``test_dataset``. Prints the
    session's providers, a placeholder accuracy, single-sample latency
    percentiles and throughput, and batch throughput.

    Args:
        ort_session: Session exposing ``get_inputs()``, ``get_providers()``
            and ``run()``. Its first input is fed the user ids and its second
            input the sequences.
        num_single_runs: Number of timed single-sample runs. Must be positive.
        num_batch_runs: Number of timed runs on one batch. Must be positive.

    Returns:
        None. All results are printed.
    """
    # Imported once at function scope instead of on every loop iteration.
    import time

    user_input = ort_session.get_inputs()[0].name
    seq_input = ort_session.get_inputs()[1].name

    def _timed_runs(feed, repeats):
        """Do one untimed warmup run, then return the duration of each timed run."""
        ort_session.run(None, feed)  # warmup
        durations = []
        for _ in range(repeats):
            # perf_counter is the clock intended for measuring short intervals.
            start = time.perf_counter()
            ort_session.run(None, feed)
            durations.append(time.perf_counter() - start)
        return durations

    print(f"Execution provider: {ort_session.get_providers()}")

    correct, total = 0, 0
    for user_ids, sequences in test_loader:
        u = np.array(user_ids)
        # NOTE(review): if the dataset yields sequences as plain Python lists,
        # the default DataLoader collation returns one tensor per position and
        # this stack would be (seq_len, batch) rather than (batch, seq_len).
        # Confirm what the dataset items actually are.
        s = np.stack(sequences)
        outputs = ort_session.run(None, {user_input: u, seq_input: s})[0]
        preds = np.argmax(outputs, axis=1)
        total += len(preds)
        # No labels are available, so every prediction is counted as correct;
        # the printed accuracy is a placeholder, not a real metric.
        correct += len(preds)

    print(f"Accuracy (dummy): {correct / total * 100:.2f}%")

    # Benchmark inference latency for single sample
    u, s = test_dataset[0]
    u = np.array([u])
    s = np.array([s])
    latencies = _timed_runs({user_input: u, seq_input: s}, num_single_runs)
    print(f"Latency median: {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"Latency 95th: {np.percentile(latencies, 95) * 1000:.2f} ms")
    print(f"Latency 99th: {np.percentile(latencies, 99) * 1000:.2f} ms")
    print(f"Inference Throughput: {num_single_runs / np.sum(latencies):.2f} FPS")

    # Benchmark batch throughput on the first batch of the loader
    batch_users, batch_seqs = next(iter(test_loader))
    u = np.array(batch_users)
    s = np.stack(batch_seqs)
    batch_times = _timed_runs({user_input: u, seq_input: s}, num_batch_runs)
    print(f"Batch Throughput: {(len(u) * num_batch_runs) / np.sum(batch_times):.2f} FPS")


<!--
Execution provider: ['CPUExecutionProvider']
Accuracy: 90.59% (3032/3347 correct)
Inference Latency (single sample, median): 9.93 ms
Inference Latency (single sample, 95th percentile): 14.20 ms
Inference Latency (single sample, 99th percentile): 14.43 ms
Inference Throughput (single sample): 91.10 FPS
Batch Throughput: 1042.47 FPS
-->

#### Try different Execution Provider 

<!--
Execution provider: ['CUDAExecutionProvider', 'CPUExecutionProvider']
Accuracy: 90.59% (3032/3347 correct)
Inference Latency (single sample, median): 0.89 ms
Inference Latency (single sample, 95th percentile): 0.90 ms
Inference Latency (single sample, 99th percentile): 0.91 ms
Inference Throughput (single sample): 1117.06 FPS
Batch Throughput: 5181.99 FPS
-->

#### CPUExecutionProvider



In [None]:
# Path of the ONNX model file loaded by each InferenceSession in the cells that follow.
onnx_model_path = "models/SSE_PT10kemb_quant_static_conservative.onnx"  


In [None]:
# Benchmark the model with only the CPU execution provider requested.
print("\n[CPUExecutionProvider]")
session_cpu = ort.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])
benchmark_session(session_cpu)

####  OpenVINO



In [None]:
# Benchmark with OpenVINO only when onnxruntime lists the provider as
# available; otherwise this cell does nothing and prints nothing.
if "OpenVINOExecutionProvider" in ort.get_available_providers():
    print("\n[OpenVINOExecutionProvider]")
    session_ov = ort.InferenceSession(onnx_model_path, providers=["OpenVINOExecutionProvider"])
    benchmark_session(session_ov)

#### TensorRT execution provider



In [None]:
# Benchmark with TensorRT only when onnxruntime lists the provider as
# available; otherwise this cell does nothing and prints nothing.
if "TensorrtExecutionProvider" in ort.get_available_providers():
    print("\n[TensorrtExecutionProvider]")
    session_trt = ort.InferenceSession(onnx_model_path, providers=["TensorrtExecutionProvider"])
    benchmark_session(session_trt)

<!--
Execution provider: ['TensorrtExecutionProvider', 'CPUExecutionProvider']
Accuracy: 90.59% (3032/3347 correct)
Inference Latency (single sample, median): 0.63 ms
Inference Latency (single sample, 95th percentile): 0.64 ms
Inference Latency (single sample, 99th percentile): 0.70 ms
Inference Throughput (single sample): 1572.61 FPS
Batch Throughput: 9274.45 FPS
-->