## Apply optimizations to ONNX model

Now that we have an ONNX model, we can apply some basic optimizations. 

In [None]:
import ast
import os
import time

import numpy as np
import onnx
import onnxruntime as ort
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

from utilities import pad_or_truncate

In [None]:
# Prepare test dataset
SEQ_LEN = 50  # or the seq_max_len used when the model was exported


class MovieLensTestDataset(Dataset):
    """Map-style dataset over a CSV that has `user_id` and `sequence` columns.

    Each item is a `(user_id, sequence)` tuple where `sequence` has been passed
    through `pad_or_truncate(sequence, SEQ_LEN)`.
    """

    def __init__(self, csv_path):
        """Load the CSV at `csv_path` and keep its rows as a list of dicts."""
        self.df = pd.read_csv(csv_path)
        self.data = self.df.to_dict(orient="records")

    def __len__(self):
        """Return the number of CSV rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(user_id, sequence)` for row `idx`.

        Raises:
            ValueError / SyntaxError: if a string-valued `sequence` cell is not
                a plain Python literal.
        """
        row = self.data[idx]
        user_id = int(row["user_id"])
        sequence = row["sequence"]
        if isinstance(sequence, str):
            # The cell holds the textual form of a list. Parse it as a literal
            # only: `eval` would execute any expression stored in the file.
            sequence = ast.literal_eval(sequence)
        sequence = pad_or_truncate(sequence, SEQ_LEN)
        return user_id, sequence

# Directory holding the MovieLens CSV splits; the MOVIELENS_DATA_DIR
# environment variable overrides the default.
movielens_data_dir = os.environ.get("MOVIELENS_DATA_DIR", "/mnt/data")
test_dataset = MovieLensTestDataset(
    csv_path=os.path.join(movielens_data_dir, "test.csv"),
)
# Unshuffled batches of 32 so every benchmark run sees the same data order.
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
def _time_runs(ort_session, feeds, n_runs):
    """Return the durations, in seconds, of `n_runs` timed `ort_session.run` calls.

    One extra untimed call is made first as a warm-up.
    """
    ort_session.run(None, feeds)  # warmup

    durations = []
    for _ in range(n_runs):
        # perf_counter is monotonic and high-resolution; time.time() can be
        # too coarse for sub-millisecond inference calls.
        start = time.perf_counter()
        ort_session.run(None, feeds)
        durations.append(time.perf_counter() - start)
    return durations


def benchmark_session(ort_session):
    """Print provider, placeholder accuracy, latency and throughput for a session.

    Reads the module-level `test_loader` and `test_dataset`.

    Args:
        ort_session: an ONNX Runtime inference session. Its first input is fed
            the user ids and its second input the sequences (matched by
            position, not by name).

    Prints:
        * the session's execution providers;
        * "Accuracy (dummy)": every prediction is counted as correct, so this
          is always 100% and only proves the model runs over the whole loader;
        * median / 95th / 99th percentile latency and throughput for 100
          single-sample runs on `test_dataset[0]`;
        * throughput for 50 runs on the first batch of `test_loader`.
    """
    print(f"Execution provider: {ort_session.get_providers()}")

    model_inputs = ort_session.get_inputs()
    user_input_name = model_inputs[0].name
    seq_input_name = model_inputs[1].name

    # Benchmark accuracy
    # NOTE(review): if `pad_or_truncate` returns a plain Python list, the
    # default DataLoader collate presumably yields one tensor per position, so
    # np.stack would give (SEQ_LEN, batch) rather than (batch, SEQ_LEN) — confirm.
    correct, total = 0, 0
    for user_ids, sequences in test_loader:
        feeds = {
            user_input_name: np.array(user_ids),
            seq_input_name: np.stack(sequences),
        }
        outputs = ort_session.run(None, feeds)[0]
        preds = np.argmax(outputs, axis=1)
        total += len(preds)
        # Placeholder: the loader yields no labels, so nothing is compared.
        correct += len(preds)

    accuracy = 100 * correct / total
    print(f"Accuracy (dummy): {accuracy:.2f}%")

    # Benchmark inference latency
    user, seq = test_dataset[0]
    single_feeds = {
        user_input_name: np.array([user]),
        seq_input_name: np.array([seq]),
    }
    latencies = _time_runs(ort_session, single_feeds, 100)

    print(f"Inference Latency (median): {np.percentile(latencies, 50)*1000:.2f} ms")
    print(f"Inference Latency (95th): {np.percentile(latencies, 95)*1000:.2f} ms")
    print(f"Inference Latency (99th): {np.percentile(latencies, 99)*1000:.2f} ms")
    print(f"Inference Throughput (single sample): {100/np.sum(latencies):.2f} FPS")

    #  Benchmark batch throughput
    user_ids, sequences = next(iter(test_loader))
    batch_feeds = {
        user_input_name: np.array(user_ids),
        seq_input_name: np.stack(sequences),
    }
    batch_times = _time_runs(ort_session, batch_feeds, 50)

    batch_fps = (len(user_ids) * 50) / np.sum(batch_times)
    print(f"Batch Throughput: {batch_fps:.2f} FPS")


### Apply basic graph optimizations


In [None]:
# Source FP32 model and the destination for its graph-optimized copy.
onnx_model_path = "models/SSE_PT10kemb.onnx"
optimized_model_path = "models/SSE_PT10kemb_optimized.onnx"

# Request extended graph optimizations and give ONNX Runtime a path at which
# to store the optimized model when the session is created.
session_options = ort.SessionOptions()
session_options.optimized_model_filepath = optimized_model_path
session_options.graph_optimization_level = (
    ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
)

# The session is created only for its side effect of producing the file.
ort_session = ort.InferenceSession(
    onnx_model_path,
    sess_options=session_options,
    providers=["CPUExecutionProvider"],
)

print(f"Optimized ONNX model saved to {optimized_model_path}")


In [None]:
# Load the optimized model file on the CPU provider and benchmark it.
optimized_session = ort.InferenceSession(
    optimized_model_path,
    providers=["CPUExecutionProvider"],
)
benchmark_session(optimized_session)

### Dynamic quantization

In [None]:
import neural_compressor
from neural_compressor import quantization

In [None]:
# Imported here so this cell does not hit a NameError on `ONNXModel` /
# `PostTrainingQuantConfig`, which it uses directly.
from neural_compressor.config import PostTrainingQuantConfig
from neural_compressor.model.onnx_model import ONNXModel

fp32_model_path = "models/SSE_PT10kemb.onnx"
model = ONNXModel(fp32_model_path)

# Dynamic post-training quantization: no calibration dataloader is supplied.
config = PostTrainingQuantConfig(
    approach="dynamic"
)

q_model = quantization.fit(
    model=model,
    conf=config
)

quantized_model_path = "models/SSE_PT10kemb_quant_dynamic.onnx"
q_model.save_model_to_file(quantized_model_path)
print(f"Quantized model saved to {quantized_model_path}")

# Report the quantized model's on-disk size, then benchmark it on CPU.
model_size = os.path.getsize(quantized_model_path)
print(f"Quantized Model Size on Disk: {model_size / 1e6:.2f} MB")

ort_session = ort.InferenceSession(quantized_model_path, providers=['CPUExecutionProvider'])
benchmark_session(ort_session)


### Static quantization

Next, we will try static quantization with a calibration dataset.


In [None]:
import neural_compressor
from neural_compressor import quantization
from torchvision import datasets, transforms

In [None]:
SEQ_LEN = 50


class MovieLensTestDataset(Dataset):
    """Map-style dataset over a CSV that has `user_id` and `sequence` columns.

    Each item is a dict `{"user": user_id, "sequence": sequence}` where
    `sequence` has been passed through `pad_or_truncate(sequence, SEQ_LEN)`.
    Defining this class rebinds the name `MovieLensTestDataset`.
    """
    # NOTE(review): items carry no label field; confirm that the calibration
    # dataloader and the evaluation metric used with this dataset accept a
    # label-free dict item.

    def __init__(self, csv_path):
        """Load the CSV at `csv_path` and keep its rows as a list of dicts."""
        self.df = pd.read_csv(csv_path)
        self.data = self.df.to_dict(orient="records")

    def __len__(self):
        """Return the number of CSV rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return `{"user": ..., "sequence": ...}` for row `idx`.

        Raises:
            ValueError / SyntaxError: if a string-valued `sequence` cell is not
                a plain Python literal.
        """
        row = self.data[idx]
        user_id = int(row["user_id"])
        sequence = row["sequence"]
        if isinstance(sequence, str):
            # The cell holds the textual form of a list. Parse it as a literal
            # only: `eval` would execute any expression stored in the file.
            sequence = ast.literal_eval(sequence)
        sequence = pad_or_truncate(sequence, SEQ_LEN)
        return {"user": user_id, "sequence": sequence}

# Imported here so this cell does not hit a NameError on `INCDataLoader` /
# `ONNXModel`, which it uses directly.
from neural_compressor.data import DataLoader as INCDataLoader
from neural_compressor.model.onnx_model import ONNXModel

# Directory holding the MovieLens CSV splits; the MOVIELENS_DATA_DIR
# environment variable overrides the default.
movielens_data_dir = os.getenv("MOVIELENS_DATA_DIR", "/mnt/data")
val_dataset = MovieLensTestDataset(os.path.join(movielens_data_dir, "validation.csv"))
# Neural Compressor dataloader over the validation split.
eval_dataloader = INCDataLoader(framework="onnxruntime", dataset=val_dataset)

# FP32 model wrapped for Neural Compressor.
fp32_model_path = "models/SSE_PT10kemb.onnx"
model = ONNXModel(fp32_model_path)

#### Aggressive static quantization

In [None]:
# Imported here so this cell does not hit a NameError on the configuration
# and metric classes it uses directly.
from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig
from neural_compressor.metric import Metric

# Static quantization that tolerates up to 0.05 absolute accuracy loss.
config_aggressive = PostTrainingQuantConfig(
    accuracy_criterion=AccuracyCriterion(
        criterion="absolute",
        tolerable_loss=0.05
    ),
    approach="static",
    device="cpu",
    quant_level=1,
    quant_format="QOperator",
    recipes={"graph_optimization_level": "ENABLE_EXTENDED"},
    calibration_sampling_size=128
)

# The same dataloader is passed for both calibration and evaluation.
q_model_aggressive = quantization.fit(
    model=model,
    conf=config_aggressive,
    calib_dataloader=eval_dataloader,
    eval_dataloader=eval_dataloader,
    eval_metric=Metric(name="topk")  # top1/top5 can be used
)

quant_model_path = "models/SSE_PT10kemb_quant_static_aggressive.onnx"
q_model_aggressive.save_model_to_file(quant_model_path)
print(f"Aggressively Quantized model saved to {quant_model_path}")
print(f"Model Size: {os.path.getsize(quant_model_path) / 1e6:.2f} MB")

# Benchmark
ort_session = ort.InferenceSession(quant_model_path, providers=["CPUExecutionProvider"])
benchmark_session(ort_session)


#### Conservative static quantization

In [None]:
# Imported here so this cell does not hit a NameError on the configuration
# and metric classes it uses directly.
from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig
from neural_compressor.metric import Metric

# Static quantization that tolerates at most 0.01 absolute accuracy loss.
config_conservative = PostTrainingQuantConfig(
    accuracy_criterion=AccuracyCriterion(
        criterion="absolute",
        tolerable_loss=0.01
    ),
    approach="static",
    device="cpu",
    quant_level=0,
    quant_format="QOperator",
    recipes={"graph_optimization_level": "ENABLE_EXTENDED"},
    calibration_sampling_size=128
)

# The same dataloader is passed for both calibration and evaluation.
q_model_conservative = quantization.fit(
    model=model,
    conf=config_conservative,
    calib_dataloader=eval_dataloader,
    eval_dataloader=eval_dataloader,
    eval_metric=Metric(name="topk")
)

quant_model_path = "models/SSE_PT10kemb_quant_static_conservative.onnx"
q_model_conservative.save_model_to_file(quant_model_path)
print(f"Conservatively Quantized model saved to {quant_model_path}")
print(f"Model Size: {os.path.getsize(quant_model_path) / 1e6:.2f} MB")

# Benchmark
ort_session = ort.InferenceSession(quant_model_path, providers=["CPUExecutionProvider"])
benchmark_session(ort_session)
