# Step 0: Export PyTorch Model to ONNX
In this step, we export the trained SSE-PT PyTorch model to ONNX format using `torch.onnx.export`. This ONNX model will be used for all downstream optimizations.

In [None]:
import torch
import os
import numpy as np
from model.ssept import SSEPTModel  # adjust this import to match your project

# Restore the trained network; the checkpoint is mapped onto the CPU while loading.
model = SSEPTModel()
checkpoint_state = torch.load("ssept.pt", map_location=torch.device("cpu"))
model.load_state_dict(checkpoint_state)
model.eval()  # switch the module to evaluation mode before exporting

# Example tensor handed to the exporter - must match actual input shape
dummy_input = torch.randn(1, 768)

# Export to ONNX.
# Axis 0 of the "input_ids" input is declared dynamic and named "batch_size";
# every other axis keeps the size of the example tensor above.
onnx_output_path = "ssept.onnx"
export_options = {
    "input_names": ["input_ids"],
    "output_names": ["output"],
    "dynamic_axes": {"input_ids": {0: "batch_size"}},
    "opset_version": 13,
}
torch.onnx.export(model, dummy_input, onnx_output_path, **export_options)
print(f"Model exported to {onnx_output_path}")

# Step 1: Optimize ONNX Graph
This step applies graph-level optimizations such as fusing adjacent operations (e.g., Conv + BN) to accelerate inference.

# 1. Optimize Graph for SSE-PT
This notebook loads your exported SSE-PT ONNX model, applies ONNX Runtime graph optimizations, and saves an optimized model for serving.

In [None]:

import onnx
from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel

# Path to your ONNX model
# NOTE(review): the export and quantization cells in this file use "ssept.onnx"
# without the "model/" prefix - confirm which location is intended.
input_model_path = "model/ssept.onnx"
output_model_path = "model/ssept_optimized.onnx"

# Load and check ONNX model
# The checker call comes before the print, so the success message is only
# reached when loading and checking did not raise.
model = onnx.load(input_model_path)
onnx.checker.check_model(model)
print("Loaded and checked model successfully.")


In [None]:

# Create session options with graph optimization enabled
opt = SessionOptions()
opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

# InferenceSession has no serialize() method. ONNX Runtime writes the optimized
# graph to disk itself, during session creation, when optimized_model_filepath
# is set on the session options.
# NOTE(review): ORT_ENABLE_ALL may bake in optimizations specific to this
# machine/provider; if the saved model is served on different hardware,
# consider ORT_ENABLE_EXTENDED instead - confirm against the deployment target.
opt.optimized_model_filepath = output_model_path

# If using GPU later, you may switch to CUDAExecutionProvider or TensorrtExecutionProvider
# Creating the session runs the optimizations and, as a side effect, saves the
# optimized model to output_model_path.
sess = InferenceSession(input_model_path, sess_options=opt, providers=["CPUExecutionProvider"])
print(f"Optimized model saved to {output_model_path}")


# Quantize ONNX Model for SSE-PT (Movie Recommendation)

# Step 2: Apply Dynamic Quantization
Here we convert weights from float32 to int8 using ONNX Runtime's dynamic quantization, targeting CPU deployment.

In [None]:

# Quantize ONNX Model for SSE-PT (Movie Recommendation)
# Author: Abby Zhou Team 37

## 0. Install Dependencies
!pip install onnx onnxruntime onnxruntime-tools neural-compressor --quiet

## 1. Import Libraries
import os
import numpy as np
import time
import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig

## 2. Path Configuration
# Input: FP32 ONNX model; output: dynamically quantized copy written by step 4.
onnx_model_path = "ssept.onnx"
quant_model_path = "ssept_quant_dynamic.onnx"
assert os.path.exists(onnx_model_path), "Please export ONNX model first."

## 3. Model Size Before Quantization
print("Original FP32 model size:", os.path.getsize(onnx_model_path)/1e6, "MB")

## 4. Quantization: Dynamic INT8 Weights
# The `optimize_model` keyword is intentionally not passed: recent ONNX Runtime
# releases no longer accept it (TypeError), while older releases default it to
# True, so omitting it keeps the previous behaviour there.
# NOTE(review): on recent releases, run the ONNX Runtime quantization
# pre-processing step separately if graph optimization before quantization is
# still wanted.
quantize_dynamic(
    model_input=onnx_model_path,
    model_output=quant_model_path,
    weight_type=QuantType.QInt8,
)
print("Quantized model size:", os.path.getsize(quant_model_path)/1e6, "MB")

## 5. Inference Session with Quantized Model
session = ort.InferenceSession(quant_model_path, providers=["CPUExecutionProvider"])
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name

## 6. Inference Latency Benchmark (Single Sample)
dummy_input = np.random.rand(1, 768).astype("float32")  # adjust input size

# Warm-up runs are executed but not timed, so any one-time start-up cost of the
# first calls does not inflate the reported mean.
for _ in range(10):
    session.run(None, {input_name: dummy_input})

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock and can be too coarse.
latencies = []
for _ in range(100):
    start = time.perf_counter()
    _ = session.run(None, {input_name: dummy_input})
    latencies.append(time.perf_counter() - start)
print("Average latency:", np.mean(latencies)*1000, "ms")

## 7. Summary
print("Quantized ONNX Model:", quant_model_path)
print("Model size:", round(os.path.getsize(quant_model_path)/1e6, 2), "MB")
print("Mean inference latency:", round(np.mean(latencies)*1000, 2), "ms")


# Optimize & Statically Quantize ONNX Model for SSE-PT

In [None]:

!pip install onnx onnxruntime onnxruntime-tools neural-compressor --quiet


In [None]:

import os
import numpy as np
import time
import onnx
import onnxruntime as ort
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig
from neural_compressor.data import DATALOADERS
from onnxruntime_tools import optimizer


# Step 3: Apply Static Quantization
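This step uses Intel Neural Compressor to perform static (post-training) quantization. Unlike dynamic quantization, the value ranges of the activations are measured ahead of time by running a calibration dataset through the model, so both weights and activations can use fixed int8 parameters at inference time. The calibration data here is a dummy random dataset; use real user vectors if possible.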

In [None]:

# File names used by the static-quantization cells below:
#   onnx_fp32_path      - exported FP32 model (input, must already exist)
#   onnx_optimized_path - where the optimizer cell saves its result
#   onnx_quant_path     - where the statically quantized model is saved
onnx_fp32_path = "ssept.onnx"
onnx_optimized_path = "ssept_optimized.onnx"
onnx_quant_path = "ssept_quant_static.onnx"
# Stop here with a hint if the FP32 model has not been exported yet.
assert os.path.exists(onnx_fp32_path), "Run 0_export_onnx.ipynb first."


In [None]:

# The standalone `onnxruntime_tools` package is deprecated; the same optimizer
# ships inside onnxruntime as `onnxruntime.transformers.optimizer`. Prefer that
# one and only fall back to the legacy package on installations that lack it.
try:
    from onnxruntime.transformers import optimizer
except ImportError:
    from onnxruntime_tools import optimizer

# Apply the transformer-specific graph optimizations to the FP32 model and save the result.
optimized_model = optimizer.optimize_model(onnx_fp32_path, model_type='bert')  # Assuming SSE-PT is transformer-based
optimized_model.save_model_to_file(onnx_optimized_path)
print("Optimized model saved to:", onnx_optimized_path)


In [None]:

# Define dummy calibration dataloader (use real user vectors if possible)
class DummyDataset:
    """Random calibration samples for static quantization.

    Each item is an ``(input, label)`` pair: a float32 vector of length
    ``dim`` with values in [0, 1) and a placeholder label ``0``.
    Samples are stored un-batched; batching is left to the dataloader.

    Args:
        num_samples: Number of calibration samples to generate.
        dim: Length of each input vector. Adjust to the model's input size.
        seed: Optional seed for reproducible calibration data.
    """

    def __init__(self, num_samples=10, dim=768, seed=None):
        rng = np.random.default_rng(seed)
        self._samples = rng.random((num_samples, dim), dtype=np.float32)

    def __len__(self):
        return len(self._samples)

    def __getitem__(self, index):
        # Out-of-range indices raise IndexError (from numpy), which also lets
        # plain `for item in dataset` iteration terminate.
        return self._samples[index], 0


# NOTE(review): the calibration loop is expected to unpack each batch as
# (inputs, labels) and feed `inputs` to the model's single input - confirm
# against the installed Neural Compressor version.
calib_dataloader = DATALOADERS["onnxruntime"](DummyDataset(), batch_size=1)

# The keyword is `calibration_sampling_size` (not `calibrate_sampling_size`).
quant_config = PostTrainingQuantConfig(approach="static", calibration_sampling_size=[10])
quantized_model = quantization.fit(
    model=onnx_optimized_path,
    conf=quant_config,
    calib_dataloader=calib_dataloader
)
# Fail with a clear message instead of an AttributeError on None below.
if quantized_model is None:
    raise RuntimeError("Static quantization did not produce a model.")
# The model object returned by quantization.fit is written out with save().
quantized_model.save(onnx_quant_path)
print("Quantized model saved to:", onnx_quant_path)


In [None]:

# Benchmark the statically quantized model on the CPU with a single random sample.
session = ort.InferenceSession(onnx_quant_path, providers=["CPUExecutionProvider"])
input_name = session.get_inputs()[0].name
dummy_input = np.random.rand(1, 768).astype("float32")

# Warm-up runs are executed but not timed, so any one-time start-up cost of the
# first calls does not inflate the reported mean.
for _ in range(10):
    session.run(None, {input_name: dummy_input})

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock and can be too coarse.
latencies = []
for _ in range(100):
    start = time.perf_counter()
    _ = session.run(None, {input_name: dummy_input})
    latencies.append(time.perf_counter() - start)
print("Average latency (static quant):", round(np.mean(latencies)*1000, 2), "ms")
print("Model size:", round(os.path.getsize(onnx_quant_path)/1e6, 2), "MB")


# 4. Inference Performance Evaluation for Quantized SSE-PT Model

In [None]:

!pip install onnx onnxruntime matplotlib --quiet


In [None]:

import os
import numpy as np
import time
import matplotlib.pyplot as plt
import onnxruntime as ort


In [None]:

# Choose which model to evaluate
model_path = "ssept_quant_dynamic.onnx"  # Or use ssept_quant_static.onnx
# Stop early with a readable message if the chosen model file is missing.
assert os.path.exists(model_path), "Quantized ONNX model not found."

# Create inference session
# Only the CPU execution provider is requested.
session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])
# The name of the model's first input is used as the feed key by the
# benchmark cells that follow.
input_name = session.get_inputs()[0].name
print("Input name:", input_name)


In [None]:

# Synthetic input for the throughput test: one random float32 row per simulated
# user, 128 users in total, values drawn uniformly from [0, 1).
DIMS = 768  # Replace with your model's input dimension
BATCH_SIZE = 128
inputs = np.random.random_sample((BATCH_SIZE, DIMS)).astype(np.float32)


In [None]:

# Single-sample latency benchmark.
# One untimed warm-up call keeps any one-time start-up cost of the first run
# out of the statistics.
session.run(None, {input_name: np.random.rand(1, DIMS).astype("float32")})

latencies = []
for _ in range(100):
    # A fresh random sample per iteration; it is generated outside the timed region.
    sample = np.random.rand(1, DIMS).astype("float32")
    # time.perf_counter() is a monotonic, high-resolution clock meant for
    # measuring short durations; time.time() follows the wall clock.
    start = time.perf_counter()
    _ = session.run(None, {input_name: sample})
    latencies.append(time.perf_counter() - start)

# Latencies are collected in seconds; convert to milliseconds for reporting.
mean_latency = np.mean(latencies) * 1000
p95_latency = np.percentile(latencies, 95) * 1000

print(f"Mean latency: {mean_latency:.2f} ms")
print(f"95th percentile latency: {p95_latency:.2f} ms")


In [None]:

# Batched throughput benchmark: run the full batch N_RUNS times and report
# samples processed per second.
N_RUNS = 20  # single source of truth for both the loop and the throughput formula

# Untimed warm-up call so any one-time start-up cost for this batch shape is
# not counted.
session.run(None, {input_name: inputs})

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock.
start = time.perf_counter()
for _ in range(N_RUNS):
    _ = session.run(None, {input_name: inputs})
total_time = time.perf_counter() - start

throughput = (BATCH_SIZE * N_RUNS) / total_time
print(f"Throughput: {throughput:.2f} samples/sec")


In [None]:

# Histogram of the measured per-request latencies, converted from seconds to
# milliseconds, drawn with matplotlib's object-oriented interface.
latencies_ms = np.array(latencies) * 1000
fig, ax = plt.subplots()
ax.hist(latencies_ms, bins=20)
ax.set_title("Latency Distribution (ms)")
ax.set_xlabel("Latency (ms)")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()
