In [2]:
! pip install torch transformers onnx onnxruntime onnxruntime-tools

Collecting onnx
  Downloading onnx-1.20.0-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting onnxruntime-tools
  Downloading onnxruntime_tools-1.7.0-py3-none-any.whl.metadata (14 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting py3nvml (from onnxruntime-tools)
  Downloading py3nvml-0.2.7-py3-none-any.whl.metadata (13 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting xmltodict (from py3nvml->onnxruntime-tools)
  Downloading xmltodict-1.0.2-py3-none-any.whl.metadata (15 kB)
Downloading onnx-1.20.0-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (18.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [

In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import os
import torch
from transformers import AutoTokenizer, AutoModel
from onnxruntime.quantization import quantize_dynamic, QuantType

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
ONNX_DIR = "/content/drive/MyDrive/Colab Notebooks/model_export/onnx"
FP32_MODEL = os.path.join(ONNX_DIR, "model.onnx")
INT8_MODEL = os.path.join(ONNX_DIR, "model-int8.onnx")



In [6]:
!pip install -U onnx onnxscript

Collecting onnxscript
  Downloading onnxscript-0.5.7-py3-none-any.whl.metadata (13 kB)
Collecting onnx_ir<2,>=0.1.12 (from onnxscript)
  Downloading onnx_ir-0.1.13-py3-none-any.whl.metadata (3.2 kB)
Downloading onnxscript-0.5.7-py3-none-any.whl (693 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m693.4/693.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnx_ir-0.1.13-py3-none-any.whl (133 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.1/133.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx_ir, onnxscript
Successfully installed onnx_ir-0.1.13 onnxscript-0.5.7


In [7]:
os.makedirs(ONNX_DIR, exist_ok=True)

print("Loading pretrained MiniLM...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()

# Dummy input for export
dummy_text = "This is a test sentence for ONNX export."
inputs = tokenizer(dummy_text, return_tensors="pt")

print("Exporting FP32 ONNX model...")
torch.onnx.export(
    model,
    (inputs["input_ids"], inputs["attention_mask"]),
    FP32_MODEL,
    input_names=["input_ids", "attention_mask"],
    output_names=["last_hidden_state"],
    dynamic_axes={
        "input_ids": {0: "batch", 1: "sequence"},
        "attention_mask": {0: "batch", 1: "sequence"},
        "last_hidden_state": {0: "batch", 1: "sequence"},
    },
    opset_version=17,
)

print("FP32 ONNX saved:", FP32_MODEL)

print("Quantizing to INT8...")
quantize_dynamic(
    model_input=FP32_MODEL,
    model_output=INT8_MODEL,
    weight_type=QuantType.QInt8,
)

print("INT8 ONNX saved:", INT8_MODEL)

print("Done ✅")


Loading pretrained MiniLM...
Exporting FP32 ONNX model...


  torch.onnx.export(
W1227 08:27:46.693000 693 torch/onnx/_internal/exporter/_compat.py:114] Setting ONNX exporter to use operator set version 18 because the requested opset_version 17 is a lower version than we have implementations for. Automatic version conversion will be performed, which may not be successful at converting to the requested version. If version conversion is unsuccessful, the opset version of the exported model will be kept at 18. Please consider setting opset_version >=18 to leverage latest ONNX features


[torch.onnx] Obtain model graph for `BertModel([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `BertModel([...]` with `torch.export.export(..., strict=False)`... ✅
[torch.onnx] Run decomposition...
[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...




[torch.onnx] Translate the graph into ONNX... ✅
Applied 42 of general pattern rewrite rules.
FP32 ONNX saved: /content/drive/MyDrive/Colab Notebooks/model_export/onnx/model.onnx
Quantizing to INT8...




INT8 ONNX saved: /content/drive/MyDrive/Colab Notebooks/model_export/onnx/model-int8.onnx
Done ✅
