# 准备

> 检查huggingface是否支持Model: https://huggingface.co/docs/optimum/exporters/onnx/overview

In [None]:
%pip install optimum[exporters]
%pip install --upgrade --upgrade-strategy eager optimum[onnxruntime]

# 使用 optimum.onnxruntime 将 huggingFace Transformers 模型导出为 ONNX

In [1]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
save_directory = "tmp/onnx/"

# Load a model from transformers and export it to ONNX
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True)

# Save the ONNX model and tokenizer
ort_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

  from .autonotebook import tqdm as notebook_tqdm


('tmp/onnx/tokenizer_config.json',
 'tmp/onnx/special_tokens_map.json',
 'tmp/onnx/vocab.txt',
 'tmp/onnx/added_tokens.json',
 'tmp/onnx/tokenizer.json')

# [option] cpu推理加速: onnx-model转quantization-onnx-model
量化（Quantization） 是将 浮点数（FP32） 转换为 低精度整数（如 INT8、INT16），从而减少 模型大小，提高 计算速度，特别适用于 边缘设备和服务器。

动态量化（Dynamic Quantization） 的特点：

- 只量化权重，激活值（activation）仍然使用浮点计算。
- 在推理时，动态地 将 FP32 激活值转换为 INT8，然后计算，再转换回 FP32。
- 适用于 Transformer、BERT、LSTM 等 全连接层（MatMul、GEMM）较多 的模型。
- 优点：推理速度显著提高，精度损失小。

In [None]:
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from optimum.onnxruntime import ORTQuantizer

# Define the quantization methodology
qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False) #cpu: Intel Core i7-10750H
quantizer = ORTQuantizer.from_pretrained(ort_model)

# Apply dynamic quantization on the model
quantizer.quantize(save_dir=save_directory, quantization_config=qconfig)

WindowsPath('tmp/onnx')

# [option] 模型优化:ORTOptimizer

## 现成优化参数: AutoOptimizationConfig

In [None]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime import AutoOptimizationConfig

optimization_config = AutoOptimizationConfig.O2()
optimizer = ORTOptimizer.from_pretrained("tmp/onnx") #tmp/onnx中的model.onnx
optimizer.optimize(save_dir="tmp/onnx", optimization_config=optimization_config)

WindowsPath('tmp/onnx')

## 自定义优化参数: OptimizationConfig

In [None]:
from optimum.onnxruntime import  OptimizationConfig

optimization_config = OptimizationConfig(
    optimization_level=2,
    enable_transformers_specific_optimizations=True,
    optimize_for_gpu=False,
)
optimizer = ORTOptimizer.from_pretrained("tmp/onnx") #tmp/onnx中的model.onnx
optimizer.optimize(save_dir="tmp/onnx",file_suffix="customOptimized", optimization_config=optimization_config)



WindowsPath('tmp/onnx')

# 调用onnx-model

## 调用本地的onnx-model(推荐)

In [2]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer

model = ORTModelForSequenceClassification.from_pretrained("tmp/onnx/", file_name="model_quantized.onnx")
tokenizer = AutoTokenizer.from_pretrained("tmp/onnx/")
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
results = classifier("I love burritos!")
results

Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.9997308850288391}]

## 调用huggingface的ortmodel
下载官方导出的model.onnx后调用

In [21]:
from optimum.pipelines import pipeline
#huggingface没有distilbert-base-uncased-finetuned-sst-2-english,改为调用deepset/roberta-base-squad2的ortmodel作为示例
onnx_qa = pipeline("question-answering", model="deepset/roberta-base-squad2", accelerator="ort")
question = "What's my name?"
context = "My name is Philipp and I live in Nuremberg."

pred = onnx_qa(question=question, context=context)
pred

Device set to use cuda:0


{'score': 0.9041659235954285, 'start': 11, 'end': 18, 'answer': 'Philipp'}

# tmp/onnx文件结构

In [12]:
import os

def display_file_tree(directory, indent=0):
    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        print("  " * indent + "|-- " + item)
        if os.path.isdir(item_path):
            display_file_tree(item_path, indent + 1)

display_file_tree("tmp/onnx")

|-- config.json
|-- model.onnx
|-- model_customOptimized.onnx
|-- model_optimized.onnx
|-- model_quantized.onnx
|-- ort_config.json
|-- special_tokens_map.json
|-- tokenizer.json
|-- tokenizer_config.json
|-- vocab.txt
