# 1.Hugging Face如何导出并使用模型 

In [40]:
from transformers import AutoTokenizer
import os
os.environ["HF_MODEL_REGISTRY"] = "https://huggingface.co/namespace/model"

In [41]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [42]:
encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
print(encoding)

{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [43]:
tokenizer

BertTokenizerFast(name_or_path='nlptown/bert-base-multilingual-uncased-sentiment', vocab_size=105879, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [44]:
tokenizer.decode(encoding['input_ids'])

'[CLS] we are very happy to show you the [UNK] transformers library. [SEP]'

In [45]:
pt_batch = tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

In [46]:
pt_batch

{'input_ids': tensor([[  101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103,   100,
         58263, 13299,   119,   102],
        [  101, 11312, 18763, 10855, 11530,   112,   162, 39487, 10197,   119,
           102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [47]:
from transformers import AutoModelForSequenceClassification
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [48]:
pt_outputs = pt_model(**pt_batch)

In [49]:
from torch import nn

pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
print(pt_predictions)

tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
        [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)


In [37]:
pt_save_directory = "./pt_save_pretrained"
tokenizer.save_pretrained(pt_save_directory)
pt_model.save_pretrained(pt_save_directory)

# 2.如何把模型转换成ONNX格式

In [53]:
input_text = "We are very happy to show you the 🤗 Transformers library."
input_ids = tokenizer.encode(input_text, add_special_tokens=True, return_tensors='pt')

In [54]:
input_ids

tensor([[  101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103,   100,
         58263, 13299,   119,   102]])

In [57]:
import torch
dummy_input = input_ids  # 用于生成ONNX模型的示例输入
torch.onnx.export(pt_model, dummy_input, "model.onnx", input_names=['input_ids'], output_names=['output'])

verbose: False, log level: Level.ERROR



# 3.ONNXRuntime运行起模型

In [58]:
import onnxruntime

# 加载导出的ONNX模型
onnx_model = "model.onnx"
ort_session = onnxruntime.InferenceSession(onnx_model)

# 使用ONNX模型进行推理
input_name = ort_session.get_inputs()[0].name
output_name = ort_session.get_outputs()[0].name
ort_inputs = {input_name: input_ids.numpy()}
ort_outs = ort_session.run([output_name], ort_inputs)
print(ort_outs)

[array([[-2.6222005, -2.7745323, -0.8966625,  2.0137324,  3.306386 ]],
      dtype=float32)]


In [70]:
import onnx

model_path = "model.onnx"
model = onnx.load(model_path)

# 查找模型的输入信息
input_info = model.graph.input[0]
input_name = input_info.name
input_shape = [dim.dim_value for dim in input_info.type.tensor_type.shape.dim]
input_dtype = input_info.type.tensor_type.elem_type

# 查找模型的输出信息
output_info = model.graph.output[0]
output_name = output_info.name
output_shape = [dim.dim_value for dim in output_info.type.tensor_type.shape.dim]
output_dtype = output_info.type.tensor_type.elem_type

print("Input Name:", input_name)
print("Input Shape:", input_shape)
print("Input Data Type:", input_dtype)
print("Output Name:", output_name)
print("Output Shape:", output_shape)
print("Output Data Type:", output_dtype)

Input Name: input_ids
Input Shape: [1, 14]
Input Data Type: 7
Output Name: output
Output Shape: [1, 5]
Output Data Type: 1


# 4.部署Triton服务
```shell
docker run --gpus=all --rm --net=host -v ${PWD}:/models nvcr.io/nvidia/tritonserver:23.04-py3 triton-export --model=/models/model.onnx --output=/models/model_config.pbtxt
```

# 4.使用Client测试Triton服务

In [3]:
import tritonhttpclient

# 创建Triton客户端
triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")

In [None]:
import tritonclient.http as httpclient
import numpy as np

# 构建Triton请求
inputs = []
inputs.append(httpclient.InferInput('input_ids', [len(input_ids)], "INT64"))
inputs[0].set_data_from_numpy(input_ids)

outputs = []
outputs.append(httpclient.InferRequestedOutput('output__0'))