In [2]:
##################################################################################
# The notebook is split into three parts:
# 1. tokenizer
# 2. transformer
# 3. pooling


# from multiprocessing.pool import Pool
import numpy as np
import onnxruntime
import psutil
from sympy import im
from transformers import (AutoConfig, AutoModel, AutoTokenizer)
import os
import json
from sentence_transformers.models import Pooling

from sentence_transformers import SentenceTransformer as sbert


In [3]:

##################################################################################
# Locate the transformer / tokenizer sub-module of the sentence-transformers model

big_model_path = "../models/paraphrase-multilingual-MiniLM-L12-v2"

# modules.json is indexed below as a list whose entries carry a 'path' key.
modules_json_path = os.path.join(big_model_path, 'modules.json')
# Read the JSON explicitly as UTF-8 instead of relying on the platform's
# default text encoding, which differs between operating systems and locales.
with open(modules_json_path, encoding='utf-8') as fIn:
    modules_config = json.load(fIn)

# Entry 0 is taken as the directory holding the transformer and tokenizer files.
tf_from_s_path = os.path.join(big_model_path, modules_config[0].get('path'))


# Basic parameters
# NOTE(review): none of these five values is read by the cells shown in this
# review; confirm they are still needed before relying on them.

max_seq_length: int = 128
doc_stride: int = 128
max_query_length: int = 64

# Flag described as: re-export the ONNX model and re-download the latest script
# on every run of this notebook.
enable_overwrite: bool = True

# Number of samples to run inference on; it should be large enough for a
# stable latency measurement.
total_samples: int = 1000


# Load the pretrained config, tokenizer and model from the transformer folder.
config_class, model_class, tokenizer_class = AutoConfig, AutoModel, AutoTokenizer

# The same cache directory is handed to all three from_pretrained() calls.
cache_dir = os.path.join(".", "cache_models")

config = config_class.from_pretrained(tf_from_s_path, cache_dir=cache_dir)
tokenizer = tokenizer_class.from_pretrained(
    tf_from_s_path,
    cache_dir=cache_dir,
    do_lower_case=True,
)
model_transformer = model_class.from_pretrained(
    tf_from_s_path,
    config=config,
    cache_dir=cache_dir,
    from_tf=False,
)





In [4]:

##################################################################################
# Inference with ONNX Runtime on CUDA

device_name = 'gpu'

# Where the exported ONNX model is read from.
output_dir = os.path.join("..", "onnx_models")
export_model_path = os.path.join(output_dir, 'Multilingual_MiniLM_L12.onnx')

sess_options = onnxruntime.SessionOptions()
# Please change the value according to the best setting in the Performance Test Tool result.
sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
# Setting this path asks ONNX Runtime to also save the optimized graph next to
# the exported model (see the serialization warning printed below this cell).
sess_options.optimized_model_filepath = os.path.join(
    output_dir,
    "optimized_model_{}.onnx".format(device_name),
)

session = onnxruntime.InferenceSession(
    export_model_path,
    sess_options=sess_options,
    providers=['CUDAExecutionProvider'],
)


2022-02-25 13:23:41.641149590 [W:onnxruntime:, inference_session.cc:1407 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.


In [5]:
##################################################################################
# Load the pooling module

# Entry 1 of modules.json is taken as the directory of the pooling module.
pooling_model_path = os.path.join(
    big_model_path,
    modules_config[1].get('path'),
)
pooling_model = Pooling.load(pooling_model_path)

In [6]:

##################################################################################
# Build the model inputs and run them through the ONNX session

st = ['您好']
inputs = tokenizer(
    st,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

# Feed ONNX Runtime only the tensors that the exported graph declares as inputs.
# The tokenizer may return extra keys (for example token_type_ids) that the
# graph does not have, and session.run() rejects feed names it does not know.
onnx_input_names = {graph_input.name for graph_input in session.get_inputs()}
ort_inputs = {
    k: v.cpu().numpy()
    for k, v in inputs.items()
    if k in onnx_input_names
}

# Passing None as the first argument requests every output of the graph.
ort_outputs_gpu = session.run(None, ort_inputs)
len(ort_outputs_gpu)

2

In [7]:
# Shape of every array returned by the ONNX session.
[output.shape for output in ort_outputs_gpu]

[(1, 4, 384), (1, 384)]

In [8]:
import torch as t

# L2-normalise the second ONNX output along dim 1.
# NOTE(review): this uses output index 1 directly and never applies the
# `pooling_model` loaded earlier in the notebook; confirm that this is the
# vector meant to be compared with SentenceTransformer.encode().
t.nn.functional.normalize(
    t.Tensor(ort_outputs_gpu[1]),
    p=2,
    dim=1,
)

tensor([[ 5.0621e-02,  3.7583e-02,  2.5927e-02,  4.8808e-02,  1.4685e-02,
          2.8312e-02, -5.8918e-02,  4.8319e-02,  5.4536e-02,  8.5812e-02,
         -7.2008e-02, -6.9941e-02,  1.0563e-02,  1.3090e-02, -5.9875e-02,
          6.1503e-02,  1.6375e-02, -8.7087e-02,  2.5770e-02,  6.5209e-02,
          8.9911e-02,  2.5373e-02,  5.5006e-02,  6.1519e-03,  1.3298e-03,
          2.3704e-03,  6.1129e-02, -1.7243e-02,  2.6797e-02,  7.0297e-02,
          1.1463e-02,  1.3976e-02,  3.9850e-02,  1.0488e-02,  1.0850e-02,
          1.0893e-02,  3.9947e-02,  2.2994e-02,  8.4140e-03,  5.9997e-02,
         -4.8511e-02, -7.4952e-02, -5.5370e-02,  1.4101e-04,  7.7129e-02,
         -1.2071e-01,  4.1455e-03, -1.1519e-02,  2.3726e-02, -9.0686e-02,
         -1.6727e-02,  7.4755e-02,  7.0669e-02, -7.1630e-03, -9.9254e-02,
         -2.9167e-02,  6.5008e-02, -7.0093e-02, -3.6508e-02, -8.5294e-02,
         -2.6611e-02, -5.6600e-02, -3.8600e-03,  5.5077e-02,  3.0123e-02,
          4.9215e-02,  1.9368e-02, -2.

In [9]:
# Reference embedding from the stock sentence-transformers pipeline
model_sbert_raw = sbert(big_model_path)
# Reuse `st`, the same sentence list that was tokenized for the ONNX session,
# so both pipelines always encode identical input instead of a re-typed literal.
raw_encode = model_sbert_raw.encode(st)