# Inference sentence-transformer Model with ONNX Runtime on CPU
inference `0_Transformer` of sentence-transformer, not the entire model


In [None]:
import os
import json

# Here we use paraphrase-multilingual-MiniLM-L12-v2 for demo.
big_model_path = "../models/paraphrase-multilingual-MiniLM-L12-v2"


: 

# 1. Load Pretrained Bert model

In [2]:
modules_json_path = os.path.join(big_model_path, 'modules.json')
with open(modules_json_path) as fIn:
    modules_config = json.load(fIn)

tf_from_s_path = os.path.join(big_model_path, modules_config[0].get('path'))
print(tf_from_s_path)

../models/paraphrase-multilingual-MiniLM-L12-v2/0_Transformer


Specify some model configuration variables and constant.

In [3]:

max_seq_length = 128
doc_stride = 128
max_query_length = 64

# Enable overwrite to export onnx model and download latest script each time when running this notebook.
enable_overwrite = True

# Total samples to inference. It shall be large enough to get stable latency measurement.
total_samples = 1000


In [4]:
cache_dir = os.path.join(".", "cache_models")
cache_dir
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

In [5]:
# # Load pretrained model and tokenizer

from transformers import (AutoConfig, AutoModel, AutoTokenizer)

# Load pretrained model and tokenizer
config_class, model_class, tokenizer_class = (AutoConfig, AutoModel, AutoTokenizer)

config = config_class.from_pretrained(tf_from_s_path, cache_dir=cache_dir)
tokenizer = tokenizer_class.from_pretrained(tf_from_s_path, do_lower_case=True, cache_dir=cache_dir)
model = model_class.from_pretrained(tf_from_s_path, from_tf=False, config=config, cache_dir=cache_dir)


In [6]:
# Get the first example data to run the model and export it to ONNX

st = ['您好']
inputs = tokenizer(
    st,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)
inputs

{'input_ids': tensor([[    0, 73014,  1322,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

# 2. Export the loaded model
Once the model is loaded, we can export the loaded PyTorch model to ONNX.

In [7]:
output_dir = os.path.join("..", "onnx_models")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
export_model_path = os.path.join(output_dir, 'Multilingual_MiniLM_L12.onnx')

import torch
device = torch.device("cpu")



# Set model to inference mode, which is required before exporting the model because some operators behave differently in
# inference and training mode.
model.eval()
model.to(device)

if enable_overwrite or not os.path.exists(export_model_path):
    with torch.no_grad():
        symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
        torch.onnx.export(model,                                            # model being run
                          args=tuple(inputs.values()),                      # model input (or a tuple for multiple inputs)
                          f=export_model_path,                              # where to save the model (can be a file or file-like object)
                          opset_version=11,                                 # the ONNX version to export the model to
                          do_constant_folding=True,                         # whether to execute constant folding for optimization
                          input_names=['input_ids',                         # the model's input names
                                       'attention_mask',
                                       'token_type_ids'],
                          output_names=['start', 'end'],                    # the model's output names
                          dynamic_axes={'input_ids': symbolic_names,        # variable length axes
                                        'attention_mask' : symbolic_names,
                                        'token_type_ids' : symbolic_names,
                                        'start' : symbolic_names,
                                        'end' : symbolic_names})
        print("Model exported at ", export_model_path)

Model exported at  ../onnx_models/Multilingual_MiniLM_L12.onnx


# 3. PyTorch Inference
Use PyTorch to evaluate an example input for comparison purpose.

In [8]:
import time
from tqdm import tqdm
# Measure the latency. It is not accurate using Jupyter Notebook, it is recommended to use standalone python script.
latency = []
with torch.no_grad():
    for i in tqdm(range(total_samples)):
        # data = dataset[i]
        # inputs = {
        #     'input_ids':      data[0].to(device).reshape(1, max_seq_length),
        #     'attention_mask': data[1].to(device).reshape(1, max_seq_length),
        #     'token_type_ids': data[2].to(device).reshape(1, max_seq_length)
        # }
        start = time.time()
        outputs = model(**inputs)
        latency.append(time.time() - start)
print("PyTorch {} Inference time = {} ms".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))

100%|██████████| 1000/1000 [00:16<00:00, 61.52it/s]

PyTorch cpu Inference time = 16.09 ms





# 4. Inference ONNX Model with ONNX Runtime
For Onnx Runtime 1.6.0 or older, OpenMP environment variables are very important for CPU inference of Bert model. Since 1.7.0, the official package is not built with OpenMP.

Now we inference the model with ONNX Runtime. Here we can see that OnnxRuntime has better performance than PyTorch.

In [10]:

import onnxruntime
import numpy

sess_options = onnxruntime.SessionOptions()

# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.
# Note that this will increase session creation time, so it is for debugging only.
sess_options.optimized_model_filepath = os.path.join(output_dir, "Multilingual-MiniLM-L12_optimized_model_cpu.onnx")

# For OnnxRuntime 1.7.0 or later, you can set intra_op_num_threads to set thread number like
#    sess_options.intra_op_num_threads=4
# Here we use the default value which is a good choice in most cases.

# Specify providers when you use onnxruntime-gpu for CPU inference.
session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CPUExecutionProvider'])

latency = []
for i in tqdm(range(2000)):
    # data = dataset[i]
    ort_inputs = {k:v.cpu().numpy() for k, v in inputs.items()}
    # ort_inputs = {
    #     'input_ids':  data[0].cpu().reshape(1, max_seq_length).numpy(),
    #     'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),
    #     'segment_ids': data[2].cpu().reshape(1, max_seq_length).numpy()
    # }
    start = time.time()
    ort_outputs = session.run(None, ort_inputs)
    latency.append(time.time() - start)
print("OnnxRuntime cpu Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f')))

2022-02-24 11:21:16.715169653 [W:onnxruntime:, inference_session.cc:1407 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.
100%|██████████| 2000/2000 [00:14<00:00, 137.57it/s]

OnnxRuntime cpu Inference time = 7.13 ms





# 5. Verifying

In [11]:
print("***** Verifying correctness *****")
for i in range(1):
    print('PyTorch and ONNX Runtime output {} are close:'.format(i), numpy.allclose(ort_outputs[i], outputs[i].cpu(), rtol=1e-05, atol=1e-05))

***** Verifying correctness *****
PyTorch and ONNX Runtime output 0 are close: True
