In [1]:
import sys

import os
import argparse
import tensorrt as trt

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)


output_dir = os.path.join("..", "onnx_models")


onnx_file_path = os.path.join(output_dir, "Multilingual_MiniLM_L12.onnx")
engine_file_path = os.path.join(output_dir, "test_v1.plan")
print('get start')
TRT_LOGGER = trt.Logger()
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
    config = builder.create_builder_config()
    #builder.max_workspace_size =( 1 << 30 ) * 2
    config.max_workspace_size =( 1 << 20 ) * 3 * 1024 # 3GB，可以根据需求改的更大
    builder.max_batch_size = 128
    config.set_flag(trt.BuilderFlag.FP16)
    #builder.fp16_mode = True
    # Parse model file
if not os.path.exists(onnx_file_path):
    print('ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.'.format(onnx_file_path))
    exit(0)
print('Loading ONNX file from path {}...'.format(onnx_file_path))
with open(onnx_file_path, 'rb') as model:
    print('Beginning ONNX file parsing')
    if not parser.parse(model.read()):
        print ('ERROR: Failed to parse the ONNX file.')
        for error in range(parser.num_errors):
            print (parser.get_error(error))
print(f"raw shape of {network.get_input(0).name} is: ", network.get_input(0).shape)
profile = builder.create_optimization_profile()
for temp_name in ['input_ids', 'attention_mask','token_type_ids']:
    profile.set_shape(
        input=temp_name,
        min=(1,1),
        opt=(1,64),
        max=(64,128),
    )
config.add_optimization_profile(profile)
print('Completed parsing of ONNX file')
print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
engine = builder.build_engine(network,config)
print("Completed creating Engine")
with open(engine_file_path, "wb") as f:
    f.write(engine.serialize())

get start
Loading ONNX file from path ../onnx_models/Multilingual_MiniLM_L12.onnx...
Beginning ONNX file parsing
[02/25/2022-09:28:15] [TRT] [W] onnx2trt_utils.cpp:366: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[02/25/2022-09:28:19] [TRT] [W] Output type must be INT32 for shape outputs
[02/25/2022-09:28:19] [TRT] [W] Output type must be INT32 for shape outputs
[02/25/2022-09:28:19] [TRT] [W] Output type must be INT32 for shape outputs
[02/25/2022-09:28:19] [TRT] [W] Output type must be INT32 for shape outputs
raw shape of input_ids is:  (-1, -1)
Completed parsing of ONNX file
Building an engine from file ../onnx_models/Multilingual_MiniLM_L12.onnx; this may take a while...


  engine = builder.build_engine(network,config)


[02/25/2022-09:28:20] [TRT] [W] TensorRT was linked against cuBLAS/cuBLAS LT 11.6.5 but loaded cuBLAS/cuBLAS LT 11.3.1
[02/25/2022-09:28:21] [TRT] [W] TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.1.1
[02/25/2022-09:28:21] [TRT] [W] Myelin graph with multiple dynamic values may have poor performance if they differ. Dynamic values are: 
[02/25/2022-09:28:21] [TRT] [W]  (# 1 (SHAPE input_ids))
[02/25/2022-09:28:21] [TRT] [W]  (# 0 (SHAPE attention_mask))
[02/25/2022-09:30:37] [TRT] [W] Myelin graph with multiple dynamic values may have poor performance if they differ. Dynamic values are: 
[02/25/2022-09:30:37] [TRT] [W]  (# 1 (SHAPE input_ids))
[02/25/2022-09:30:37] [TRT] [W]  (# 0 (SHAPE attention_mask))
[02/25/2022-09:33:13] [TRT] [W] Myelin graph with multiple dynamic values may have poor performance if they differ. Dynamic values are: 
[02/25/2022-09:33:13] [TRT] [W]  (# 1 (SHAPE input_ids))
[02/25/2022-09:33:13] [TRT] [W]  (# 0 (SHAPE attention_mask))
[02/25/2022-09:33:

In [2]:
import tensorrt as trt
import sys
import numpy as np
trt_logger = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(trt_logger)
with open(engine_file_path, "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
print("Engine Info:")
for i, binding in enumerate(engine):
    shape = [engine.max_batch_size, *engine.get_binding_shape(binding)]
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    volume = abs(trt.volume(engine.get_binding_shape(binding)))
    if engine.binding_is_input(binding):
        desc = "input"
    else:
        desc = "output"
    print(f"{i} type:    {desc}\n  binding: {binding} \n  data:    {np.dtype(dtype).name}\n  shape:   {shape} => {volume} \n")


[02/25/2022-09:33:24] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.

[02/25/2022-09:33:24] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 1860, GPU 3108 (MiB)
[02/25/2022-09:33:25] [TRT] [I] Loaded engine size: 452 MiB
[02/25/2022-09:33:25] [TRT] [W] TensorRT was linked against cuBLAS/cuBLAS LT 11.6.5 but loaded cuBLAS/cuBLAS LT 11.3.1
[02/25/2022-09:33:25] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 2541, GPU 3342 (MiB)
[02/25/2022-09:33:25] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 2541, GPU 3350 (MiB)
[02/25/2022-09:33:25] [TRT] [W] TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.1.1
[02/25/2022-09:33:25] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +224, now: CPU 0, GPU 480 (MiB

In [3]:
import os
import json

# Here we use paraphrase-multilingual-MiniLM-L12-v2 for demo.
big_model_path = "../models/paraphrase-multilingual-MiniLM-L12-v2"


modules_json_path = os.path.join(big_model_path, 'modules.json')
with open(modules_json_path) as fIn:
    modules_config = json.load(fIn)

tf_from_s_path = os.path.join(big_model_path, modules_config[0].get('path'))
print(tf_from_s_path)



max_seq_length = 128
doc_stride = 128
max_query_length = 64

# Enable overwrite to export onnx model and download latest script each time when running this notebook.
enable_overwrite = True

# Total samples to inference. It shall be large enough to get stable latency measurement.
total_samples = 1000

cache_dir = os.path.join(".", "cache_models")
cache_dir
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

# # Load pretrained model and tokenizer

from transformers import (AutoConfig, AutoModel, AutoTokenizer)

# Load pretrained model and tokenizer
config_class, model_class, tokenizer_class = (AutoConfig, AutoModel, AutoTokenizer)

config = config_class.from_pretrained(tf_from_s_path, cache_dir=cache_dir)
tokenizer = tokenizer_class.from_pretrained(tf_from_s_path, do_lower_case=True, cache_dir=cache_dir)
model = model_class.from_pretrained(tf_from_s_path, from_tf=False, config=config, cache_dir=cache_dir)


# Get the first example data to run the model and export it to ONNX

st = ['您好']
inputs = tokenizer(
    st,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)
inputs

../models/paraphrase-multilingual-MiniLM-L12-v2/0_Transformer


{'input_ids': tensor([[    0, 73014,  1322,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [8]:
context = engine.create_execution_context()

[02/25/2022-09:16:05] [TRT] [W] TensorRT was linked against cuBLAS/cuBLAS LT 11.6.5 but loaded cuBLAS/cuBLAS LT 11.3.1
[02/25/2022-09:16:05] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 5149, GPU 2071 (MiB)
[02/25/2022-09:16:05] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +10, now: CPU 5149, GPU 2081 (MiB)
[02/25/2022-09:16:05] [TRT] [W] TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.1.1
[02/25/2022-09:16:06] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +162, now: CPU 0, GPU 386 (MiB)


In [4]:

# inputs_dim_name = ['inputs_ids', 'attention_mask', 'token_type_ids']

# outputs_dims_name = ['start', 'end']


# input_idx = engine["inputs_ids"]



In [11]:
#  Copyright 2022, Lefebvre Dalloz Services
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
All the tooling to ease TensorRT usage.
"""

from typing import Callable, Dict, List, OrderedDict, Tuple

import tensorrt as trt
import torch
from tensorrt import ICudaEngine, IExecutionContext
from tensorrt.tensorrt import (
    Builder,
    IBuilderConfig,
    IElementWiseLayer,
    ILayer,
    INetworkDefinition,
    IOptimizationProfile,
    IReduceLayer,
    Logger,
    OnnxParser,
    Runtime,
)


def fix_fp16_network(network_definition: INetworkDefinition) -> INetworkDefinition:
    """
    Mixed precision on TensorRT can generate scores very far from Pytorch because of some operator being saturated.
    Indeed, FP16 can't store very large and very small numbers like FP32.
    Here, we search for some patterns of operators to keep in FP32, in most cases, it is enough to fix the inference
    and don't hurt performances.
    :param network_definition: graph generated by TensorRT after parsing ONNX file (during the model building)
    :return: patched network definition
    """
    # search for patterns which may overflow in FP16 precision, we force FP32 precisions for those nodes
    for layer_index in range(network_definition.num_layers - 1):
        layer: ILayer = network_definition.get_layer(layer_index)
        next_layer: ILayer = network_definition.get_layer(layer_index + 1)
        # POW operation usually followed by mean reduce
        if layer.type == trt.LayerType.ELEMENTWISE and next_layer.type == trt.LayerType.REDUCE:
            # casting to get access to op attribute
            layer.__class__ = IElementWiseLayer
            next_layer.__class__ = IReduceLayer
            if layer.op == trt.ElementWiseOperation.POW:
                layer.precision = trt.DataType.FLOAT
                next_layer.precision = trt.DataType.FLOAT
            layer.set_output_type(index=0, dtype=trt.DataType.FLOAT)
            next_layer.set_output_type(index=0, dtype=trt.DataType.FLOAT)
    return network_definition


def build_engine(
    runtime: Runtime,
    onnx_file_path: str,
    logger: Logger,
    min_shape: Tuple[int, int],
    optimal_shape: Tuple[int, int],
    max_shape: Tuple[int, int],
    workspace_size: int,
    fp16: bool,
    int8: bool,
) -> ICudaEngine:
    """
    Convert ONNX file to TensorRT engine.
    It supports dynamic shape, however it's advised to keep sequence length fix as it hurts performance otherwise.
    Dynamic batch size don't hurt performance and is highly advised.
    :param runtime: global variable shared accross inference call / model building
    :param onnx_file_path: path to the ONNX file
    :param logger: specific logger to TensorRT
    :param min_shape: the minimal shape of input tensors. It's advised to set first dimension (batch size) to 1
    :param optimal_shape: input tensor shape used for optimizations
    :param max_shape: maximal input tensor shape
    :param workspace_size: GPU memory to use during the building, more is always better. If there is not enough memory,
    some optimization may fail, and the whole conversion process will crash.
    :param fp16: enable FP16 precision, it usually provide a 20-30% boost compared to ONNX Runtime.
    :param int8: enable INT-8 quantization, best performance but model should have been quantized.
    :return: TensorRT engine to use during inference
    """
    with trt.Builder(logger) as builder:  # type: Builder
        with builder.create_network(
            flags=1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        ) as network_definition:  # type: INetworkDefinition
            with trt.OnnxParser(network_definition, logger) as parser:  # type: OnnxParser
                builder.max_batch_size = max_shape[0]  # max batch size
                config: IBuilderConfig = builder.create_builder_config()
                config.max_workspace_size = workspace_size
                # to enable complete trt inspector debugging, only for TensorRT >= 8.2
                # config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED
                # disable CUDNN optimizations
                config.set_tactic_sources(
                    tactic_sources=1 << int(trt.TacticSource.CUBLAS) | 1 << int(trt.TacticSource.CUBLAS_LT)
                )
                if int8:
                    config.set_flag(trt.BuilderFlag.INT8)
                if fp16:
                    config.set_flag(trt.BuilderFlag.FP16)
                config.set_flag(trt.BuilderFlag.DISABLE_TIMING_CACHE)
                # https://github.com/NVIDIA/TensorRT/issues/1196 (sometimes big diff in output when using FP16)
                config.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS)
                with open(onnx_file_path, "rb") as f:
                    parser.parse(f.read())
                profile: IOptimizationProfile = builder.create_optimization_profile()
                for num_input in range(network_definition.num_inputs):
                    profile.set_shape(
                        input=network_definition.get_input(num_input).name,
                        min=min_shape,
                        opt=optimal_shape,
                        max=max_shape,
                    )
                config.add_optimization_profile(profile)
                if fp16:
                    network_definition = fix_fp16_network(network_definition)
                trt_engine = builder.build_serialized_network(network_definition, config)
                engine: ICudaEngine = runtime.deserialize_cuda_engine(trt_engine)
                assert engine is not None, "error during engine generation, check error messages above :-("
                return engine


def get_output_tensors(
    context: trt.IExecutionContext,
    host_inputs: List[torch.Tensor],
    input_binding_idxs: List[int],
    output_binding_idxs: List[int],
) -> List[torch.Tensor]:
    """
    Reserve memory in GPU for input and output tensors.
    :param context: TensorRT context shared accross inference steps
    :param host_inputs: input tensor
    :param input_binding_idxs: indexes of each input vector (should be the same than during building)
    :param output_binding_idxs: indexes of each output vector (should be the same than during building)
    :return: tensors where output will be stored
    """
    # explicitly set dynamic input shapes, so dynamic output shapes can be computed internally
    for host_input, binding_index in zip(host_inputs, input_binding_idxs):
        context.set_binding_shape(binding_index, tuple(host_input.shape))
    assert context.all_binding_shapes_specified
    device_outputs: List[torch.Tensor] = []
    for binding_index in output_binding_idxs:
        # TensorRT computes output shape based on input shape provided above
        output_shape = context.get_binding_shape(binding_index)
        # allocate buffers to hold output results
        output = torch.empty(tuple(output_shape), device="cuda")
        device_outputs.append(output)
    return device_outputs


def infer_tensorrt(
    context: IExecutionContext,
    host_inputs: OrderedDict[str, torch.Tensor],
    input_binding_idxs: List[int],
    output_binding_idxs: List[int],
) -> List[torch.Tensor]:
    """
    Perform inference with TensorRT.
    :param context: shared variable
    :param host_inputs: input tensor
    :param input_binding_idxs: input tensor indexes
    :param output_binding_idxs: output tensor indexes
    :return: output tensor
    """
    input_tensors: List[torch.Tensor] = list()
    for tensor in host_inputs.values():
        assert isinstance(tensor, torch.Tensor), f"unexpected tensor type: {tensor.dtype}"
        # warning: small changes in output if int64 is used instead of int32
        tensor = tensor.type(torch.int32)
        tensor = tensor.to("cuda")
        input_tensors.append(tensor)
    # calculate input shape, bind it, allocate GPU memory for the output
    output_tensors: List[torch.Tensor] = get_output_tensors(
        context, input_tensors, input_binding_idxs, output_binding_idxs
    )
    bindings = [int(i.data_ptr()) for i in input_tensors + output_tensors]
    assert context.execute_async_v2(
        bindings, torch.cuda.current_stream().cuda_stream
    ), "failure during execution of inference"
    torch.cuda.current_stream().synchronize()  # sync all CUDA ops
    return output_tensors


def load_engine(
    runtime: Runtime, engine_file_path: str, profile_index: int = 0
) -> Callable[[Dict[str, torch.Tensor]], torch.Tensor]:
    """
    Load serialized TensorRT engine.
    :param runtime: shared variable
    :param engine_file_path: path to the serialized engine
    :param profile_index: which profile to load, 0 if you have not used multiple profiles
    :return: A function to perform inference
    """
    with open(file=engine_file_path, mode="rb") as f:
        engine: ICudaEngine = runtime.deserialize_cuda_engine(f.read())
        stream: int = torch.cuda.current_stream().cuda_stream
        context: IExecutionContext = engine.create_execution_context()
        context.set_optimization_profile_async(profile_index=profile_index, stream_handle=stream)
        # retrieve input/output IDs
        input_binding_idxs, output_binding_idxs = get_binding_idxs(engine, profile_index)  # type: List[int], List[int]

        def tensorrt_model(inputs: Dict[str, torch.Tensor]) -> torch.Tensor:
            return infer_tensorrt(
                context=context,
                host_inputs=inputs,
                input_binding_idxs=input_binding_idxs,
                output_binding_idxs=output_binding_idxs,
            )

        return tensorrt_model


def save_engine(engine: ICudaEngine, engine_file_path: str) -> None:
    """
    Serialize TensorRT engine to file.
    :param engine: TensorRT engine
    :param engine_file_path: output path
    """
    with open(engine_file_path, "wb") as f:
        f.write(engine.serialize())


def get_binding_idxs(engine: trt.ICudaEngine, profile_index: int):
    """
    Calculate start/end binding indices for current context's profile
    https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#opt_profiles_bindings
    :param engine: TensorRT engine generated during the model building
    :param profile_index: profile to use (several profiles can be set during building)
    :return: input and output tensor indexes
    """
    num_bindings_per_profile = engine.num_bindings // engine.num_optimization_profiles
    start_binding = profile_index * num_bindings_per_profile
    end_binding = start_binding + num_bindings_per_profile  # Separate input and output binding indices for convenience
    input_binding_idxs: List[int] = []
    output_binding_idxs: List[int] = []
    for binding_index in range(start_binding, end_binding):
        if engine.binding_is_input(binding_index):
            input_binding_idxs.append(binding_index)
        else:
            output_binding_idxs.append(binding_index)
    return input_binding_idxs, output_binding_idxs


In [12]:
engine = load_engine(engine_file_path=engine_file_path)

TypeError: load_engine() missing 1 required positional argument: 'runtime'