# 探索下 trt 的代码

主要是 transformer-deploy_github/src/transformer_deploy/backends/trt_utils.py 下的.
主要还是熟悉下 API 和使用流程.

[TensorRT Core Concepts](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/coreConcepts.html)

The general TensorRT workflow consists of 3 steps:

1. Populate a tensorrt.INetworkDefinition either with a parser or by using the TensorRT Network API (see tensorrt.INetworkDefinition for more details). The tensorrt.Builder can be used to generate an empty tensorrt.INetworkDefinition.
2. Use the tensorrt.Builder to build a tensorrt.ICudaEngine using the populated tensorrt.INetworkDefinition.
3. Create a tensorrt.IExecutionContext from the tensorrt.ICudaEngine and use it to perform optimized inference.


补充一下 CUDA 的基础概念 [CUDA SEMANTICS](https://pytorch.org/docs/stable/notes/cuda.html#cuda-semantics)

In [28]:
import tensorrt as trt
import torch
from transformers import BertTokenizer
from tensorrt import ICudaEngine, IExecutionContext, ILayer, INetworkDefinition, Logger, Runtime
from tensorrt.tensorrt import Builder, IBuilderConfig, IElementWiseLayer, IOptimizationProfile, IReduceLayer, OnnxParser

import logging
from typing import Callable, Dict, List, Optional

In [3]:
def get_binding_idxs(engine: trt.ICudaEngine, profile_index: int):
    """
    Split the binding indices that belong to one optimization profile into inputs and outputs.
    https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#opt_profiles_bindings
    :param engine: TensorRT engine generated during the model building
    :param profile_index: profile to use (several profiles can be set during building)
    :return: (input binding indices, output binding indices), each in ascending order
    """
    # num_bindings counts the input/output names over all profiles; the integer division
    # gives the number of bindings that each optimization profile owns.
    per_profile = engine.num_bindings // engine.num_optimization_profiles
    first = profile_index * per_profile
    profile_bindings = range(first, first + per_profile)
    # Ask the engine once per index whether that binding is an input.
    is_input = [engine.binding_is_input(idx) for idx in profile_bindings]
    # Separate input and output binding indices for convenience.
    input_binding_idxs: List[int] = [idx for idx, flag in zip(profile_bindings, is_input) if flag]
    output_binding_idxs: List[int] = [idx for idx, flag in zip(profile_bindings, is_input) if not flag]
    return input_binding_idxs, output_binding_idxs

In [25]:
# Path of the serialized TensorRT engine and the optimization profile to activate.
engine_file_path = "./onnx/model_torch.trt"
profile_index = 0

# Logger for the Builder, ICudaEngine and Runtime; constructed at ERROR severity.
trt_logger: Logger = trt.Logger(trt.Logger.ERROR)
# The Runtime allows a serialized ICudaEngine to be deserialized.
runtime: Runtime = trt.Runtime(trt_logger)

# Only the read + deserialization need the file handle.
with open(file=engine_file_path, mode="rb") as f:
    # An ICudaEngine is what executes inference on a built network.
    engine: ICudaEngine = runtime.deserialize_cuda_engine(f.read())

# A CUDA stream is a linear sequence of execution that belongs to a specific device,
# independent from other streams (see the PyTorch "CUDA semantics" notes).
# NOTE: `.cuda_stream` does not appear to be covered in the torch docs; it is annotated as int here.
stream: int = torch.cuda.current_stream().cuda_stream

# Context for executing inference using the engine. Several contexts may exist for one
# engine, so the same engine can serve multiple batches simultaneously.
context: IExecutionContext = engine.create_execution_context()
# Select the optimization profile with async semantics on the stream above.
context.set_optimization_profile_async(profile_index=profile_index, stream_handle=stream)

# Retrieve the input/output binding indices of the selected profile.
input_binding_idxs, output_binding_idxs = get_binding_idxs(engine, profile_index)  # type: List[int], List[int]



In [37]:
# Many of the TensorRT methods encountered so far require this value to be False
engine.has_implicit_batch_dimension

False

In [33]:
import onnx
import torch
import numpy
from transformers import BertTokenizer
# Tokenizer loaded from the 'bert-base-uncased' checkpoint.
enc = BertTokenizer.from_pretrained("bert-base-uncased")

# Four sample sentences, each containing one [MASK] placeholder.
masked_sentences = [
    "Paris is the [MASK] of France.",
    "The primary [MASK] of the United States is English.",
    "A baseball game consists of at least nine [MASK].",
    "Topology is a branch of [MASK] concerned with the properties of geometric objects that remain unchanged under continuous transformations.",
]
# NOTE(review): presumably the token position of [MASK] in each sentence - confirm
pos_masks = [4, 3, 9, 6]

# Encode the whole batch as NumPy arrays, padded/truncated to 128 tokens.
inputs = enc(
    masked_sentences,
    return_tensors="np",
    padding="max_length",
    max_length=128,
    truncation=True,
)
# Show which fields the tokenizer produced.
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [35]:
# Encode only the first sentence, this time as PyTorch tensors padded/truncated to 128 tokens.
inputs = enc(
    masked_sentences[:1],
    return_tensors="pt",
    padding="max_length",
    max_length=128,
    truncation=True,
)
# Inspect which device type the tokenizer placed the tensors on.
inputs["input_ids"].device.type

'cpu'

In [38]:
def get_output_tensors(
    context: trt.IExecutionContext,
    host_inputs: List[torch.Tensor],
    input_binding_idxs: List[int],
    output_binding_idxs: List[int],
) -> Dict[str, torch.Tensor]:
    """
    Bind the input shapes on the context and reserve GPU memory for every output tensor.
    :param context: TensorRT context shared across inference steps
    :param host_inputs: input tensors, paired positionally with input_binding_idxs
    :param input_binding_idxs: indexes of each input vector (should be the same as during building)
    :param output_binding_idxs: indexes of each output vector (should be the same as during building)
    :return: dict mapping each output binding name to the tensor where the output will be stored
    """
    # Dynamic input shapes are set explicitly so that the output shapes can be
    # computed internally by TensorRT.
    for idx, tensor in zip(input_binding_idxs, host_inputs):
        context.set_binding_shape(idx, tuple(tensor.shape))

    def _allocate(idx: int):
        # Output shape as reported by the context after the input shapes were bound.
        shape = context.get_binding_shape(binding=idx)
        name = context.engine.get_binding_name(index=idx)
        # Uninitialised CUDA buffer that will hold the output results.
        return name, torch.empty(tuple(shape), device="cuda")

    return dict(_allocate(idx) for idx in output_binding_idxs)

In [39]:
def infer_tensorrt(
    context: IExecutionContext,
    inputs: Dict[str, torch.Tensor],
    input_binding_idxs: List[int],
    output_binding_idxs: List[int],
) -> Dict[str, torch.Tensor]:
    """
    Perform inference with TensorRT.
    :param context: shared variable
    :param inputs: input tensors keyed by binding name; every tensor must already be on a CUDA device
    :param input_binding_idxs: input tensor indexes
    :param output_binding_idxs: output tensor indexes
    :return: output Dict[tensor name, tensor value]
    :raises AssertionError: if an input is missing, is not a torch.Tensor, is not on CUDA,
        or if TensorRT reports that the execution failed
    """

    input_tensors: List[torch.Tensor] = list()
    # Inputs are looked up by binding name, so the ordering of the `inputs` dict does not matter.
    for i in range(context.engine.num_bindings):
        # Only input bindings are collected here.
        if not context.engine.binding_is_input(index=i):
            continue
        tensor_name = context.engine.get_binding_name(i)
        assert tensor_name in inputs, f"input not provided: {tensor_name}"
        tensor = inputs[tensor_name]
        assert isinstance(tensor, torch.Tensor), f"unexpected tensor class: {type(tensor)}"
        # The tensor must already live on the GPU; this function does not copy it there.
        assert tensor.device.type == "cuda", f"unexpected device type (trt only works on CUDA): {tensor.device.type}"
        # warning: small changes in output if int64 is used instead of int32
        # (torch.long is an alias of torch.int64, so a single comparison is enough)
        if tensor.dtype == torch.int64:
            logging.warning(f"using {tensor.dtype} instead of int32 for {tensor_name}, will be casted to int32")
            tensor = tensor.to(dtype=torch.int32)
        input_tensors.append(tensor)
    # calculate input shape, bind it, allocate GPU memory for the output
    outputs: Dict[str, torch.Tensor] = get_output_tensors(
        context, input_tensors, input_binding_idxs, output_binding_idxs
    )
    # data_ptr returns the address of the first element of the tensor.
    bindings = [int(i.data_ptr()) for i in input_tensors + list(outputs.values())]
    stream = torch.cuda.current_stream()
    # The launch must NOT live inside an `assert`: with `python -O` asserts are stripped,
    # which would skip the inference entirely and return uninitialised output buffers.
    succeeded = context.execute_async_v2(bindings, stream.cuda_stream)
    if not succeeded:
        # Same exception type and message as the former assert, but never optimised away.
        raise AssertionError("failure during execution of inference")
    # Execution is asynchronous, so wait for the stream before handing the outputs back.
    stream.synchronize()  # sync all CUDA ops

    return outputs

In [45]:
# These are the input and output dimensions; this TRT model has no dynamic dimensions.
print(input_binding_idxs, output_binding_idxs)
# Shapes are printed for the input bindings first, then for the output bindings.
for binding_index in [*input_binding_idxs, *output_binding_idxs]:
    print(context.get_binding_shape(binding_index))

[0, 1, 2] [3]
(1, 128)
(1, 128)
(1, 128)
(1, 128, 30522)


In [46]:
# Everything is in place, so try an actual inference run on the first sentence.
inputs = enc(
    masked_sentences[:1],
    return_tensors="pt",
    padding="max_length",
    max_length=128,
    truncation=True,
)
# Move every encoded tensor to the GPU before handing the batch to TensorRT.
inputs = {k: v.to("cuda") for k, v in inputs.items()}
infer_tensorrt(context, inputs, input_binding_idxs, output_binding_idxs)



{'logits': tensor([[[-6.5416, -6.5076, -6.5212,  ..., -5.8961, -5.7352, -3.8943],
          [-9.0158, -9.0488, -9.0614,  ..., -8.2589, -8.0348, -6.1794],
          [-8.6559, -9.0902, -8.7755,  ..., -7.4504, -5.3415, -9.6577],
          ...,
          [-8.8079, -9.0085, -8.9305,  ..., -8.1445, -9.2694, -5.3116],
          [-8.7577, -8.8732, -8.8879,  ..., -8.3378, -9.3688, -4.9300],
          [-8.7828, -9.0249, -8.9177,  ..., -8.3143, -8.8779, -6.5312]]],
        device='cuda:0')}