In [1]:
import sys

import os
import argparse
import tensorrt as trt

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)


output_dir = os.path.join("..", "onnx_models")




onnx_file_path = os.path.join(output_dir, "Multilingual_MiniLM_L12.onnx")
engine_file_path = os.path.join(output_dir, "test_v1.plan")
print('get start')
TRT_LOGGER = trt.Logger()
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
    config = builder.create_builder_config()
    #builder.max_workspace_size =( 1 << 30 ) * 2
    config.max_workspace_size =( 1 << 20 ) * 3 * 1024 # 3GB，可以根据需求改的更大
    builder.max_batch_size = 128
    config.set_flag(trt.BuilderFlag.FP16)
    #builder.fp16_mode = True
    # Parse model file
if not os.path.exists(onnx_file_path):
    print('ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.'.format(onnx_file_path))
    exit(0)
print('Loading ONNX file from path {}...'.format(onnx_file_path))
with open(onnx_file_path, 'rb') as model:
    print('Beginning ONNX file parsing')
    if not parser.parse(model.read()):
        print ('ERROR: Failed to parse the ONNX file.')
        for error in range(parser.num_errors):
            print (parser.get_error(error))
print(f"raw shape of {network.get_input(0).name} is: ", network.get_input(0).shape)
profile = builder.create_optimization_profile()
for temp_name in ['input_ids', 'attention_mask','token_type_ids']:
    profile.set_shape(
        input=temp_name,
        min=(1,1),
        opt=(1,64),
        max=(64,128),
    )
config.add_optimization_profile(profile)
print('Completed parsing of ONNX file')
print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
engine = builder.build_engine(network,config)
print("Completed creating Engine")
with open(engine_file_path, "wb") as f:
    f.write(engine.serialize())

get start
Loading ONNX file from path ../onnx_models/Multilingual_MiniLM_L12.onnx...
Beginning ONNX file parsing
[02/25/2022-07:41:56] [TRT] [W] onnx2trt_utils.cpp:366: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[02/25/2022-07:42:00] [TRT] [W] Output type must be INT32 for shape outputs
[02/25/2022-07:42:00] [TRT] [W] Output type must be INT32 for shape outputs
[02/25/2022-07:42:00] [TRT] [W] Output type must be INT32 for shape outputs
[02/25/2022-07:42:00] [TRT] [W] Output type must be INT32 for shape outputs
raw shape of input_ids is:  (-1, -1)
Completed parsing of ONNX file
Building an engine from file ../onnx_models/Multilingual_MiniLM_L12.onnx; this may take a while...
[02/25/2022-07:42:00] [TRT] [W] Half2 support requested on hardware without native FP16 support, performance will be negatively affected.


  engine = builder.build_engine(network,config)


[02/25/2022-07:42:01] [TRT] [W] TensorRT was linked against cuBLAS/cuBLAS LT 11.6.5 but loaded cuBLAS/cuBLAS LT 11.6.1
[02/25/2022-07:42:01] [TRT] [W] Myelin graph with multiple dynamic values may have poor performance if they differ. Dynamic values are: 
[02/25/2022-07:42:01] [TRT] [W]  (# 1 (SHAPE input_ids))
[02/25/2022-07:42:01] [TRT] [W]  (# 0 (SHAPE attention_mask))
[02/25/2022-07:42:21] [TRT] [W] Myelin graph with multiple dynamic values may have poor performance if they differ. Dynamic values are: 
[02/25/2022-07:42:21] [TRT] [W]  (# 1 (SHAPE input_ids))
[02/25/2022-07:42:21] [TRT] [W]  (# 0 (SHAPE attention_mask))
[02/25/2022-07:42:25] [TRT] [W] TensorRT was linked against cuBLAS/cuBLAS LT 11.6.5 but loaded cuBLAS/cuBLAS LT 11.6.1
Completed creating Engine


In [2]:
import tensorrt as trt
import sys
import numpy as np
trt_logger = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(trt_logger)
with open(engine_file_path, "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
print("Engine Info:")
for i, binding in enumerate(engine):
    shape = [engine.max_batch_size, *engine.get_binding_shape(binding)]
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    volume = abs(trt.volume(engine.get_binding_shape(binding)))
    if engine.binding_is_input(binding):
        desc = "input"
    else:
        desc = "output"
    print(f"{i} type:    {desc}\n  binding: {binding} \n  data:    {np.dtype(dtype).name}\n  shape:   {shape} => {volume} \n")


[02/25/2022-07:42:27] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.

[02/25/2022-07:42:27] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 1574, GPU 1969 (MiB)
[02/25/2022-07:42:28] [TRT] [I] Loaded engine size: 901 MiB
[02/25/2022-07:42:28] [TRT] [W] TensorRT was linked against cuBLAS/cuBLAS LT 11.6.5 but loaded cuBLAS/cuBLAS LT 11.6.1
[02/25/2022-07:42:28] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +10, now: CPU 2928, GPU 2429 (MiB)
[02/25/2022-07:42:28] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 2928, GPU 2437 (MiB)
[02/25/2022-07:42:28] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +448, now: CPU 0, GPU 960 (MiB)
Engine Info:
0 type:    input
  binding: input_ids 
  data:    int32
  shape:   [128, -1, -1