In [None]:
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import os
import ctypes
# cuda: https://nvidia.github.io/cuda-python/
import pycuda.driver as cuda
import tensorrt as trt
import torch
import pycuda.autoinit

# Path of the custom LayerNorm TensorRT plugin library; it is loaded with ctypes in the next cell.
soFile = "./layernorm_plugin.so"
# NOTE(review): `epsilon` is not referenced anywhere else in the visible notebook - confirm it is still needed.
epsilon = 1.0e-2
# Seed NumPy's global RNG so the random benchmark inputs created later are reproducible.
np.random.seed(97)

In [None]:
# Register TensorRT's bundled plugins, then load the custom plugin libraries.
logger = trt.Logger(trt.Logger.ERROR)
trt.init_libnvinfer_plugins(logger, '')
# The paths are spelled out here (instead of relying on whatever `soFile` currently holds)
# so that re-running this cell always loads both libraries. `soFile` still ends up as
# "./gelu.so", exactly as before.
for soFile in ("./layernorm_plugin.so", "./gelu.so"):
    ctypes.cdll.LoadLibrary(soFile)

In [None]:
# Load the channels-first LayerNorm plugin library into the process as well.
soFile = "./layernorm_chfirst_plugin.so"
ctypes.cdll.LoadLibrary(soFile)

In [None]:
def GiB(val):
    """Convert a size expressed in GiB to a whole number of bytes.

    Args:
        val: Size in GiB. May be an int or a float (e.g. ``0.5``); the shift-based
            original only worked for integers.

    Returns:
        int: ``val * 2**30``, truncated towards zero for fractional inputs.
    """
    return int(val * (1 << 30))

class HostDeviceMem:
    """Pairs a host-side buffer with the device allocation that mirrors it."""

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def free(self):
        """Drop the host reference and release the device allocation, if one is held."""
        self.host = None
        if self.device is None:
            return
        self.device.free()
        # Forget the handle so a later free() / __del__ does not release it twice.
        self.device = None

    def __del__(self):
        self.free()

    def __str__(self):
        return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

    def __repr__(self):
        return str(self)
    
    
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(ori_inputs, ori_outputs, engine, context, stream):
    """Allocate (or reuse) one host/device buffer pair per engine binding.

    Args:
        ori_inputs: Buffers returned by a previous call for the input bindings, or
            ``None`` on the first call.
        ori_outputs: Buffers returned by a previous call for the output bindings, or
            ``None`` on the first call.
        engine: TensorRT engine whose bindings are enumerated.
        context: Execution context; its current binding shapes determine buffer sizes.
        stream: Unused; kept so existing callers do not need to change.

    Returns:
        tuple: ``(inputs, outputs, bindings)`` where ``inputs``/``outputs`` are lists of
        ``HostDeviceMem`` and ``bindings`` holds the device addresses as ints, in
        binding order.

    A previous buffer is reused only when its host array occupies exactly the number
    of bytes the binding needs now. The earlier check compared a byte count against an
    element count, which let a too-small device allocation be reused after the batch
    size grew, and kept an oversized host output buffer after it shrank.
    """
    inputs = []
    outputs = []
    bindings = []
    nInput = int(np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)]))

    for i, binding in enumerate(engine):
        is_input = engine.binding_is_input(binding)
        size = trt.volume(context.get_binding_shape(i))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        required_nbytes = size * np.dtype(dtype).itemsize

        # Look up the buffer from the previous call, if any. The lists are None on the
        # first call (TypeError) and may be shorter than expected (IndexError).
        try:
            ori_mem = ori_inputs[i] if is_input else ori_outputs[i - nInput]
        except (TypeError, IndexError):
            ori_mem = None

        reusable = (
            ori_mem is not None
            and ori_mem.host is not None
            and ori_mem.device is not None
            and ori_mem.host.nbytes == required_nbytes
        )
        if reusable:
            host_mem = ori_mem.host
            device_mem = ori_mem.device
            # Hand ownership to the new wrapper so the old one does not free it again.
            ori_mem.device = None
        else:
            if ori_mem is not None:
                ori_mem.free()
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if is_input:
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings

In [None]:
def build_engine(onnx_file_path, enable_fp16 = False, max_batch_size = 256, max_workspace_size = 10, write_engine=True):
    """Parse an ONNX file and build a serialized TensorRT engine from it.

    Args:
        onnx_file_path: Path of the ONNX model to parse.
        enable_fp16: When true, set the FP16 builder flag.
        max_batch_size: Batch size used for both the opt and max shapes of the
            optimization profile (the min batch is 1).
        max_workspace_size: Workspace memory-pool limit, in GiB.
        write_engine: When true, also write the serialized engine next to the ONNX
            file, with the extension replaced by ``.trt``.

    Returns:
        The serialized engine returned by ``builder.build_serialized_network``.

    Raises:
        RuntimeError: If the ONNX file cannot be parsed or the engine build fails.
    """
    G_LOGGER = trt.Logger(trt.Logger.WARNING)

    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(G_LOGGER) as builder, builder.create_network(explicit_batch) as network, \
            trt.OnnxParser(network, G_LOGGER) as parser:

        config = builder.create_builder_config()
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, GiB(max_workspace_size))
        if enable_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        print('Loading ONNX file from path {}...'.format(onnx_file_path))
        with open(onnx_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            parsed = parser.parse(model.read())
        if not parsed:
            # Surface the parser diagnostics instead of building from a broken network.
            errors = [str(parser.get_error(i)) for i in range(parser.num_errors)]
            raise RuntimeError('Failed to parse ONNX file {}:\n{}'.format(onnx_file_path, '\n'.join(errors)))
        print('Completed parsing of ONNX file')
        print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
        # Key step: the network has a dynamic batch dimension, so an optimization
        # profile is required. The input tensor is expected to be named "input" with
        # shape (N, 3, 224, 224).
        profile = builder.create_optimization_profile()
        profile.set_shape("input", (1, 3, 224, 224), (max_batch_size, 3, 224, 224), (max_batch_size, 3, 224, 224))
        config.add_optimization_profile(profile)

        serialized_engine = builder.build_serialized_network(network, config)
        if serialized_engine is None:
            raise RuntimeError('Failed to build a TensorRT engine from {}'.format(onnx_file_path))
        print("Completed creating Engine")
        # Save the engine file
        if write_engine:
            onnx_path = os.path.realpath(onnx_file_path)
            # splitext also copes with paths that have no extension.
            engine_file_path = os.path.splitext(onnx_path)[0] + '.trt'
            with open(engine_file_path, "wb") as f:
                f.write(serialized_engine)
        return serialized_engine

In [None]:
# Generic over the number of inputs/outputs: both arguments are lists of
# HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream):
    """Run one inference on `stream` and return the host output buffers.

    Copies every input host buffer to its device buffer, enqueues execution with
    `bindings`, copies every output back to its host buffer, then blocks until the
    stream has finished.
    """
    # Host -> device for all inputs.
    for in_mem in inputs:
        cuda.memcpy_htod_async(in_mem.device, in_mem.host, stream)

    # Enqueue the engine execution on the same stream.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

    # Device -> host for all outputs.
    for out_mem in outputs:
        cuda.memcpy_dtoh_async(out_mem.host, out_mem.device, stream)

    # Wait for the copies and the execution to complete.
    stream.synchronize()

    # Only the host-side arrays are handed back to the caller.
    host_results = []
    for out_mem in outputs:
        host_results.append(out_mem.host)
    return host_results

In [None]:
class TRTClassify(object):
    """Runs inference with a serialized TensorRT engine loaded from disk.

    The instance owns the engine, one execution context, one CUDA stream and the
    host/device buffers, which are kept between calls and handed back to
    ``allocate_buffers`` for reuse.
    """

    def __init__(self, engine_path):
        """Deserialize the engine at `engine_path` and create its context and stream."""
        self.engine_path = engine_path
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.engine = self._get_engine()
        self.context = self.engine.create_execution_context()
        self.stream = cuda.Stream()
        self.inputs = None
        self.outputs = None


    def _get_engine(self):
        """Read the serialized engine file and deserialize it.

        Raises:
            RuntimeError: If TensorRT returns no engine for the file contents.
        """
        # Close the file deterministically instead of leaking the handle.
        with open(self.engine_path, 'rb') as f:
            serialized_engine = f.read()
        # NOTE(review): the runtime is kept referenced for the lifetime of the engine;
        # some TensorRT releases are reported to require this - confirm for the version in use.
        self.runtime = trt.Runtime(self.logger)
        engine = self.runtime.deserialize_cuda_engine(serialized_engine)
        if engine is None:
            raise RuntimeError('Failed to deserialize TensorRT engine: {}'.format(self.engine_path))
        return engine


    def detect(self, image_np_array, cuda_ctx = pycuda.autoinit.context):
        """Run the engine on a batch and return its outputs.

        Args:
            image_np_array: Input batch; its first dimension is used as the batch size
                and it is cast to the dtype of binding 0.
            cuda_ctx: CUDA context pushed for the duration of the call; pass a falsy
                value to skip the push/pop.

        Returns:
            list: One host array per output binding, reshaped to the binding shape.
            NOTE(review): these arrays are the buffers kept on ``self.outputs``, so a
            later call may overwrite them - copy if they must outlive the next call.
        """
        if cuda_ctx:
            cuda_ctx.push()
        try:
            batch_size = image_np_array.shape[0]
            # Dynamic input: set the batch dimension of binding 0 for this call.
            origin_inputshape = self.context.get_binding_shape(0)
            origin_inputshape[0] = batch_size
            self.context.set_binding_shape(0, (origin_inputshape))
            # NOTE(review): the profile is re-selected after the shape is set, as in the
            # original code; confirm this ordering against the TensorRT documentation.
            self.context.set_optimization_profile_async(0, self.stream.handle)

            self.inputs, self.outputs, bindings = allocate_buffers(self.inputs, self.outputs, self.engine, self.context, self.stream)
            np_type = trt.nptype(self.engine.get_binding_dtype(0))
            # Do inference
            self.inputs[0].host = np.ascontiguousarray(image_np_array.astype(np_type))
            trt_outputs = do_inference(self.context, bindings=bindings, inputs=self.inputs, outputs=self.outputs,
                                              stream=self.stream)
        finally:
            # Always restore the context stack, even when inference raises.
            if cuda_ctx:
                cuda_ctx.pop()

        nInput = np.sum([self.engine.binding_is_input(i) for i in range(self.engine.num_bindings)])
        nOutput = self.engine.num_bindings - nInput
        for i in range(nOutput):
            shape = self.context.get_binding_shape(nInput + i)
            trt_outputs[i] = trt_outputs[i].reshape(shape)
        return trt_outputs

    def __call__(self, x):
        """Shorthand for ``detect(x)`` with the default CUDA context."""
        return self.detect(x)

    def __del__(self):
        # Drop references one by one. Attributes may be missing when __init__ failed
        # part-way (e.g. the engine file does not exist), so a plain `del` is not safe.
        # The context is released before the engine and runtime it was created from.
        for name in ('inputs', 'outputs', 'stream', 'context', 'engine', 'runtime'):
            self.__dict__.pop(name, None)

## 融合Gelu试试

In [None]:
import onnx_graphsurgeon as gs
import onnx
import numpy as np

In [None]:
# Load the ONNX model that the GELU fusion below operates on.
onnx_graph = onnx.load('convnext_tiny_rm_gamma_rep_layernorm_gs.onnx')

In [None]:
# Convert the ONNX model into an ONNX GraphSurgeon graph so nodes can be edited.
onnx_gs_graph = gs.import_onnx(onnx_graph)

In [None]:
# Replace each decomposed GELU subgraph with a single 'Gelu' node.
gelu_idx = 0
# Iterate over a snapshot: Gelu nodes are appended to the graph inside the loop.
for node in list(onnx_gs_graph.nodes):

    # The pattern is anchored on an Add node whose output feeds the GELU subgraph.
    if node.op != 'Add':
        continue

    # The second consumer of the Add output must be a Mul.
    try:
        mul_node = node.o(1)
    except IndexError:
        continue
    if mul_node.op != 'Mul':
        continue

    # Its first consumer is expected to be the `* 0.5` Mul that ends the subgraph.
    # A Mul without consumers (e.g. a graph output) is not a match.
    try:
        mul_node = mul_node.o(0)
    except IndexError:
        continue

    if mul_node.op != 'Mul':
        continue

    # NOTE(review): only the Add -> Mul -> Mul chain is checked; the nodes in between
    # (presumably Div/Erf/Add) are not verified - confirm this cannot match other subgraphs.
    gelu_idx += 1
    gelu_name = 'Gelu-%d' % gelu_idx
    gelu_node = gs.Node('Gelu', name=gelu_name, inputs = node.outputs[0:1], outputs = mul_node.outputs[0:1])

    # Detach the old final Mul from its output so the new Gelu node is the only
    # producer; the orphaned nodes are removed by the later cleanup() call.
    mul_node.outputs.clear()
    onnx_gs_graph.nodes.append(gelu_node)

In [None]:
# Remove the nodes detached by the fusion above and restore topological order.
onnx_gs_graph = onnx_gs_graph.cleanup().toposort()

In [None]:
# Export the edited graph back to ONNX and write it to disk.
onnx.save(gs.export_onnx(onnx_gs_graph), "convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.onnx")

In [None]:
# Build an engine from convnext_tiny.onnx for batch sizes 1..256 (opt 256) with the GELU and LayerNorm plugins loaded.
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny.onnx" --workspace=11000 --saveEngine=convnext_tiny.trt --plugins="gelu.so" --plugins="layernorm_plugin.so"

In [None]:
# Build an engine from the GELU-fused ONNX model written above, with the same shapes and plugins.
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.onnx" --workspace=11000 --saveEngine=convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.trt --plugins="gelu.so" --plugins="layernorm_plugin.so"

## layernorm剩余

首先查看目前模型融合情况

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import pandas as pd
from trex import *

# Configure a wider output (for the wide graphs)
set_wide_display()

In [None]:
# Load the layer info and profile JSON files (as exported by trtexec runs in this notebook) into a trex EnginePlan.
plan = EnginePlan('./layer.json', './profile.json', './profile.metadata.json')

In [None]:
# Manual toggle: change True to False to use precision_formatter instead of layer_type_formatter.
formatter = layer_type_formatter if True else precision_formatter
# Convert the plan to a DOT graph and render it as SVG under the given name.
graph = to_dot(plan, formatter)
svg_name = render_dot(graph, 'convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.engine', 'svg')

In [None]:
import onnx_graphsurgeon as gs
import onnx
import numpy as np

In [None]:
onnx_graph = onnx.load('convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.onnx')
onnx_gs_graph = gs.import_onnx(onnx_graph)

# Fuse the remaining decomposed channels_first LayerNorm subgraphs
# (ReduceMean -> Sub -> {Pow ..., Div} -> Mul -> Add) into a single 'LayerNorm'
# node wrapped in a pair of Transpose nodes.
layernorm_idx = 0
# Iterate over a snapshot: new nodes are appended to the graph inside the loop.
for node in list(onnx_gs_graph.nodes):

    if node.op != 'ReduceMean':
        continue

    # Collect up to two consumers of the mean; the first one must be the Sub.
    sub_nodes = list()
    try:
        for i in range(2):
            sub_nodes.append(node.o(i))
    except IndexError:
        pass
    if not sub_nodes or sub_nodes[0].op != 'Sub':
        continue

    # Among the consumers of each Sub, find the Div (normalisation) and the Pow (variance).
    div_node = None
    pow_node = None
    matched_sub_nodes = []
    for sub_node in sub_nodes:
        if sub_node.op != 'Sub':
            continue
        matched_sub_nodes.append(sub_node)
        try:
            for i in range(2):
                tmp_node = sub_node.o(i)
                if tmp_node.op == "Div":
                    div_node = tmp_node
                elif tmp_node.op == "Pow":
                    pow_node = tmp_node
        except IndexError:
            pass

    if div_node is None or pow_node is None:
        continue

    # The Div must be followed by Mul (weight) and then Add (bias).
    try:
        mul_node = div_node.o(0)
    except IndexError:
        continue
    if mul_node.op != 'Mul':
        continue

    try:
        add_node = mul_node.o(0)
    except IndexError:
        continue
    if add_node.op != 'Add':
        continue

    # Only channels_first LayerNorms are handled here (reduction axis is not -1).
    data_format = "channels_last" if int(node.attrs['axes'][0]) == -1 else "channels_first"
    if data_format != "channels_first":
        continue

    # Two consumers downstream of Pow sits the node whose second input carries eps.
    eps_node = pow_node.o(0).o(0)
    eps_tensor = eps_node.inputs[1]
    if isinstance(eps_tensor, gs.Constant):
        # eps stored directly as a constant tensor (e.g. an initializer).
        eps = eps_tensor.values
    else:
        # eps produced by a Constant node.
        eps = eps_tensor.inputs[0].attrs['value'].values

    # Weight and bias are whichever input of Mul / Add is a constant; variables have
    # no `.values`, which raises AttributeError.
    try:
        weight = mul_node.inputs[1].values
    except (AttributeError, IndexError):
        weight = mul_node.inputs[0].values

    try:
        bias = add_node.inputs[0].values
    except (AttributeError, IndexError):
        bias = add_node.inputs[1].values

    attrs = {
        'data_format':data_format,
        # Take the single element explicitly; float() on a 1-d array is deprecated in NumPy.
        'eps':float(np.asarray(eps).reshape(-1)[0])
    }

    # Create the transpose nodes around the fused LayerNorm
    # (presumably NCHW -> NHWC before and NHWC -> NCHW after - confirm against the plugin).
    layernorm_idx += 1
    layernorm_name = 'LayerNorm-CHFirst-%d' % layernorm_idx
    print('layernorm_name', layernorm_name)

    pre_transpose_name = 'PreTranspose-%d' % layernorm_idx
    pre_transpose_output = gs.Variable(name = pre_transpose_name + '_output')
    pre_transpose_node = gs.Node('Transpose', name=pre_transpose_name, attrs={'perm':np.int64([0,2,3,1])}, inputs = node.inputs[0:1], outputs = [pre_transpose_output])

    layernorm_output = gs.Variable(name = layernorm_name + '_output')
    weight_const = gs.Constant(name=layernorm_name+ "_weight", values=weight.reshape(-1))
    bias_const = gs.Constant(name=layernorm_name+ "_bias", values=bias.reshape(-1))
    new_layernorm_node = gs.Node('LayerNorm', name=layernorm_name, attrs=attrs, inputs = [pre_transpose_output, weight_const, bias_const], outputs = [layernorm_output])

    post_transpose_name = 'PostTranspose-%d' % layernorm_idx
    post_transpose_output = gs.Variable(name = post_transpose_name + '_output')
    post_transpose_node = gs.Node('Transpose', name=post_transpose_name, attrs={'perm':np.int64([0,3,1,2])}, inputs = [layernorm_output], outputs = add_node.outputs[0:1])

    # Disconnect the old subgraph so cleanup() removes it. The Sub nodes are tracked
    # explicitly instead of relying on the leaked loop variable.
    add_node.outputs.clear()
    node.inputs.clear()
    for sub_node in matched_sub_nodes:
        sub_node.inputs.clear()
    onnx_gs_graph.nodes.append(new_layernorm_node)
    onnx_gs_graph.nodes.append(pre_transpose_node)
    onnx_gs_graph.nodes.append(post_transpose_node)


onnx_gs_graph = onnx_gs_graph.cleanup().toposort()
onnx.save(gs.export_onnx(onnx_gs_graph), "convnext_tiny_rm_gamma_rep_layernorm_gs_2.onnx")

In [None]:
# Build and profile the transpose-wrapped LayerNorm model (gs_2); layer info and profile are exported to layer.json / profile.json.
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny_rm_gamma_rep_layernorm_gs_2.onnx" --workspace=11000 --dumpRefit --dumpProfile --profilingVerbosity=detailed --dumpLayerInfo --exportLayerInfo=layer.json --exportProfile=profile.json --saveEngine=convnext_tiny_rm_gamma_rep_layernorm_gs_2.engine --verbose --plugins="gelu.so" --plugins="layernorm_plugin.so"
        
# Build and profile convnext_tiny_fuse_gamma.onnx.
# NOTE(review): this exports to the same layer.json / profile.json as the preceding trtexec command in this cell, overwriting them - confirm which model later trex cells are meant to show.
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny_fuse_gamma.onnx" --workspace=11000 --dumpRefit --dumpProfile --profilingVerbosity=detailed --dumpLayerInfo --exportLayerInfo=layer.json --exportProfile=profile.json --saveEngine=convnext_tiny_fuse_gamma.trt --verbose --plugins="gelu.so" --plugins="layernorm_plugin.so"
            

In [None]:
onnx_graph = onnx.load('convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.onnx')
onnx_gs_graph = gs.import_onnx(onnx_graph)

# Fuse the remaining decomposed channels_first LayerNorm subgraphs
# (ReduceMean -> Sub -> {Pow ..., Div} -> Mul -> Add) into a single
# 'LayerNorm_CHFirst' node, without any transposes.
layernorm_idx = 0
# Iterate over a snapshot: new nodes are appended to the graph inside the loop.
for node in list(onnx_gs_graph.nodes):

    if node.op != 'ReduceMean':
        continue

    # Collect up to two consumers of the mean; the first one must be the Sub.
    sub_nodes = list()
    try:
        for i in range(2):
            sub_nodes.append(node.o(i))
    except IndexError:
        pass
    if not sub_nodes or sub_nodes[0].op != 'Sub':
        continue

    # Among the consumers of each Sub, find the Div (normalisation) and the Pow (variance).
    div_node = None
    pow_node = None
    matched_sub_nodes = []
    for sub_node in sub_nodes:
        if sub_node.op != 'Sub':
            continue
        matched_sub_nodes.append(sub_node)
        try:
            for i in range(2):
                tmp_node = sub_node.o(i)
                if tmp_node.op == "Div":
                    div_node = tmp_node
                elif tmp_node.op == "Pow":
                    pow_node = tmp_node
        except IndexError:
            pass

    if div_node is None or pow_node is None:
        continue

    # The Div must be followed by Mul (weight) and then Add (bias).
    try:
        mul_node = div_node.o(0)
    except IndexError:
        continue
    if mul_node.op != 'Mul':
        continue

    try:
        add_node = mul_node.o(0)
    except IndexError:
        continue
    if add_node.op != 'Add':
        continue

    # Only channels_first LayerNorms are handled here (reduction axis is not -1).
    data_format = "channels_last" if int(node.attrs['axes'][0]) == -1 else "channels_first"
    if data_format != "channels_first":
        continue

    # Two consumers downstream of Pow sits the node whose second input carries eps.
    eps_node = pow_node.o(0).o(0)
    eps_tensor = eps_node.inputs[1]
    if isinstance(eps_tensor, gs.Constant):
        # eps stored directly as a constant tensor (e.g. an initializer).
        eps = eps_tensor.values
    else:
        # eps produced by a Constant node.
        eps = eps_tensor.inputs[0].attrs['value'].values

    # Weight and bias are whichever input of Mul / Add is a constant; variables have
    # no `.values`, which raises AttributeError.
    try:
        weight = mul_node.inputs[1].values
    except (AttributeError, IndexError):
        weight = mul_node.inputs[0].values

    try:
        bias = add_node.inputs[0].values
    except (AttributeError, IndexError):
        bias = add_node.inputs[1].values

    attrs = {
        'data_format':data_format,
        # Take the single element explicitly; float() on a 1-d array is deprecated in NumPy.
        'eps':float(np.asarray(eps).reshape(-1)[0])
    }

    # Create the fused channels-first LayerNorm node; it consumes the original input
    # directly and produces the original Add output.
    layernorm_idx += 1
    layernorm_name = 'LayerNorm_CHFirst-%d' % layernorm_idx
    print('layernorm_name', layernorm_name)

    weight_const = gs.Constant(name=layernorm_name+ "_weight", values=weight.reshape(-1))
    bias_const = gs.Constant(name=layernorm_name+ "_bias", values=bias.reshape(-1))
    new_layernorm_node = gs.Node('LayerNorm_CHFirst', name=layernorm_name, attrs=attrs, inputs = [node.inputs[0], weight_const, bias_const], outputs = add_node.outputs[0:1])

    # Disconnect the old subgraph so cleanup() removes it. The Sub nodes are tracked
    # explicitly instead of relying on the leaked loop variable.
    add_node.outputs.clear()
    node.inputs.clear()
    for sub_node in matched_sub_nodes:
        sub_node.inputs.clear()
    onnx_gs_graph.nodes.append(new_layernorm_node)


onnx_gs_graph = onnx_gs_graph.cleanup().toposort()
onnx.save(gs.export_onnx(onnx_gs_graph), "convnext_tiny_rm_gamma_rep_layernorm_gs_3.onnx")

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import pandas as pd
from trex import *

# Configure a wider output (for the wide graphs)
set_wide_display()

# Load the most recently exported layer info / profile JSON files.
plan = EnginePlan('./layer.json', './profile.json', './profile.metadata.json')

# Manual toggle: change True to False to use precision_formatter instead of layer_type_formatter.
formatter = layer_type_formatter if True else precision_formatter
graph = to_dot(plan, formatter)
# NOTE(review): no visible trtexec command saves an engine with this exact name (the gs_2 engine is saved without "gelu") - confirm the intended output name.
svg_name = render_dot(graph, 'convnext_tiny_rm_gamma_rep_layernorm_gelu_gs_2.engine', 'svg')

In [None]:
# Build and profile the LayerNorm_CHFirst model (gs_3); this additionally loads the channels-first LayerNorm plugin.
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny_rm_gamma_rep_layernorm_gs_3.onnx" --workspace=11000 --dumpRefit --dumpProfile --profilingVerbosity=detailed --dumpLayerInfo --exportLayerInfo=layer.json --exportProfile=profile.json --saveEngine=convnext_tiny_rm_gamma_rep_layernorm_gs_3.engine --verbose --plugins="gelu.so" --plugins="layernorm_plugin.so" --plugins="layernorm_chfirst_plugin.so"

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import pandas as pd
from trex import *

# Configure a wider output (for the wide graphs)
set_wide_display()

# Load the most recently exported layer info / profile JSON files.
plan = EnginePlan('./layer.json', './profile.json', './profile.metadata.json')

# Manual toggle: change True to False to use precision_formatter instead of layer_type_formatter.
formatter = layer_type_formatter if True else precision_formatter
graph = to_dot(plan, formatter)
# Render the plan graph as SVG under the gs_3 engine name.
svg_name = render_dot(graph, 'convnext_tiny_rm_gamma_rep_layernorm_gs_3.engine', 'svg')

In [None]:
# Rebuild and profile the GELU-fused model; this overwrites layer.json / profile.json and the .trt engine saved by the earlier build of the same ONNX file.
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.onnx" --workspace=11000 --dumpRefit --dumpProfile --profilingVerbosity=detailed --dumpLayerInfo --exportLayerInfo=layer.json --exportProfile=profile.json --saveEngine=convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.trt --verbose --plugins="gelu.so" --plugins="layernorm_plugin.so" --plugins="layernorm_chfirst_plugin.so"

In [None]:
# Build the GELU-fused model again without profiling, saving it with an .engine extension this time.
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.onnx" --workspace=11000 --saveEngine=convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.engine --plugins="gelu.so" --plugins="layernorm_plugin.so"
        
# Rebuild the baseline convnext_tiny engine (same command as the earlier baseline build in this notebook).
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny.onnx" --workspace=11000 --saveEngine=convnext_tiny.trt --plugins="gelu.so" --plugins="layernorm_plugin.so"

In [None]:
# Build the engine for the LayerNorm-replaced model (before GELU fusion); it is benchmarked in the next cell.
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny_rm_gamma_rep_layernorm_gs.onnx" --workspace=11000 --saveEngine=convnext_tiny_rm_gamma_rep_layernorm_gs.engine --plugins="gelu.so" --plugins="layernorm_plugin.so" --plugins="layernorm_chfirst_plugin.so"

In [None]:
trt_cls_object = TRTClassify('./convnext_tiny.trt')
for i in range(100):
    trt_cls_object(inputs)
%timeit trt_cls_object(inputs)
del trt_cls_object

trt_cls_object = TRTClassify('./convnext_tiny_fuse_gamma.trt')
for i in range(100):
    trt_cls_object(inputs)
%timeit trt_cls_object(inputs)
del trt_cls_object

trt_cls_object = TRTClassify('./convnext_tiny_rm_gamma_rep_layernorm_gs.engine')
for i in range(100):
    trt_cls_object(inputs)
%timeit trt_cls_object(inputs)
del trt_cls_object

trt_cls_object = TRTClassify('./convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.trt')
for i in range(100):
    trt_cls_object(inputs)
%timeit trt_cls_object(inputs)
del trt_cls_object


inputs = np.random.rand(256,3,224,224).astype(np.float32)
trt_cls_object = TRTClassify('./convnext_tiny_rm_gamma_rep_layernorm_gs_2.engine')
for i in range(100):
    trt_cls_object(inputs)
%timeit trt_cls_object(inputs)
del trt_cls_object

trt_cls_object = TRTClassify('./convnext_tiny_rm_gamma_rep_layernorm_gs_3.engine')
for i in range(100):
    trt_cls_object(inputs)
%timeit trt_cls_object(inputs)
del trt_cls_object

In [None]:
# Build FP16 (--fp16) engines for all five model variants; each is saved with a "-fp16.engine" suffix.
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny_rm_gamma_rep_layernorm_gs_2.onnx" --workspace=11000 --saveEngine=convnext_tiny_rm_gamma_rep_layernorm_gs_2-fp16.engine --plugins="gelu.so" --plugins="layernorm_plugin.so" --plugins="layernorm_chfirst_plugin.so" --fp16
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny_rm_gamma_rep_layernorm_gs_3.onnx" --workspace=11000 --saveEngine=convnext_tiny_rm_gamma_rep_layernorm_gs_3-fp16.engine --plugins="gelu.so" --plugins="layernorm_plugin.so" --plugins="layernorm_chfirst_plugin.so" --fp16
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny_fuse_gamma.onnx" --workspace=11000 --saveEngine=convnext_tiny_fuse_gamma-fp16.engine --plugins="gelu.so" --plugins="layernorm_plugin.so" --plugins="layernorm_chfirst_plugin.so" --fp16
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny_rm_gamma_rep_layernorm_gelu_gs.onnx" --workspace=11000 --saveEngine=convnext_tiny_rm_gamma_rep_layernorm_gelu_gs-fp16.engine --plugins="gelu.so" --plugins="layernorm_plugin.so" --plugins="layernorm_chfirst_plugin.so" --fp16
!trtexec --minShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --optShapes=input:256x3x224x224 --onnx="./convnext_tiny.onnx" --workspace=11000 --saveEngine=convnext_tiny-fp16.engine --plugins="gelu.so" --plugins="layernorm_plugin.so" --plugins="layernorm_chfirst_plugin.so" --fp16

In [None]:
inputs = np.random.rand(256,3,224,224).astype(np.float32)
trt_cls_object = TRTClassify('./convnext_tiny_rm_gamma_rep_layernorm_gs_2-fp16')
for i in range(100):
    trt_cls_object(inputs)
%timeit trt_cls_object(inputs)
del trt_cls_object

trt_cls_object = TRTClassify('./convnext_tiny_rm_gamma_rep_layernorm_gs_3-fp16.engine')
for i in range(100):
    trt_cls_object(inputs)
%timeit trt_cls_object(inputs)
del trt_cls_object

trt_cls_object = TRTClassify('./convnext_tiny_fuse_gamma-fp16.engine')
for i in range(100):
    trt_cls_object(inputs)
%timeit trt_cls_object(inputs)
del trt_cls_object

trt_cls_object = TRTClassify('./convnext_tiny_rm_gamma_rep_layernorm_gelu_gs-fp16.engine')
for i in range(100):
    trt_cls_object(inputs)
%timeit trt_cls_object(inputs)
del trt_cls_object

trt_cls_object = TRTClassify('./convnext_tiny-fp16.engine')
for i in range(100):
    trt_cls_object(inputs)
%timeit trt_cls_object(inputs)
del trt_cls_object