### Pytorch model --> TensorRT examples

#### 1. Load and launch a pretrained model using Pytorch

**Load model**

In [1]:
from torchvision import models
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

**Define preprocessing step**

In [2]:
import cv2
import torch
from albumentations import Resize, Compose
from albumentations.pytorch.transforms import ToTensorV2
from albumentations.augmentations.transforms import Normalize

In [3]:
def preprocess_image(img_path):
    # Transform for image
    transforms = Compose([
        Resize(224, 224, interpolation=cv2.INTER_NEAREST),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])
    
    # Read image with cv2
    input_img = cv2.imread(img_path)
    
    # Transform image
    input_data = transforms(image=input_img)['image']
    
    # Convert to batch 1 image
    batch_data = torch.unsqueeze(input_data, 0)
    return batch_data

input = preprocess_image("resources/turkish_coffee.jpg").cuda()

**Inference and Postprocess**

In [4]:
# Inference with model
model.eval()
model.cuda()
output = model(input)

In [5]:
def postprocess(output_data):
    # Get class name
    with open('resources/imagenet_classes.txt') as f:
        classes = [line.split(',')[1].strip() for line in f.readlines()]
    
    # Calculate score  
    confidences = torch.nn.functional.softmax(output_data, dim=1)[0] * 100
    
    # Find top predicted classes
    _, indices = torch.sort(output_data, descending=True)
    i = 0
    
    while confidences[indices[0][i]] > 0.5:
        class_idx = indices[0][i]
        
        print(
            "Class:",
            classes[class_idx],
            ", confidence:",
            confidences[class_idx].item(),
            "%, index:",
            class_idx.item()
        )
        
        i += 1
        
postprocess(output)
        


Class: cup , confidence: 94.98472595214844 %, index: 968
Class: espresso , confidence: 3.7401280403137207 %, index: 967
Class: coffee_mug , confidence: 0.6143011450767517 %, index: 504


#### 2. Convert the pytorch model to ONNX format

In [7]:
ONNX_FILE_PATH = 'results/resnet50.onnx'

dynamic_axes = {
    "input":{
        0: "batch_size"
    }, 
    "output":{
        0: "batch_size"
    }
}

torch.onnx.export(
    model,
    input,
    ONNX_FILE_PATH,
    input_names=['input'],
    output_names=['output'],
    export_params=True,
    dynamic_axes=dynamic_axes
)

In [8]:
import onnx

onnx_model = onnx.load(ONNX_FILE_PATH)
onnx.checker.check_model(onnx_model)

#### 3. Visualize model

In [9]:
# Use netron to visualize onnx model
# !netron

#### 4. Initialize model in TensorRT

In [10]:
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt
import time

#### 5. Main pipeline

In [31]:
# Build TensorRT serialized engine and save it
def build_serialized_engine(logger, onnx_file_path, save_path, min_batch=1, opt_batch=8, max_batch=32, workspace_size=1):
    start = time.time()
    explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    
    # Initialize TensorRT engine and parse ONNX model
    with trt.Builder(logger) as builder, \
        builder.create_network(explicit_batch_flag) as network, \
        builder.create_builder_config() as config:
        
        # Parse ONNX to network
        parser = trt.OnnxParser(network, TRT_LOGGER)
        with open(onnx_file_path, 'rb') as model:
            print('Begin parsing ONNX file')
            parser.parse(model.read())
        print('Completed parsing ONNX model')

        # Config builder
        config = builder.create_builder_config()
        # allow TensorRT to use up to 1GB of GPU memory for tactic selection
        if workspace_size != 0:
            config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_size * (1024 * 1024))
        # use FP16 mode if possible
        if builder.platform_has_fast_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        # Add optimization profile for 'input' (required when using dynamic batch size)
        profile = builder.create_optimization_profile()
        profile.set_shape("input", (min_batch, 3, 224, 224), (opt_batch, 3, 224, 224), (max_batch, 3, 224, 224))
        config.add_optimization_profile(profile)

        # generate TensorRT engine optimized for the target platform
        print('Building an engine...')
        serialized_engine = builder.build_serialized_network(network, config)
        # engine = runtime.deserialize_cuda_engine(plan)
        # context = engine.create_execution_context()
        print("Completed creating Engine after {:.2f}s".format(time.time() - start))
        
        with open(save_path, 'wb') as f:
            f.write(serialized_engine)
 
        print('Write serialize engine to {}!'.format(save_path))
    

# Deserialized engine
def build_deserialized_engine(logger, engine_path):
    
    # Load saved engine
    with open (engine_path, 'rb') as f:
        serialized_engine = f.read()
    
    # Deserialize engine
    with trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(serialized_engine)
    
    return engine

In [36]:
# Logger to capture errors, warnings and other infomation
TRT_LOGGER = trt.Logger()

# Build engine
SERIALIZED_ENGINE_PATH = 'results/resnet50.engine'

# Initialize TensorRT engine and parse ONNX model
# batch_size = 1
workspace_size = 2 # GB

build_serialized_engine(
    TRT_LOGGER,
    ONNX_FILE_PATH,
    SERIALIZED_ENGINE_PATH,
    workspace_size
)

Begin parsing ONNX file[02/24/2023-13:14:58] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars

Completed parsing ONNX model
Building an engine...
[02/24/2023-13:14:58] [TRT] [W] TensorRT was linked against cuDNN 8.6.0 but loaded cuDNN 8.5.0
[02/24/2023-13:16:00] [TRT] [W] Try increasing the workspace size to 4194304 bytes to get better performance.
[02/24/2023-13:16:00] [TRT] [W] Try increasing the workspace size to 4194304 bytes to get better performance.
[02/24/2023-13:16:00] [TRT] [W] Try increasing the workspace size to 4194304 bytes to get better performance.
[02/24/2023-13:16:00] [TRT] [W] Try increasing the workspace size to 4194304 bytes to get better performance.
[02/24/2023-13:16:00] [TRT] [W] Try increasing the workspace size to 4194304 bytes to get better performance.
[02/24/2023-13:16:01] [TRT] [W] Try increasing the wo

In [75]:
def create_context(engine, input_shape, stream_handle):
    context = engine.create_execution_context()
    context.set_optimization_profile_async(0, stream_handle)
    context.set_input_shape('input', trt.Dims4(list(input_shape)))
    return context

In [88]:
def main():
    
    # Load and deserialize engine
    print('Loading and deserializing engine')
    engine = build_deserialized_engine(TRT_LOGGER, SERIALIZED_ENGINE_PATH)
    
    # Inference batch size
    batch_size = 4
    sample_image = preprocess_image("resources/turkish_coffee.jpg").numpy()
    batch_images = np.concatenate([sample_image] * batch_size, axis=0)
    
    # Get sizes of input and output and allocate memory required for input and output data
    for idx, _tensor in enumerate(engine): # inputs and outputs
        name = engine.get_tensor_name(idx)
        
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: # in case it is input
            input_shape = engine.get_tensor_shape(_tensor)
            input_shape[0] = batch_size
            
            input_size = trt.volume(input_shape) * np.dtype(np.float32).itemsize # in bytes
            device_input = cuda.mem_alloc(input_size)
        else: # output
            output_shape = engine.get_tensor_shape(_tensor)
            output_shape[0] = batch_size
            
            # Create page-locked memory buffer (i.e. won't be swapped to disk)
            host_output = cuda.pagelocked_empty(trt.volume(output_shape), dtype=np.float32)
            device_output = cuda.mem_alloc(host_output.nbytes)
            
    # Create a stream in which to copy inputs/outputs and run inference
    stream = cuda.Stream()
    
    print('Inferencing')
    
    # Preprocess input data
    host_input = np.array(batch_images, dtype=np.float32, order='C')
    cuda.memcpy_htod_async(device_input, host_input, stream)
    
    # Create context
    context = create_context(engine, host_input.shape, stream.handle)
    
    # Run inference
    context.execute_async_v2(bindings=[int(device_input), int(device_output)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_output, device_output, stream)
    stream.synchronize()
    
    # Post processing (return value in host_output is one dimension array)
    tensor_output = torch.Tensor(host_output)
    output_data = tensor_output.reshape(batch_size, int(tensor_output.shape[0] / batch_size))
    for i, output in enumerate(output_data):
        print('=== Image {} ==='.format(i))
        postprocess(output.reshape(1, -1))

In [89]:
import os
os.environ['CUDA_MODULE_LOADING']='LAZY'
main()

Loading and deserializing engine
Inferencing
[02/24/2023-13:31:55] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
=== Image 0 ===
Class: cup , confidence: 94.96162414550781 %, index: 968
Class: espresso , confidence: 3.7400457859039307 %, index: 967
Class: coffee_mug , confidence: 0.6201604008674622 %, index: 504
=== Image 1 ===
Class: cup , confidence: 94.96162414550781 %, index: 968
Class: espresso , confidence: 3.7400457859039307 %, index: 967
Class: coffee_mug , confidence: 0.6201604008674622 %, index: 504
=== Image 2 ===
Class: cup , confidence: 94.96162414550781 %, index: 968
Class: espresso , confidence: 3.7400457859039307 %, index: 967
Class: coffee_mug , confidence: 0.6201604008674622 %, index: 504
=== Image 3 ===
Class: cup , confidence: 94.96162414550781 %, index: 968
Class: espresso , confidence: 3.7400457859039307 %, 

In [91]:
[*(1, 2, 3)]

[1, 2, 3]