In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import tensorrt as trt

In [9]:
from cifar10_models.vgg import vgg11_bn, vgg13_bn, vgg16_bn, vgg19_bn

# Untrained model
model = vgg19_bn()

# Pretrained model
model = vgg19_bn(pretrained=True)
model.eval() # for evaluation

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256

In [None]:
dummy_input = torch.randn(1, 3, 32, 32)

onnx_file_path = "vgg19_bn.onnx"
torch.onnx.export(
    model,                  
    dummy_input,            
    onnx_file_path,         
    export_params=True, 
    opset_version=11,       
    do_constant_folding=True, 
)

print(f"Model exported to {onnx_file_path}")

Model exported to vgg19_bn.onnx


In [None]:
for _ in range(10):
    model(dummy_input)

In [126]:
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

with torch.no_grad():
    start_event.record()
    output = model(dummy_input)
    end_event.record()
    
torch.cuda.synchronize()

elapsed_time = start_event.elapsed_time(end_event)
print(elapsed_time/1000)

0.019340063095092775


In [18]:
!trtexec --onnx="vgg19_bn.onnx" --saveEngine="full_bit.trt"

&&&& RUNNING TensorRT.trtexec [TensorRT v8601] # trtexec --onnx=vgg19_bn.onnx --saveEngine=full_bit.trt
[12/23/2024-04:03:10] [I] === Model Options ===
[12/23/2024-04:03:10] [I] Format: ONNX
[12/23/2024-04:03:10] [I] Model: vgg19_bn.onnx
[12/23/2024-04:03:10] [I] Output:
[12/23/2024-04:03:10] [I] === Build Options ===
[12/23/2024-04:03:10] [I] Max batch: explicit batch
[12/23/2024-04:03:10] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[12/23/2024-04:03:10] [I] minTiming: 1
[12/23/2024-04:03:10] [I] avgTiming: 8
[12/23/2024-04:03:10] [I] Precision: FP32
[12/23/2024-04:03:10] [I] LayerPrecisions: 
[12/23/2024-04:03:10] [I] Layer Device Types: 
[12/23/2024-04:03:10] [I] Calibration: 
[12/23/2024-04:03:10] [I] Refit: Disabled
[12/23/2024-04:03:10] [I] Version Compatible: Disabled
[12/23/2024-04:03:10] [I] TensorRT runtime: full
[12/23/2024-04:03:10] [I] Lean DLL Path: 
[12/23/2024-04:03:10] [I] Tempfile Controls: { in_memory: allow, 

In [136]:
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # Auto initialize PyCUDA

def load_engine(engine_file_path):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

def allocate_buffers_without_stream(engine):
    inputs = []
    outputs = []
    bindings = []

    for binding in engine:
        size = trt.volume(engine.get_tensor_shape(binding)) 
        dtype = trt.nptype(engine.get_tensor_dtype(binding))
        device_mem = cuda.mem_alloc(size * np.dtype(dtype).itemsize)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append(device_mem)
        else:
            outputs.append(device_mem)
    return inputs, outputs, bindings

def infer_without_stream(inputs, outputs, bindings, input_data, context):
    cuda.memcpy_htod(inputs[0], input_data.ravel())
    context.execute_v2(bindings)
    output_data = np.empty((1, 10), dtype=np.float32) 
    cuda.memcpy_dtoh(output_data, outputs[0])
    return output_data


In [137]:
engine_file_path = "8bit.trt"
# engine_file_path = "16bit.trt"
# engine_file_path = "full_bit.trt"

engine = load_engine(engine_file_path)
results = []
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

inputs, outputs, bindings = allocate_buffers_without_stream(engine)
context = engine.create_execution_context()
input_data = np.random.random((1, 3, 32, 32)).astype(np.float32)

# Warm-up
for _ in range(10):
    infer_without_stream(inputs, outputs, bindings, input_data, context)

for _ in range(30):
    start_event.record()
    output = infer_without_stream(inputs, outputs, bindings, input_data, context)
    end_event.record()
    torch.cuda.synchronize()

    elapsed_time = start_event.elapsed_time(end_event)
    results.append(elapsed_time)

mean_result = np.mean(results)
print("inference time : ", mean_result/1000)


inference time :  0.0003026933342218399


  if engine.binding_is_input(binding):


In [77]:
!trtexec --onnx="vgg19_bn.onnx" --saveEngine="16bit.trt" --fp16

&&&& RUNNING TensorRT.trtexec [TensorRT v8601] # trtexec --onnx=vgg19_bn.onnx --saveEngine=16bit.trt --fp16
[12/23/2024-04:24:38] [I] === Model Options ===
[12/23/2024-04:24:38] [I] Format: ONNX
[12/23/2024-04:24:38] [I] Model: vgg19_bn.onnx
[12/23/2024-04:24:38] [I] Output:
[12/23/2024-04:24:38] [I] === Build Options ===
[12/23/2024-04:24:38] [I] Max batch: explicit batch
[12/23/2024-04:24:38] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[12/23/2024-04:24:38] [I] minTiming: 1
[12/23/2024-04:24:38] [I] avgTiming: 8
[12/23/2024-04:24:38] [I] Precision: FP32+FP16
[12/23/2024-04:24:38] [I] LayerPrecisions: 
[12/23/2024-04:24:38] [I] Layer Device Types: 
[12/23/2024-04:24:38] [I] Calibration: 
[12/23/2024-04:24:38] [I] Refit: Disabled
[12/23/2024-04:24:38] [I] Version Compatible: Disabled
[12/23/2024-04:24:38] [I] TensorRT runtime: full
[12/23/2024-04:24:38] [I] Lean DLL Path: 
[12/23/2024-04:24:38] [I] Tempfile Controls: { in_memory

In [76]:
!trtexec --onnx="vgg19_bn.onnx" --saveEngine="8bit.trt" --int8

&&&& RUNNING TensorRT.trtexec [TensorRT v8601] # trtexec --onnx=vgg19_bn.onnx --saveEngine=8bit.trt --int8
[12/23/2024-04:23:58] [I] === Model Options ===
[12/23/2024-04:23:58] [I] Format: ONNX
[12/23/2024-04:23:58] [I] Model: vgg19_bn.onnx
[12/23/2024-04:23:58] [I] Output:
[12/23/2024-04:23:58] [I] === Build Options ===
[12/23/2024-04:23:58] [I] Max batch: explicit batch
[12/23/2024-04:23:58] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[12/23/2024-04:23:58] [I] minTiming: 1
[12/23/2024-04:23:58] [I] avgTiming: 8
[12/23/2024-04:23:58] [I] Precision: FP32+INT8
[12/23/2024-04:23:58] [I] LayerPrecisions: 
[12/23/2024-04:23:58] [I] Layer Device Types: 
[12/23/2024-04:23:58] [I] Calibration: Dynamic
[12/23/2024-04:23:58] [I] Refit: Disabled
[12/23/2024-04:23:58] [I] Version Compatible: Disabled
[12/23/2024-04:23:58] [I] TensorRT runtime: full
[12/23/2024-04:23:58] [I] Lean DLL Path: 
[12/23/2024-04:23:58] [I] Tempfile Controls: { in_

In [99]:
from torchvision import datasets, transforms

def load_cifar10_data(batch_size=1):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2471, 0.2435, 0.2616)),
    ])

    testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False)
    return testloader

In [100]:
def evaluate_cifar10(engine_file_path, batch_size=1):
    engine = load_engine(engine_file_path)
    inputs, outputs, bindings = allocate_buffers_without_stream(engine)
    context = engine.create_execution_context()

    testloader = load_cifar10_data(batch_size=batch_size)

    correct = 0
    total = 0

    for images, labels in testloader:
        images = images.numpy() 
        batch_size = images.shape[0]
        images = images.astype(np.float32) 

        for i in range(batch_size):
            output = infer_without_stream(engine, inputs, outputs, bindings, images[i:i+1], context)

            pred = np.argmax(output, axis=1)
            correct += (pred == labels[i].item())
            total += 1

    accuracy = correct / total
    print(f"Accuracy: {accuracy.item() * 100:.2f}%")


In [110]:
engine_file_path = "/workspace/aimet/bootcamp/new_int8.trt"  # Change to your engine file
evaluate_cifar10(engine_file_path)

  size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
  size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
  dtype = trt.nptype(engine.get_binding_dtype(binding))
  if engine.binding_is_input(binding):


[12/23/2024-05:17:27] [TRT] [W] The getMaxBatchSize() function should not be used with an engine built from a network created with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. This function will always return 1.
[12/23/2024-05:17:27] [TRT] [W] The getMaxBatchSize() function should not be used with an engine built from a network created with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. This function will always return 1.
Files already downloaded and verified
Accuracy: 93.90%


In [104]:
!git clone -b TensorRT https://github.com/yundogyeong/bootcamp.git

Cloning into 'bootcamp'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 21 (delta 6), reused 7 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (21/21), 11.46 KiB | 11.46 MiB/s, done.
Resolving deltas: 100% (6/6), done.
