# So sánh việc dùng trtexec và TensorRT Python API

| Tiêu chí             | `trtexec`               | TensorRT Python API                  |
| -------------------- | ----------------------- | ------------------------------------ |
| Dễ dùng              | ✅ rất dễ                | ❌ cần học API                        |
| Dùng thử nhanh       | ✅                       | ❌                                    |
| Benchmark            | ✅ rất tốt               | ✅ (phức tạp hơn)                     |
| Production inference | ❌ không phù hợp         | ✅ chuẩn production                   |
| Control buffer       | ❌ không có              | ✅ full control                       |
| Dynamic batching     | ❌                       | ✅                                    |
| Debug timing         | ✅ rất mạnh              | ✅ nhưng phải tự log                  |
| Cài đặt              | Có sẵn khi cài TensorRT | Cần thêm pycuda, TensorRT Python SDK |


# Convert model thành engine file bằng TensorRT

## Cách 1: Dùng trtexec

In [None]:
# !wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-12.onnx

In [6]:
### 1️⃣ Kiểm tra file ONNX hợp lệ

import onnx
import os

onnx_path = "resnet50-v1-12.onnx"

# Kiểm tra file có tồn tại không
if not os.path.isfile(onnx_path):
    raise FileNotFoundError(f"Không tìm thấy file {onnx_path}")

# Load và kiểm tra model
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)
print("✅ ONNX model hợp lệ!")

✅ ONNX model hợp lệ!


In [7]:
### 2️⃣ Kiểm tra trtexec có sẵn

!which trtexec || echo "⚠️ trtexec chưa được cài đặt hoặc không nằm trong PATH."

/opt/tensorrt/bin/trtexec


In [8]:
### 3️⃣ Convert ONNX sang TensorRT Engine
# --onnx: Chỉ định file onnx.
# --saveEngine: Đầu ra file engine.
# --fp16: Bật mixed precision (FP16), tăng tốc độ nhưng vẫn giữ độ chính xác khá cao.
# Có thể thêm --int8 nếu đã calibrate mô hình.

!trtexec --onnx=resnet50-v1-12.onnx --saveEngine=resnet50.engine --fp16

&&&& RUNNING TensorRT.trtexec [TensorRT v8603] # trtexec --onnx=resnet50-v1-12.onnx --saveEngine=resnet50.engine --fp16
[06/15/2025-07:21:23] [I] === Model Options ===
[06/15/2025-07:21:23] [I] Format: ONNX
[06/15/2025-07:21:23] [I] Model: resnet50-v1-12.onnx
[06/15/2025-07:21:23] [I] Output:
[06/15/2025-07:21:23] [I] === Build Options ===
[06/15/2025-07:21:23] [I] Max batch: explicit batch
[06/15/2025-07:21:23] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[06/15/2025-07:21:23] [I] minTiming: 1
[06/15/2025-07:21:23] [I] avgTiming: 8
[06/15/2025-07:21:23] [I] Precision: FP32+FP16
[06/15/2025-07:21:23] [I] LayerPrecisions: 
[06/15/2025-07:21:23] [I] Layer Device Types: 
[06/15/2025-07:21:23] [I] Calibration: 
[06/15/2025-07:21:23] [I] Refit: Disabled
[06/15/2025-07:21:23] [I] Version Compatible: Disabled
[06/15/2025-07:21:23] [I] ONNX Native InstanceNorm: Disabled
[06/15/2025-07:21:23] [I] TensorRT runtime: full
[06/15/2025-07:21:2

In [9]:
### 4️⃣ Kiểm tra file engine sau khi build

engine_path = "resnet50.engine"

if os.path.isfile(engine_path):
    print("✅ TensorRT engine đã được tạo thành công!")
else:
    raise FileNotFoundError("❌ TensorRT engine chưa được tạo.")

✅ TensorRT engine đã được tạo thành công!


In [10]:
### 5️⃣ Benchmark TensorRT Engine
# --loadEngine: Chạy inference benchmark trên engine đã build
# --iterations=100: Chạy 100 lần để đánh giá tốc độ

!trtexec --loadEngine=resnet50.engine --iterations=100


&&&& RUNNING TensorRT.trtexec [TensorRT v8603] # trtexec --loadEngine=resnet50.engine --iterations=100
[06/15/2025-07:25:14] [I] === Model Options ===
[06/15/2025-07:25:14] [I] Format: *
[06/15/2025-07:25:14] [I] Model: 
[06/15/2025-07:25:14] [I] Output:
[06/15/2025-07:25:14] [I] === Build Options ===
[06/15/2025-07:25:14] [I] Max batch: 1
[06/15/2025-07:25:14] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[06/15/2025-07:25:14] [I] minTiming: 1
[06/15/2025-07:25:14] [I] avgTiming: 8
[06/15/2025-07:25:14] [I] Precision: FP32
[06/15/2025-07:25:14] [I] LayerPrecisions: 
[06/15/2025-07:25:14] [I] Layer Device Types: 
[06/15/2025-07:25:14] [I] Calibration: 
[06/15/2025-07:25:14] [I] Refit: Disabled
[06/15/2025-07:25:14] [I] Version Compatible: Disabled
[06/15/2025-07:25:14] [I] ONNX Native InstanceNorm: Disabled
[06/15/2025-07:25:14] [I] TensorRT runtime: full
[06/15/2025-07:25:14] [I] Lean DLL Path: 
[06/15/2025-07:25:14] [I] Tempfile

# Cách 2: Dùng TensorRT Python API

In [17]:
import tensorrt as trt
import os

# ==== CONFIG ==== #
ONNX_PATH = "resnet50-v1-12.onnx"
ENGINE_PATH = "resnet50.engine"
FP16_MODE = True
WORKSPACE_SIZE = 1 << 30  # 1GB

# Dynamic shape configs
MIN_BATCH = 1
OPT_BATCH = 4
MAX_BATCH = 16
HEIGHT = 224
WIDTH = 224

In [18]:
# ==== STEP 1: Setup TensorRT logger & builder ==== #
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(network_flags)
parser = trt.OnnxParser(network, TRT_LOGGER)

[06/15/2025-07:28:45] [TRT] [I] The logger passed into createInferBuilder differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
[06/15/2025-07:28:45] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 802, GPU 1151 (MiB)
[06/15/2025-07:28:50] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +879, GPU +172, now: CPU 1757, GPU 1323 (MiB)
[06/15/2025-07:28:50] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading


In [19]:
# ==== STEP 2: Parse ONNX model ==== #
if not os.path.isfile(ONNX_PATH):
    raise FileNotFoundError(f"Không tìm thấy file ONNX: {ONNX_PATH}")

with open(ONNX_PATH, 'rb') as f:
    if not parser.parse(f.read()):
        for idx in range(parser.num_errors):
            print(parser.get_error(idx))
        raise RuntimeError("❌ Không parse được ONNX!")

print("✅ Parse ONNX thành công!")

[06/15/2025-07:28:55] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
✅ Parse ONNX thành công!


In [20]:
# ==== STEP 3: Xác định input/output model ==== #
input_tensor = network.get_input(0)
print(f"Input tensor: {input_tensor.name}, shape: {input_tensor.shape}")
output_tensor = network.get_output(0)
print(f"Output tensor: {output_tensor.name}, shape: {output_tensor.shape}")

Input tensor: data, shape: (-1, 3, 224, 224)
Output tensor: resnetv17_dense0_fwd, shape: (-1, 1000)


In [21]:
# ==== STEP 4: Create builder config và set FP16 ==== #
config = builder.create_builder_config()
config.max_workspace_size = WORKSPACE_SIZE
if FP16_MODE:
    config.set_flag(trt.BuilderFlag.FP16)

  config.max_workspace_size = WORKSPACE_SIZE


In [22]:
# ==== STEP 5: Tạo Optimization Profile ==== #
profile = builder.create_optimization_profile()
input_name = input_tensor.name
profile.set_shape(input_name,
                  (MIN_BATCH, 3, HEIGHT, WIDTH),
                  (OPT_BATCH, 3, HEIGHT, WIDTH),
                  (MAX_BATCH, 3, HEIGHT, WIDTH))
config.add_optimization_profile(profile)

0

In [23]:
# ==== STEP 6: Build engine ==== #
print("⚙️ Bắt đầu build TensorRT engine...")
engine = builder.build_engine(network, config)
if engine is None:
    raise RuntimeError("❌ Build engine thất bại!")

⚙️ Bắt đầu build TensorRT engine...
[06/15/2025-07:29:05] [TRT] [I] BuilderFlag::kTF32 is set but hardware does not support TF32. Disabling TF32.
[06/15/2025-07:29:05] [TRT] [I] Graph optimization time: 0.0465545 seconds.
[06/15/2025-07:29:05] [TRT] [I] BuilderFlag::kTF32 is set but hardware does not support TF32. Disabling TF32.
[06/15/2025-07:29:05] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.


  engine = builder.build_engine(network, config)


[06/15/2025-07:29:38] [TRT] [I] Detected 1 inputs and 1 output network tensors.
[06/15/2025-07:29:38] [TRT] [I] Total Host Persistent Memory: 264368
[06/15/2025-07:29:38] [TRT] [I] Total Device Persistent Memory: 155136
[06/15/2025-07:29:38] [TRT] [I] Total Scratch Memory: 2753536
[06/15/2025-07:29:38] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 138 MiB, GPU 172 MiB
[06/15/2025-07:29:38] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 67 steps to complete.
[06/15/2025-07:29:38] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 0.679694ms to assign 4 blocks to 67 nodes requiring 61014016 bytes.
[06/15/2025-07:29:38] [TRT] [I] Total Activation Memory: 61014016
[06/15/2025-07:29:38] [TRT] [W] TensorRT encountered issues when converting weights between types and that could affect accuracy.
[06/15/2025-07:29:38] [TRT] [W] If this is not the desired behavior, please modify the weights or retrain with regularization to ad

In [24]:
# ==== STEP 7: Save engine to file ==== #
with open(ENGINE_PATH, 'wb') as f:
    f.write(engine.serialize())
print(f"✅ Engine đã lưu: {ENGINE_PATH}")


✅ Engine đã lưu: resnet50.engine


# So sánh kết quả của onnx và engine

### ONNX

In [11]:
import onnxruntime as ort
import numpy as np
import time
import pycuda.driver as cuda
import pycuda.autoinit

# Chuẩn bị dữ liệu đầu vào: batch size = 1
input_data = np.random.rand(1, 3, 224, 224).astype(np.float32)

########################################
# ONNX Inference
########################################

# Load model ONNX
onnx_path = "resnet50-v1-12.onnx"
ort_session = ort.InferenceSession(onnx_path, providers=['CPUExecutionProvider'])
input_name = ort_session.get_inputs()[0].name

# Benchmark ONNX
onnx_runs = 100
start_onnx = time.perf_counter()
for _ in range(onnx_runs):
    onnx_output = ort_session.run(None, {input_name: input_data})[0]
end_onnx = time.perf_counter()

onnx_time = (end_onnx - start_onnx) / onnx_runs
print(f"ONNX avg time: {onnx_time * 1000:.3f} ms")

ONNX avg time: 22.516 ms


### TensorRT

In [12]:
import tensorrt as trt
import os
########################################
# TensorRT Inference
########################################

# Load TensorRT Engine
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def load_engine(engine_path):
    with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())


engine_path = "resnet50.engine"
if not os.path.exists(engine_path):
    raise FileNotFoundError(f"Không tìm thấy file {engine_path}")
else:
    print("✅ Đã tìm thấy file engine")
    
engine = load_engine(engine_path)
context = engine.create_execution_context()

✅ Đã tìm thấy file engine
[06/15/2025-07:25:37] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading


In [15]:
# Allocate buffers
h_input = np.array(input_data, dtype=np.float32).ravel()
h_output = np.empty((1, 1000), dtype=np.float32)

d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
bindings = [int(d_input), int(d_output)]

# Warm-up TensorRT trước khi benchmark
for _ in range(10):
    cuda.memcpy_htod(d_input, h_input)
    context.execute_v2(bindings)
    cuda.memcpy_dtoh(h_output, d_output)

# Benchmark TensorRT
trt_runs = 100
start_trt = time.perf_counter()
for _ in range(trt_runs):
    cuda.memcpy_htod(d_input, h_input)
    context.execute_v2(bindings)
    cuda.memcpy_dtoh(h_output, d_output)
end_trt = time.perf_counter()

trt_time = (end_trt - start_trt) / trt_runs
print(f"TensorRT avg time: {trt_time * 1000:.3f} ms")

TensorRT avg time: 2.591 ms


In [16]:
########################################
# So sánh output
########################################

diff = np.abs(onnx_output.flatten() - h_output.flatten())
max_diff = np.max(diff)
mean_diff = np.mean(diff)

print(f"Max difference: {max_diff}")
print(f"Mean difference: {mean_diff}")

Max difference: 0.012323379516601562
Mean difference: 0.0030130245722830296
