# Convert model thành engine file bằng TensorRT

## Cách 1: Dùng trtexec

In [None]:
# !wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-12.onnx

In [None]:
### 1️⃣ Kiểm tra file ONNX hợp lệ

import onnx
import os

onnx_path = "resnet50-v1-12.onnx"

# Kiểm tra file có tồn tại không
if not os.path.isfile(onnx_path):
    raise FileNotFoundError(f"Không tìm thấy file {onnx_path}")

# Load và kiểm tra model
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)
print("✅ ONNX model hợp lệ!")

In [None]:
### 2️⃣ Kiểm tra trtexec có sẵn

!which trtexec || echo "⚠️ trtexec chưa được cài đặt hoặc không nằm trong PATH."

In [None]:
### 3️⃣ Convert ONNX sang TensorRT Engine
# --onnx: Chỉ định file onnx.
# --saveEngine: Đầu ra file engine.
# --fp16: Bật mixed precision (FP16), tăng tốc độ nhưng vẫn giữ độ chính xác khá cao.
# Có thể thêm --int8 nếu đã calibrate mô hình.

!trtexec --onnx=resnet50-v1-12.onnx --saveEngine=resnet50.engine --fp16

In [None]:
### 4️⃣ Kiểm tra file engine sau khi build

engine_path = "resnet50.engine"

if os.path.isfile(engine_path):
    print("✅ TensorRT engine đã được tạo thành công!")
else:
    raise FileNotFoundError("❌ TensorRT engine chưa được tạo.")

In [None]:
### 5️⃣ Benchmark TensorRT Engine
# --loadEngine: Chạy inference benchmark trên engine đã build
# --iterations=100: Chạy 100 lần để đánh giá tốc độ

!trtexec --loadEngine=resnet50.engine --iterations=100


✅ ONNX model hợp lệ!
/opt/tensorrt/bin/trtexec
&&&& RUNNING TensorRT.trtexec [TensorRT v8601] # trtexec --onnx=resnet50-v1-12.onnx --saveEngine=resnet50.engine --fp16
[06/14/2025-08:27:49] [I] === Model Options ===
[06/14/2025-08:27:49] [I] Format: ONNX
[06/14/2025-08:27:49] [I] Model: resnet50-v1-12.onnx
[06/14/2025-08:27:49] [I] Output:
[06/14/2025-08:27:49] [I] === Build Options ===
[06/14/2025-08:27:49] [I] Max batch: explicit batch
[06/14/2025-08:27:49] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[06/14/2025-08:27:49] [I] minTiming: 1
[06/14/2025-08:27:49] [I] avgTiming: 8
[06/14/2025-08:27:49] [I] Precision: FP32+FP16
[06/14/2025-08:27:49] [I] LayerPrecisions: 
[06/14/2025-08:27:49] [I] Layer Device Types: 
[06/14/2025-08:27:49] [I] Calibration: 
[06/14/2025-08:27:49] [I] Refit: Disabled
[06/14/2025-08:27:49] [I] Version Compatible: Disabled
[06/14/2025-08:27:49] [I] TensorRT runtime: full
[06/14/2025-08:27:49] [I] Lean DL

# Cách 2: Dùng TensorRT Python API

In [8]:
import tensorrt as trt
import os

# ==== CONFIG ==== #
ONNX_PATH = "resnet50-v1-12.onnx"
ENGINE_PATH = "resnet50.engine"
FP16_MODE = True
WORKSPACE_SIZE = 1 << 30  # 1GB

# Dynamic shape configs
MIN_BATCH = 1
OPT_BATCH = 4
MAX_BATCH = 16
HEIGHT = 224
WIDTH = 224

In [9]:
# ==== STEP 1: Setup TensorRT logger & builder ==== #
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(network_flags)
parser = trt.OnnxParser(network, TRT_LOGGER)

[06/14/2025-09:31:54] [TRT] [I] [MemUsageChange] Init CUDA: CPU +518, GPU +0, now: CPU 679, GPU 1093 (MiB)
[06/14/2025-09:31:59] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +883, GPU +172, now: CPU 1638, GPU 1265 (MiB)
[06/14/2025-09:31:59] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading


In [10]:
# ==== STEP 2: Parse ONNX model ==== #
if not os.path.isfile(ONNX_PATH):
    raise FileNotFoundError(f"Không tìm thấy file ONNX: {ONNX_PATH}")

with open(ONNX_PATH, 'rb') as f:
    if not parser.parse(f.read()):
        for idx in range(parser.num_errors):
            print(parser.get_error(idx))
        raise RuntimeError("❌ Không parse được ONNX!")

print("✅ Parse ONNX thành công!")

[06/14/2025-09:31:59] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
✅ Parse ONNX thành công!


In [11]:
# ==== STEP 3: Xác định input/output model ==== #
input_tensor = network.get_input(0)
print(f"Input tensor: {input_tensor.name}, shape: {input_tensor.shape}")
output_tensor = network.get_output(0)
print(f"Output tensor: {output_tensor.name}, shape: {output_tensor.shape}")

Input tensor: data, shape: (-1, 3, 224, 224)
Output tensor: resnetv17_dense0_fwd, shape: (-1, 1000)


In [12]:
# ==== STEP 4: Create builder config và set FP16 ==== #
config = builder.create_builder_config()
config.max_workspace_size = WORKSPACE_SIZE
if FP16_MODE:
    config.set_flag(trt.BuilderFlag.FP16)

  config.max_workspace_size = WORKSPACE_SIZE


In [13]:
# ==== STEP 5: Tạo Optimization Profile ==== #
profile = builder.create_optimization_profile()
input_name = input_tensor.name
profile.set_shape(input_name,
                  (MIN_BATCH, 3, HEIGHT, WIDTH),
                  (OPT_BATCH, 3, HEIGHT, WIDTH),
                  (MAX_BATCH, 3, HEIGHT, WIDTH))
config.add_optimization_profile(profile)

0

In [14]:
# ==== STEP 6: Build engine ==== #
print("⚙️ Bắt đầu build TensorRT engine...")
engine = builder.build_engine(network, config)
if engine is None:
    raise RuntimeError("❌ Build engine thất bại!")

⚙️ Bắt đầu build TensorRT engine...
[06/14/2025-09:34:31] [TRT] [I] BuilderFlag::kTF32 is set but hardware does not support TF32. Disabling TF32.
[06/14/2025-09:34:31] [TRT] [I] Graph optimization time: 0.0477496 seconds.
[06/14/2025-09:34:31] [TRT] [I] BuilderFlag::kTF32 is set but hardware does not support TF32. Disabling TF32.
[06/14/2025-09:34:31] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.


  engine = builder.build_engine(network, config)


[06/14/2025-09:35:05] [TRT] [I] Detected 1 inputs and 1 output network tensors.
[06/14/2025-09:35:06] [TRT] [I] Total Host Persistent Memory: 270848
[06/14/2025-09:35:06] [TRT] [I] Total Device Persistent Memory: 155136
[06/14/2025-09:35:06] [TRT] [I] Total Scratch Memory: 2753536
[06/14/2025-09:35:06] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 138 MiB, GPU 120 MiB
[06/14/2025-09:35:06] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 67 steps to complete.
[06/14/2025-09:35:06] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 0.792508ms to assign 4 blocks to 67 nodes requiring 61014016 bytes.
[06/14/2025-09:35:06] [TRT] [I] Total Activation Memory: 61014016
[06/14/2025-09:35:06] [TRT] [W] TensorRT encountered issues when converting weights between types and that could affect accuracy.
[06/14/2025-09:35:06] [TRT] [W] If this is not the desired behavior, please modify the weights or retrain with regularization to ad

In [15]:
# ==== STEP 7: Save engine to file ==== #
with open(ENGINE_PATH, 'wb') as f:
    f.write(engine.serialize())
print(f"✅ Engine đã lưu: {ENGINE_PATH}")


✅ Engine đã lưu: resnet50.engine


# So sánh kết quả của onnx và engine

In [20]:
import onnxruntime as ort
import numpy as np
import time

# Load ONNX model
session = ort.InferenceSession("resnet50-v1-12.onnx", providers=["CUDAExecutionProvider"])

# Tạo input giả
input_name = session.get_inputs()[0].name
dummy_input = np.random.rand(1,3,224,224).astype(np.float32)

# Warmup
for _ in range(10):
    session.run(None, {input_name: dummy_input})

# Benchmark
start = time.time()
for _ in range(100):
    session.run(None, {input_name: dummy_input})
end = time.time()

latency_onnx = (end - start) / 100 * 1000
print(f"ONNX Latency: {latency_onnx:.3f} ms")


ONNX Latency: 4.421 ms
