# Model Deployment for MNIST
- Training Process
    - DataSet Preparation
    - Data Preprocessing
    - Model Construction
    - Training
    - Testing
    - Model Saving & Loading
- Deployment Process
    - to ONNX
    - Inference Engine
        - ONNX Runtime
        - TensorRT running

## Training Process

### Dataset Preparation
- MNIST
- Preprocessing -> transforms.Compose()
- DataLoader

In [4]:
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Preprocessing pipeline: convert each image to a tensor, then normalize the
# single grayscale channel with mean 0.5 and std 0.5.
data_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),
])

# Training split: downloaded into ./data if missing, reshuffled every epoch.
trainset = datasets.MNIST(root='./data', train=True, transform=data_tf, download=True)
train_loader = DataLoader(dataset=trainset, batch_size=128, shuffle=True)

# Test split: read from the same root directory and iterated in a fixed order.
testset = datasets.MNIST(root='./data', train=False, transform=data_tf)
test_loader = DataLoader(dataset=testset, batch_size=128, shuffle=False)


In [7]:
# Look at the first mini-batch only, to confirm the tensor layout.
for img, label in train_loader:
    print(img.size())
    break

torch.Size([128, 1, 28, 28])


### Model Construction
- Output size of a convolution / pooling layer: N = floor((Width - Kernel_size + 2*Padding) / Stride) + 1

In [5]:
import torch
from torch import nn

In [6]:
# Define model structure
class ConvBlock(nn.Module):
    """3x3 convolution followed by ReLU and 2x2 max pooling.

    Args:
        in_dim: Number of input channels of the convolution.
        out_dim: Number of output channels of the convolution.
        stride: Stride of the convolution.
        padding: Padding applied by the convolution.
    """

    def __init__(self, in_dim, out_dim, stride=1, padding=1):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_dim,
            out_channels=out_dim,
            kernel_size=3,
            stride=stride,
            padding=padding,
        )
        self.relu = nn.ReLU()
        # Kernel 2 with stride 2: height and width are halved (rounded down).
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Apply conv -> ReLU -> max pool to ``x`` and return the result."""
        return self.maxpool(self.relu(self.conv(x)))

class MyModel(nn.Module):
    """Small CNN classifier: two ConvBlocks followed by two linear layers.

    The first linear layer expects 64*7*7 features, which corresponds to a
    single-channel 28x28 input after the two pooling stages (28 -> 14 -> 7).
    The forward pass returns 10 raw scores per sample (no softmax applied).
    """

    def __init__(self):
        super().__init__()
        # Shape notes are (N, C, H, W) for a single-channel 28x28 input.
        self.convblock1 = ConvBlock(1, 32)   # (N, 1, 28, 28) -> (N, 32, 14, 14)
        self.convblock2 = ConvBlock(32, 64)  # (N, 32, 14, 14) -> (N, 64, 7, 7)
        self.flatten = nn.Flatten()          # (N, 64, 7, 7) -> (N, 64*7*7)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 10)

    def forward(self, img):
        """Return the 10 class scores for every image in the batch ``img``."""
        features = self.flatten(self.convblock2(self.convblock1(img)))
        hidden = self.relu(self.fc1(features))
        return self.fc2(hidden)

In [10]:
# Build the network and place it on the GPU when one is available;
# otherwise it stays on the CPU.
model = MyModel()
model = model.cuda() if torch.cuda.is_available() else model

In [11]:
# Smoke test: push the first training sample through the untrained network.
# The sample is moved to whatever device the model parameters live on, so the
# cell also runs when CUDA is unavailable and the model stayed on the CPU.
x = trainset[0][0].view(1, 1, 28, 28).to(next(model.parameters()).device)
model(x)

tensor([[ 0.1359, -0.0311, -0.0957,  0.0146,  0.1360, -0.0667,  0.0667, -0.1169,
          0.0029, -0.1248]], device='cuda:0', grad_fn=<AddmmBackward0>)

### Training
- Loss
- Optimizer

In [12]:
from torch import optim

# Objective: cross-entropy between the model outputs and the target labels.
Loss = nn.CrossEntropyLoss()

# Adam updates every model parameter with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [13]:
num_epoches = 100

# Decide once whether the mini-batches have to be moved to the GPU.
use_cuda = torch.cuda.is_available()

for epoch in range(num_epoches):
    for batch_idx, batch in enumerate(train_loader):
        imgs, labels = batch
        if use_cuda:
            imgs, labels = imgs.cuda(), labels.cuda()

        # Forward pass: predictions and their loss for this mini-batch.
        outputs = model(imgs)
        loss = Loss(outputs, labels)

        # Backward pass: clear old gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Only the loss of the last mini-batch of the epoch is reported.
    print('epoch: {}, loss: {}'.format(epoch, loss.item()))


epoch: 0, loss: 0.11541032046079636
epoch: 1, loss: 0.0780627653002739
epoch: 2, loss: 0.028133006766438484
epoch: 3, loss: 0.08547574281692505
epoch: 4, loss: 0.013079293072223663
epoch: 5, loss: 0.015158281661570072
epoch: 6, loss: 0.02131212316453457
epoch: 7, loss: 0.03608933463692665
epoch: 8, loss: 0.008750793524086475
epoch: 9, loss: 0.0016039339825510979
epoch: 10, loss: 0.0023217021953314543
epoch: 11, loss: 0.0014150800416246057
epoch: 12, loss: 0.028946593403816223
epoch: 13, loss: 4.979946606908925e-05
epoch: 14, loss: 5.748984676756663e-06
epoch: 15, loss: 0.0003072724211961031
epoch: 16, loss: 0.007224217057228088
epoch: 17, loss: 0.0005629609222523868
epoch: 18, loss: 0.0013020599726587534
epoch: 19, loss: 0.002345251850783825
epoch: 20, loss: 0.00010087257396662608
epoch: 21, loss: 4.92084764118772e-05
epoch: 22, loss: 0.0004791969258803874
epoch: 23, loss: 1.950174555531703e-05
epoch: 24, loss: 2.3306007278733887e-05
epoch: 25, loss: 5.02959119330626e-05
epoch: 26, los

### Testing

In [14]:
# Running counters over the whole test set.
total = 0
correct = 0
use_cuda = torch.cuda.is_available()

# No gradients are needed while measuring accuracy.
with torch.no_grad():
    for imgs, labels in test_loader:
        if use_cuda:
            imgs, labels = imgs.cuda(), labels.cuda()

        outputs = model(imgs)
        # Predicted class = index of the largest score along dim 1.
        predicted = torch.max(outputs, dim=1)[1]
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print('accuracy: {}'.format(accuracy))

accuracy: 99.29


### Model Saving

In [15]:
# Persist only the model's state dict (its parameters) to model_ckpt.pt.
torch.save(model.state_dict(), 'model_ckpt.pt')

### Model Loading

In [8]:
# Create a fresh, untrained instance of the model
model = MyModel()

# Load the trained parameters. map_location='cpu' allows a checkpoint that
# was saved from GPU tensors to be restored on a machine without CUDA.
state_dict = torch.load('model_ckpt.pt', map_location='cpu')
model.load_state_dict(state_dict)

# Move to the GPU only when one is available, as done when the model was
# first created for training.
if torch.cuda.is_available():
    model = model.cuda()

In [10]:
model.eval()
# Classify the first training sample. The input follows the model's device
# instead of assuming CUDA, and no gradients are tracked for a prediction.
sample = trainset[0][0].view(1, 1, 28, 28).to(next(model.parameters()).device)
with torch.no_grad():
    opt = model(sample)
_, predicted = torch.max(opt, dim=1)
print(predicted.item())

5


## Deployment Process

### Model Conversion
- torch to onnx
- onnx to engine

#### Pytorch To ONNX Model
- The exported ONNX model can be inspected at netron.app

In [1]:
import torch
from torch import nn

# Define model structure
class ConvBlock(nn.Module):
    """3x3 convolution followed by ReLU and 2x2 max pooling.

    Args:
        in_dim: Number of input channels of the convolution.
        out_dim: Number of output channels of the convolution.
        stride: Stride of the convolution.
        padding: Padding applied by the convolution.
    """

    def __init__(self, in_dim, out_dim, stride=1, padding=1):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_dim,
            out_channels=out_dim,
            kernel_size=3,
            stride=stride,
            padding=padding,
        )
        self.relu = nn.ReLU()
        # Kernel 2 with stride 2: height and width are halved (rounded down).
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Apply conv -> ReLU -> max pool to ``x`` and return the result."""
        return self.maxpool(self.relu(self.conv(x)))

class MyModel(nn.Module):
    """Small CNN classifier: two ConvBlocks followed by two linear layers.

    The first linear layer expects 64*7*7 features, which corresponds to a
    single-channel 28x28 input after the two pooling stages (28 -> 14 -> 7).
    The forward pass returns 10 raw scores per sample (no softmax applied).
    """

    def __init__(self):
        super().__init__()
        # Shape notes are (N, C, H, W) for a single-channel 28x28 input.
        self.convblock1 = ConvBlock(1, 32)   # (N, 1, 28, 28) -> (N, 32, 14, 14)
        self.convblock2 = ConvBlock(32, 64)  # (N, 32, 14, 14) -> (N, 64, 7, 7)
        self.flatten = nn.Flatten()          # (N, 64, 7, 7) -> (N, 64*7*7)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 10)

    def forward(self, img):
        """Return the 10 class scores for every image in the batch ``img``."""
        features = self.flatten(self.convblock2(self.convblock1(img)))
        hidden = self.relu(self.fc1(features))
        return self.fc2(hidden)

# Create an instance of the model
model = MyModel()

# Load the trained parameters onto the CPU first, so the checkpoint can be
# read regardless of the device it was saved from, then use the GPU only
# when one is available.
model.load_state_dict(torch.load('model_ckpt.pt', map_location='cpu'))
if torch.cuda.is_available():
    model = model.cuda()

model.eval()  # switch the model to evaluation mode before exporting it

MyModel(
  (convblock1): ConvBlock(
    (conv): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu): ReLU()
    (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (convblock2): ConvBlock(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu): ReLU()
    (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=3136, out_features=128, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)

In [2]:
# Export the PyTorch model to ONNX

# Dummy input handed to the exporter. It is placed on the same device as the
# model parameters, so the export does not require CUDA to be present.
x = torch.rand(size=(1, 1, 28, 28), dtype=torch.float32).to(next(model.parameters()).device)
torch.onnx.export(  # the resulting .onnx file can be opened with netron.app
    model,
    x,
    f='my_model.onnx',
    input_names=['input'],
    output_names=['output'],
    # No dynamic_axes are passed, so the exported input shape is static.
    opset_version=11
)

verbose: False, log level: Level.ERROR



#### ONNX Model to Engine
- Known issue: calling `profile.set_shape` on the input raises an error, because the ONNX model was exported with a static input shape (the input has no dynamic dimensions).

In [5]:
import tensorrt as trt

In [4]:
write_engine = True  # Write the serialized engine to engine_file_path after building
onnx_file_path = 'my_model.onnx'  # ONNX model that is parsed to build the engine
engine_file_path = 'engine.trt'  # Destination file for the serialized engine
bUseFP16Mode = True  # FP16: only the builder flag has to be set
bUseINT8Mode = False  # INT8: needs a calibrator in addition to the builder flag

In [16]:
TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(TRT_LOGGER)

# Create the network definition in explicit-batch mode
explicit_mode = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(explicit_mode)

# An optimization profile is mandatory only when the network has dynamic
# input tensors; no shapes are set on it here.
profile = builder.create_optimization_profile()

# Builder configuration: controls how the builder creates the engine
config = builder.create_builder_config()

# Precision: FP32 unless the FP16 / INT8 flags are enabled
if bUseFP16Mode:
    config.set_flag(trt.BuilderFlag.FP16)
if bUseINT8Mode:
    config.set_flag(trt.BuilderFlag.INT8)
    #  config.int8_calibrator = calibrator.MyCalibrator()

# Parse the ONNX file into the network definition
parser = trt.OnnxParser(network, TRT_LOGGER)

# The handle is called onnx_file so that it does not overwrite the PyTorch
# `model` variable with a (closed) file object.
with open(onnx_file_path, 'rb') as onnx_file:
    parsed = parser.parse(onnx_file.read())
if not parsed:
    print('Failed\n')
    for error in range(parser.num_errors):
        print(parser.get_error(error))
    # Raise instead of exit(): exit() stops the whole interpreter / kernel
    # without signalling an error.
    raise RuntimeError('Failed to parse ONNX file: {}'.format(onnx_file_path))

# inputTensor = network.get_input(0)
# profile.set_shape(inputTensor.name, [1,1,28,28],[1,1,28,28],[4,1,28,28])
config.add_optimization_profile(profile)
# network.unmark_output(network.get_output(0))

engine = builder.build_serialized_network(network, config)
# A failed build yields None; stop here rather than writing None to the file.
if engine is None:
    raise RuntimeError('Failed to build the TensorRT engine')
if write_engine:
    with open(engine_file_path, 'wb') as f:
        f.write(engine)



### Inference

In [1]:
import tensorrt as trt

TRT_LOGGER = trt.Logger()
runtime = trt.Runtime(TRT_LOGGER)

# Read the serialized engine from disk, then deserialize it with the runtime.
with open('engine.trt', 'rb') as f:
    serialized_engine = f.read()
engine = runtime.deserialize_cuda_engine(serialized_engine)

In [12]:
# Collect the names of all I/O tensors of the engine and count the inputs.
nIO = engine.num_io_tensors
lTensorName = [engine.get_tensor_name(idx) for idx in range(nIO)]
nInput = sum(
    1
    for tensor_name in lTensorName
    if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT
)

print(nIO, lTensorName, nInput, sep='\n')

2
['input', 'output']
1


In [13]:
# Create an execution context from the deserialized engine.
context = engine.create_execution_context()
# Set the shape of the first I/O tensor (named 'input' in the listing above)
# to a single 1x28x28 image; the call's return value is shown as cell output.
context.set_input_shape(lTensorName[0], [1, 1, 28, 28])

[06/12/2023-19:10:26] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading


True

#### DataPreprocessing

In [14]:
# Achieved by PIL
import torchvision.transforms as transforms
from PIL import Image

# The pipeline has to mirror the training preprocessing, which applies
# ToTensor followed by Normalize([0.5], [0.5]); without the normalization the
# network receives inputs on a different scale than it was trained on.
# NOTE(review): the sample image appears to have a white background, while
# MNIST digits are light on dark -- an inversion step may be needed; confirm.
transform = transforms.Compose(
    [transforms.Grayscale(),
     transforms.Resize((28, 28)),
     transforms.ToTensor(),
     transforms.Normalize([0.5], [0.5]),]
)

def preprocess(img_path):
    """Load an image file and return it as a tensor of shape (1, 1, 28, 28).

    The image is converted to single-channel grayscale, resized to 28x28,
    turned into a tensor and normalized with mean 0.5 and std 0.5.

    Args:
        img_path: Path of the image file to load.

    Returns:
        The preprocessed image with a leading batch dimension of 1.
    """
    img = Image.open(img_path).convert('L')
    tensor_image = transform(img)
    return tensor_image.view([1, 1, 28, 28])

# Preprocess the sample image, report its dtype and shape, then convert it
# to a NumPy array for the TensorRT buffers.
img_tensor = preprocess('./data/Test/3.jpg')
print(img_tensor.dtype)
print(img_tensor.shape)
img_in = img_tensor.numpy()

torch.float32
torch.Size([1, 1, 28, 28])


In [None]:
# Achieved by cv2

#### buffer management

In [15]:
import numpy as np
from cuda import cudart

# Host buffers: the preprocessed image first, then one uninitialized array per
# output tensor. This layout assumes the engine has exactly one input.
bufferH = [np.ascontiguousarray(img_in)]
for name in lTensorName[nInput:]:
    out_dtype = trt.nptype(engine.get_tensor_dtype(name))
    bufferH.append(np.empty(context.get_tensor_shape(name), dtype=out_dtype))

# Device buffers, one per host buffer. They are released in the `finally`
# clause so that a failure part-way through does not leak GPU memory.
bufferD = []
try:
    for hostBuf in bufferH:
        err, devPtr = cudart.cudaMalloc(hostBuf.nbytes)
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError('cudaMalloc failed: {}'.format(err))
        bufferD.append(devPtr)

    # Copy the input data from host to device.
    for hostBuf, devPtr in zip(bufferH[:nInput], bufferD[:nInput]):
        cudart.cudaMemcpy(devPtr, hostBuf.ctypes.data, hostBuf.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    # Tell the context where each I/O tensor lives on the device.
    for name, devPtr in zip(lTensorName, bufferD):
        context.set_tensor_address(name, int(devPtr))

    # Enqueue inference on stream 0; a False result means it was not enqueued.
    if not context.execute_async_v3(0):
        raise RuntimeError('TensorRT inference failed')

    # Copy the results from device back to host.
    for hostBuf, devPtr in zip(bufferH[nInput:], bufferD[nInput:]):
        cudart.cudaMemcpy(hostBuf.ctypes.data, devPtr, hostBuf.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    for i, name in enumerate(lTensorName):
        print(name)
        print(bufferH[i])
finally:
    for b in bufferD:
        cudart.cudaFree(b)

print("Succeeded running model in TensorRT!")

input
[[[[1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.        ]
   [1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.        ]
   [1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.        ]
   [1.         1.         1.         1.         1.         1.
    1.         1.         1.         1.  

In [23]:
# Predicted class: index of the largest score in the first output tensor
# (single image, so row 0). Indexing with nInput avoids depending on a loop
# variable left over from a previous cell.
int(np.argmax(bufferH[nInput][0]))


3