In [1]:
from ultralytics import YOLO
import os
import torch
import tensorrt as trt

# PyTorch quantization

In [2]:
import pickle
# Imported under an alias: a later cell in this notebook imports onnxruntime's
# function of the same name, and the two are not interchangeable.
from torch.ao.quantization import quantize_dynamic as torch_quantize_dynamic

# Load the whole checkpoint dict; the network is stored under the 'model' key.
# NOTE(review): torch.load unpickles arbitrary objects - only use it on trusted files.
model_to_quant = torch.load("models/best.pt")

# Dynamic quantization with the default qconfig (qconfig_spec=None).
# torch.qint8 is the dtype that selects the Linear/RNN dynamic configs; the
# previously used torch.quint8 only selects the Embedding/EmbeddingBag
# weight-only configs in PyTorch's implementation, which leaves a model without
# embedding layers untouched.
# NOTE(review): dynamic quantization has no default mapping for Conv2d, so the
# convolution layers shown in the architecture section presumably remain float -
# confirm whether static (post-training) quantization is what is actually needed.
model_to_quant['model'] = torch_quantize_dynamic(
    model_to_quant['model'].eval(),  # switch to inference mode before quantizing
    dtype=torch.qint8,
)

# Write the modified checkpoint dict to the file loaded by the next cell.
torch.save(model_to_quant, 'quant_models/best_int8.pt', pickle_module=pickle)

    import torch
    ckpt = torch.load("model.pt")  # applies to both official and custom models
    torch.save(ckpt, "updated-model.pt")



In [3]:
# Open the checkpoint saved by the quantization cell through the ultralytics YOLO wrapper.
torch_model = YOLO('quant_models/best_int8.pt')

## model size

In [4]:
# On-disk size of the saved checkpoint, converted from bytes to whole kilobytes.
print('size : ' , round(os.stat('quant_models/best_int8.pt').st_size / 1024) , 'KB')

size :  6097 KB


## architecture

In [5]:
# Display the module tree held by the YOLO wrapper (rendered via the cell's last expression).
torch_model.model

DetectionModel(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (2): C2f(
      (cv1): Conv(
        (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (cv2): Conv(
        (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
    

## parameters

In [6]:
# Print only the first parameter tensor of the fused model.
# NOTE(review): fuse() presumably merges Conv+BatchNorm on torch_model itself - confirm
# whether later cells are meant to see the fused model.
first_param = next(iter(torch_model.model.fuse().parameters()), None)
if first_param is not None:
    print('first layer parameters \n\n ' , first_param.data)

Model summary (fused): 168 layers, 3005843 parameters, 0 gradients, 8.1 GFLOPs


first layer parameters 

  tensor([[[[-1.7560e+00, -2.7517e-01,  1.5227e+00],
          [-5.8778e+00,  4.7612e-01,  6.0757e+00],
          [-5.3064e+00, -2.7330e-01,  4.9405e+00]],

         [[-1.9997e+00,  1.6081e-01,  2.2592e+00],
          [-7.5843e+00,  4.6305e-01,  7.1511e+00],
          [-5.7359e+00, -9.6099e-02,  5.3848e+00]],

         [[-7.6133e-01, -3.8930e-01,  1.7014e-01],
          [-3.5140e+00,  3.8510e-01,  2.9463e+00],
          [-2.4908e+00,  3.1998e-01,  2.3619e+00]]],


        [[[ 1.7665e+00,  1.6209e+00,  2.1402e+00],
          [ 1.8290e-01, -2.4301e+00,  6.4909e-01],
          [-2.3331e+00, -1.0133e+01, -1.3649e+00]],

         [[-5.5658e-01, -5.4293e-01,  4.7529e-01],
          [-2.4295e-01, -3.9043e+00, -7.0369e-01],
          [-1.2484e+00, -1.1259e+01, -1.4729e+00]],

         [[ 7.2310e-01, -1.7649e-02,  3.8945e-01],
          [ 1.3176e+00, -1.3491e+00,  3.0392e-01],
          [ 1.4353e+00, -4.9501e+00,  3.9218e-01]]],


        [[[-9.3692e-02, -1.1119e+00, -4

# ONNX quantization

In [7]:
from onnxruntime.quantization import QuantType
from onnxruntime.quantization import quantize_dynamic

# Source ONNX graph and the destination path for its dynamically quantized copy.
model_onnx = 'models/best.onnx'
onnx_quant = 'quant_models/best.quant.onnx'

# Quantize the source graph and write the result to `onnx_quant`.
# NOTE(review): the quantized model is written to disk; the value bound to
# `quantized_model` is presumably None - confirm before relying on it.
quantized_model = quantize_dynamic(model_input=model_onnx, model_output=onnx_quant)

In [8]:
import onnx

# Load the quantized ONNX file back into memory so its graph can be inspected below.
onnx_model = onnx.load('quant_models/best.quant.onnx')

## model size

In [9]:
# On-disk size of the quantized ONNX file, converted from bytes to whole kilobytes.
print('size : ' , round(os.stat('quant_models/best.quant.onnx').st_size / 1024) , 'KB')

size :  3197 KB


## architecture

In [10]:
# Display every node of the quantized graph (the rendered output is long).
onnx_model.graph.node

[input: "images"
output: "images_quantized"
output: "images_scale"
output: "images_zero_point"
name: "images_QuantizeLinear"
op_type: "DynamicQuantizeLinear"
, input: "model.0.conv.bias"
input: "/model.0/conv/Conv_output_0_bias_reshape_shape"
output: "/model.0/conv/Conv_output_0_bias_reshape_output"
op_type: "Reshape"
, input: "model.1.conv.bias"
input: "/model.1/conv/Conv_output_0_bias_reshape_shape"
output: "/model.1/conv/Conv_output_0_bias_reshape_output"
op_type: "Reshape"
, input: "model.12.cv1.conv.bias"
input: "/model.12/cv1/conv/Conv_output_0_bias_reshape_shape"
output: "/model.12/cv1/conv/Conv_output_0_bias_reshape_output"
op_type: "Reshape"
, input: "model.12.cv2.conv.bias"
input: "/model.12/cv2/conv/Conv_output_0_bias_reshape_shape"
output: "/model.12/cv2/conv/Conv_output_0_bias_reshape_output"
op_type: "Reshape"
, input: "model.12.m.0.cv1.conv.bias"
input: "/model.12/m.0/cv1/conv/Conv_output_0_bias_reshape_shape"
output: "/model.12/m.0/cv1/conv/Conv_output_0_bias_reshape_ou

## parameters

In [11]:
# Name of the first initializer, then its raw_data payload as one integer per byte.
# NOTE(review): raw_data is presumably the undecoded byte buffer, so these numbers
# are bytes of the stored tensor rather than its decoded values - confirm.
print(onnx_model.graph.initializer[0].name)
print(list(onnx_model.graph.initializer[0].raw_data))

model.0.conv.bias
[139, 117, 248, 63, 180, 95, 36, 64, 42, 27, 200, 63, 80, 218, 232, 63, 93, 76, 209, 64, 227, 98, 150, 63, 108, 202, 108, 64, 6, 62, 51, 64, 31, 91, 252, 63, 235, 137, 169, 63, 80, 141, 1, 64, 192, 200, 22, 64, 11, 147, 14, 191, 39, 198, 229, 63, 105, 40, 34, 192, 141, 184, 198, 63]


In [12]:
# Name of initializer #85, then its raw_data payload as one integer per byte.
# NOTE(review): bytes are shown as unsigned 0-255 values; if the tensor is stored as
# signed int8, values above 127 presumably correspond to negative weights - confirm.
print(onnx_model.graph.initializer[85].name)
print(list(onnx_model.graph.initializer[85].raw_data))

model.1.conv.weight_quantized
[249, 244, 232, 250, 5, 34, 4, 3, 245, 249, 0, 16, 12, 4, 244, 252, 250, 253, 3, 1, 252, 247, 245, 2, 4, 7, 1, 242, 4, 19, 13, 18, 216, 243, 240, 17, 0, 234, 24, 1, 18, 226, 253, 253, 11, 2, 255, 1, 255, 249, 0, 0, 3, 3, 250, 14, 252, 2, 226, 22, 3, 17, 236, 253, 247, 1, 6, 14, 10, 1, 252, 249, 14, 251, 230, 234, 234, 43, 11, 13, 246, 252, 2, 1, 0, 255, 254, 5, 2, 1, 242, 242, 6, 14, 27, 1, 0, 243, 242, 5, 220, 247, 3, 74, 23, 245, 226, 0, 249, 7, 3, 17, 232, 4, 2, 15, 1, 253, 9, 21, 3, 255, 228, 250, 252, 5, 255, 14, 10, 254, 254, 255, 251, 250, 246, 0, 2, 251, 0, 253, 2, 0, 0, 0, 4, 17, 7, 28, 255, 245, 236, 225, 254, 24, 243, 239, 248, 233, 0, 247, 15, 15, 246, 1, 4, 252, 12, 0, 5, 251, 246, 250, 250, 253, 219, 5, 2, 4, 34, 255, 14, 252, 11, 23, 242, 240, 212, 35, 3, 251, 1, 3, 250, 3, 2, 4, 0, 249, 1, 254, 251, 251, 251, 5, 13, 247, 3, 11, 2, 252, 6, 239, 248, 234, 7, 23, 0, 255, 3, 19, 255, 248, 4, 219, 6, 254, 1, 254, 253, 1, 2, 3, 2, 251, 9, 2, 5, 2

# TensorRT quantization

In [13]:
# Import the builder module under an alias so that the name `engine` is not shared
# between this module and the deserialized engine object created later in the notebook.
from TensorRT import engine as trt_engine

# Fall back to CPU when CUDA is unavailable.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build an FP16 engine from the exported ONNX model.
engine_build = trt_engine.EngineBuilder('models/best.onnx', device)
# NOTE(review): seg=True is set although the build log reports a single output of
# shape (1, 5, 8400) - confirm this flag is intended for this model.
engine_build.seg = True
engine_build.build(
    fp16=True,
    input_shape=[1, 3, 640, 640],
    iou_thres=0.65,
    conf_thres=0.25,
    topk=100,
)

[09/27/2023-17:24:16] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[09/27/2023-17:24:16] [TRT] [W] input "images" with shape: (1, 3, 640, 640) dtype: DataType.FLOAT
[09/27/2023-17:24:16] [TRT] [W] output "output0" with shape: (1, 5, 8400) dtype: DataType.FLOAT
[09/27/2023-17:28:48] [TRT] [W] TensorRT encountered issues when converting weights between types and that could affect accuracy.
[09/27/2023-17:28:48] [TRT] [W] If this is not the desired behavior, please modify the weights or retrain with regularization to adjust the magnitude of the weights.
[09/27/2023-17:28:48] [TRT] [W] Check verbose logs for the list of affected weights.
[09/27/2023-17:28:48] [TRT] [W] - 55 weights are affected by this issue: Detected subnormal FP16 values.
[09/27/2023-17:28:48] [TRT] [W] - 2 weights are affected by this issue: Detected values less than smallest positive FP16 subnor

## model size

In [14]:
# On-disk size of the serialized TensorRT engine, converted from bytes to whole kilobytes.
print('size : ' , round(os.stat('models/best.engine').st_size / 1024) , 'KB')

size :  8518 KB


In [15]:
# Create a runtime with a verbose logger and load the serialized engine from disk.
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
runtime = trt.Runtime(TRT_LOGGER)

# Only the file read needs the open handle; deserialize after it is closed.
with open('models/best.engine', 'rb') as f:
    engine_bytes = f.read()

engine = runtime.deserialize_cuda_engine(engine_bytes)
# Deserialization signals failure by returning None; fail here with a clear
# message instead of an AttributeError on the next line.
if engine is None:
    raise RuntimeError("Failed to deserialize TensorRT engine from 'models/best.engine'")

context = engine.create_execution_context()

[09/27/2023-17:28:48] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
[09/27/2023-17:28:48] [TRT] [I] Loaded engine size: 8 MiB
[09/27/2023-17:28:48] [TRT] [V] Deserialization required 22725 microseconds.
[09/27/2023-17:28:48] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +5, now: CPU 0, GPU 5 (MiB)
[09/27/2023-17:28:48] [TRT] [V] Total per-runner device persistent memory is 0
[09/27/2023-17:28:48] [TRT] [V] Total per-runner host persistent memory is 359488
[09/27/2023-17:28:48] [TRT] [V] Allocated activation device memory of size 10035200
[09/27/2023-17:28:48] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +10, now: CPU 0, GPU 15 (MiB)
[09/27/2023-17:28:48] [TRT] [V] CUDA lazy loading is enabled.


In [16]:
# Collect the names of the engine's input/output tensors.
# The index-based binding API (get_binding_name / num_bindings) is deprecated in
# newer TensorRT releases, so use the tensor API when the engine exposes it and
# fall back to the binding API on older versions.
if hasattr(engine, 'num_io_tensors'):
    names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
else:
    names = [engine.get_binding_name(i) for i in range(engine.num_bindings)]
names

  names = [engine.get_binding_name(i) for i in range(engine.num_bindings)]


['images', 'output0']

In [17]:
# Report the shapes of the first (input) and second (output) engine tensors.
# get_binding_shape is deprecated in newer TensorRT releases; use the name-based
# tensor API when available and keep the index-based call as a fallback.
if hasattr(engine, 'get_tensor_shape'):
    input_shape = engine.get_tensor_shape(engine.get_tensor_name(0))
    output_shape = engine.get_tensor_shape(engine.get_tensor_name(1))
else:
    input_shape = engine.get_binding_shape(0)
    output_shape = engine.get_binding_shape(1)

print('input shape : ' , tuple(input_shape))
print('output shape : ' , tuple(output_shape) )

input shape :  (1, 3, 640, 640)
output shape :  (1, 5, 8400)


  print('input shape : ' , tuple(engine.get_binding_shape(0)))
  print('output shape : ' , tuple(engine.get_binding_shape(1)) )


# TensorFlow quantization

In [18]:
import tensorflow as tf

In [19]:
# Load the saved Keras model from its .h5 file; it is the input to quantize_model below.
cifar10 = tf.keras.models.load_model('models/cifar10.h5')

In [20]:
import tensorflow_model_optimization as tfmot

# Wrap the loaded Keras model with quantize layers ("q_aware" = quantization aware).
quantize_model = tfmot.quantization.keras.quantize_model
q_aware_model = quantize_model(cifar10)

# The model returned by `quantize_model` must be compiled again before use.
qat_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
q_aware_model.compile(optimizer='adam', loss=qat_loss, metrics=['accuracy'])

# NOTE(review): the model is only wrapped and compiled here; no fit() call is
# visible in this notebook, so the quantization-aware weights are presumably not
# fine-tuned - confirm.
q_aware_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 quantize_layer (QuantizeLa  (None, 32, 32, 3)         3         
 yer)                                                            
                                                                 
 quant_conv2d (QuantizeWrap  (None, 32, 32, 32)        963       
 perV2)                                                          
                                                                 
 quant_max_pooling2d (Quant  (None, 16, 16, 32)        1         
 izeWrapperV2)                                                   
                                                                 
 quant_conv2d_1 (QuantizeWr  (None, 16, 16, 64)        18627     
 apperV2)                                                        
                                                                 
 quant_max_pooling2d_1 (Qua  (None, 8, 8, 64)          1

In [21]:
# Write the quantization-aware model to a .h5 file; its size is compared with the original below.
q_aware_model.save('quant_models/quant_cifar10.h5')

  saving_api.save_model(


## model size

In [22]:
# Compare the on-disk sizes (whole kilobytes) of the original and quantization-aware models.
for label, path in (
    ('cifar10 size : ', 'models/cifar10.h5'),
    ('quant_cifar10 size : ', 'quant_models/quant_cifar10.h5'),
):
    print(label , round(os.path.getsize(path) / 1024) , 'KB')

cifar10 size :  6431 KB
quant_cifar10 size :  2179 KB


In [23]:
# Display all weight values of the quantization-aware model (the rendered output is long).
q_aware_model.get_weights()

[0.0,
 0.0,
 -1,
 array([[[[   -0.10984,     0.16286,     0.10583,      0.1344,    0.052876,   -0.059854,      0.1234,   -0.048416,     0.14024,    -0.12499,   -0.087416,    -0.01179,   -0.060344,    -0.12102,     0.11035,   -0.054835,     0.10827,  -0.0024675,    0.083012,    -0.18948,    -0.15694,   -0.020592,    -0.17448,
             0.0057317,  0.00012089,   -0.019303,   -0.044793,    -0.09317,    -0.17016,    -0.14144,    0.040612,   -0.027489],
          [  -0.029951,  -0.0088726,     -0.0956,     0.14231,    0.072619,      0.1785,     0.14812,     0.13061,   -0.041349,    0.075376,    0.049858,    -0.19289,   -0.096654,    -0.10499,  -0.0057764,    -0.10438,     0.15601,    -0.13745,    0.077967,    -0.14709,    -0.18169,    0.089332,    -0.17634,
               0.14177,   -0.055198,     0.10172,     0.15777,    0.061197,   -0.081449, -0.00063092,    -0.11247,    0.019658],
          [   -0.14082,     0.15007,   -0.094471,    0.081759,     0.03093,    0.025521,   -0.062831,   -