In [1]:
import os
import time

import cv2
import onnx
import torch
import numpy as np
import openvino as ov
import onnxruntime as ort
import matplotlib.pyplot as plt 

from typing import List, Tuple
from tqdm.notebook import tqdm

from torchmetrics.detection import MeanAveragePrecision
from transformers import DetrImageProcessor, DetrForObjectDetection

import utils

Загрузка данных

In [2]:
# Every dataset location below hangs off this single VOC2012 root directory.
voc_root = 'D:\\datasets\\VOC2012\\'

# Split list files (train.txt / val.txt) live under ImageSets\Main.
image_sets_path = voc_root + 'ImageSets\\Main\\'
train_path, val_path = (
    os.path.join(image_sets_path, split_file)
    for split_file in ('train.txt', 'val.txt')
)

# Directories passed to the evaluation helpers as image and annotation roots.
images_path = voc_root + 'JPEGImages\\'
annots_path = voc_root + 'Annotations\\'

In [3]:
# Read the entries listed in each split file.
train_images = utils.read_set(train_path)
val_images = utils.read_set(val_path)

# Leakage check: the two splits must not share any entry. A bare
# `set(...) & set(...)` expression in the middle of a cell is evaluated and
# thrown away, so the check is made explicit here.
split_overlap = set(train_images) & set(val_images)
assert not split_overlap, (
    f'{len(split_overlap)} entries appear in both train and val splits'
)

print(f'train: {len(train_images)}')
print(f'val: {len(val_images)}')

train: 5717
val: 5823


Загрузка базовой модели PyTorch

In [4]:
# Both the image processor and the detector come from the same pretrained checkpoint.
detr_checkpoint = "facebook/detr-resnet-50"

processor = DetrImageProcessor.from_pretrained(detr_checkpoint)
model = DetrForObjectDetection.from_pretrained(detr_checkpoint)

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Ber

In [5]:
# Baseline run: evaluate the original PyTorch model on the validation list.
# The helper returns the metrics object and the mean time per frame.
torch_eval_kwargs = dict(
    images_path=images_path,
    annots_path=annots_path,
    val_images=val_images,
    model=model,
    proc=processor,
)
res_map, mean_sec_per_frame = utils.evaluate_over_voc(**torch_eval_kwargs)

print('MAP: ', res_map)
print('SEC PER FRAME: ', mean_sec_per_frame)

  0%|          | 0/5823 [00:00<?, ?it/s]

MAP:  {'map': tensor(0.5615), 'map_50': tensor(0.7291), 'map_75': tensor(0.6085), 'map_small': tensor(0.1909), 'map_medium': tensor(0.4111), 'map_large': tensor(0.6619), 'mar_1': tensor(0.4643), 'mar_10': tensor(0.6253), 'mar_100': tensor(0.6278), 'mar_small': tensor(0.2524), 'mar_medium': tensor(0.4810), 'mar_large': tensor(0.7129), 'map_per_class': tensor(-1.), 'mar_100_per_class': tensor(-1.), 'classes': tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19], dtype=torch.int32)}
SEC PER FRAME:  0.014


Конвертация модели в ONNX

In [9]:
# Switch to evaluation mode before tracing so the exported graph reflects
# inference behaviour.
model.eval()
batch_size = 1

# Dummy tensor used only to trace the graph; laid out as
# (batch, channels, height, width). The exported graph is static at this size.
# NOTE(review): 800x1137 presumably matches what the processor produces for
# the evaluation images — confirm against utils.
# Named `dummy_input` rather than `input` so the Python builtin is not shadowed.
dummy_input = torch.randn(batch_size, 3, 800, 1137, requires_grad=True)

torch.onnx.export(
    model,
    dummy_input,
    "detr-resnet-50.onnx",
    export_params=True,
    opset_version=11,
    do_constant_folding=True,
    # Only one tensor is passed to the exporter, so only one input name is
    # given. Names are assigned positionally; a surplus "pixel_mask" entry
    # would not create a second model input.
    input_names=["pixel_values"],
    # NOTE(review): a single name is given, so only the first graph output is
    # called 'output'; any further outputs keep exporter-generated names.
    output_names=['output'],
)

  if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
  if attention_mask.size() != (batch_size, 1, target_len, source_len):
  if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):


In [5]:
# Reload the exported file and let the ONNX checker validate it;
# check_model raises if the model is malformed.
onnx_path = "detr-resnet-50.onnx"
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)

Тестирование модели ONNX

In [6]:
# Show which device this onnxruntime build reports.
ort_device = ort.get_device()
print(ort_device)

GPU


In [8]:
# Open the exported graph in ONNX Runtime, requesting the CUDA execution provider.
ort_session = ort.InferenceSession(
    'detr-resnet-50.onnx',
    providers=['CUDAExecutionProvider'],
)

In [10]:
# Same evaluation as the PyTorch baseline, but inference goes through the
# ONNX Runtime session. The PyTorch model is also handed to the helper
# (torch_model); how it is used there is not visible from this notebook.
onnx_eval_kwargs = dict(
    images_path=images_path,
    annots_path=annots_path,
    val_images=val_images,
    model=ort_session,
    torch_model=model,
    proc=processor,
)
res_map, mean_sec_per_frame = utils.evaluate_over_voc_onnx(**onnx_eval_kwargs)

print('MAP: ', res_map)
print('SEC PER FRAME: ', mean_sec_per_frame)

  0%|          | 0/5823 [00:00<?, ?it/s]

MAP:  {'map': tensor(0.5399), 'map_50': tensor(0.7018), 'map_75': tensor(0.5839), 'map_small': tensor(0.1740), 'map_medium': tensor(0.4010), 'map_large': tensor(0.6355), 'mar_1': tensor(0.4496), 'mar_10': tensor(0.6052), 'mar_100': tensor(0.6075), 'mar_small': tensor(0.2333), 'mar_medium': tensor(0.4679), 'mar_large': tensor(0.6890), 'map_per_class': tensor(-1.), 'mar_100_per_class': tensor(-1.), 'classes': tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19], dtype=torch.int32)}
SEC PER FRAME:  0.036


Конвертация PyTorch модели в OpenVINO

In [None]:
# Direct PyTorch -> OpenVINO conversion. This attempt was unsuccessful, so the
# failure is caught and reported instead of aborting a "Restart & Run All";
# the notebook continues with the ONNX -> OpenVINO route.
# NOTE(review): ov.convert_model accepts an `example_input` for PyTorch
# modules; supplying one may be worth trying — unverified here.
try:
    ov_model = ov.convert_model(model)
    ov.save_model(ov_model, "detr-resnet-50-from-torch-static.xml")
except Exception as exc:  # broad on purpose: the concrete error type was not recorded
    print(f"PyTorch -> OpenVINO conversion failed: {type(exc).__name__}: {exc}")

Конвертация ONNX модели в OpenVINO

In [11]:
# Convert the ONNX export with OpenVINO and save the result as an .xml model file.
onnx_source = "detr-resnet-50.onnx"
ir_target = "detr-resnet-50.xml"

ov_model = ov.convert_model(onnx_source)
ov.save_model(ov_model, ir_target)

Тестирование модели OpenVINO

In [19]:
# Create the OpenVINO runtime core and print every device it can see,
# one "<id>: <full device name>" line per device.
core = ov.Core()
devices = core.available_devices

for device in devices:
    full_name = core.get_property(device, "FULL_DEVICE_NAME")
    print(device, full_name, sep=": ")

CPU: 13th Gen Intel(R) Core(TM) i7-13700KF
GPU: NVIDIA GeForce RTX 4060 Ti (dGPU)


In [27]:
# Read the saved model back from disk and compile it for the 'GPU' device.
model_ir = core.read_model(model="detr-resnet-50.xml")
compiled_model = core.compile_model(
    model=model_ir,
    device_name='GPU',
)

In [21]:
# Same evaluation again, this time running inference through the compiled
# OpenVINO model. The PyTorch model is also handed to the helper
# (torch_model); how it is used there is not visible from this notebook.
ov_eval_kwargs = dict(
    images_path=images_path,
    annots_path=annots_path,
    val_images=val_images,
    model=compiled_model,
    torch_model=model,
    proc=processor,
)
res_map, mean_sec_per_frame = utils.evaluate_over_voc_ov(**ov_eval_kwargs)

print('MAP: ', res_map)
print('SEC PER FRAME: ', mean_sec_per_frame)

  0%|          | 0/5823 [00:00<?, ?it/s]

MAP:  {'map': tensor(0.5385), 'map_50': tensor(0.7016), 'map_75': tensor(0.5823), 'map_small': tensor(0.1722), 'map_medium': tensor(0.3995), 'map_large': tensor(0.6347), 'mar_1': tensor(0.4490), 'mar_10': tensor(0.6041), 'mar_100': tensor(0.6064), 'mar_small': tensor(0.2310), 'mar_medium': tensor(0.4655), 'mar_large': tensor(0.6884), 'map_per_class': tensor(-1.), 'mar_100_per_class': tensor(-1.), 'classes': tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19], dtype=torch.int32)}
SEC PER FRAME:  0.506
