In [1]:
import os
import time

import cv2
import onnx
import torch
import numpy as np
import openvino as ov
import onnxruntime as ort
import matplotlib.pyplot as plt 

from typing import List, Tuple
from tqdm.notebook import tqdm

from torchmetrics.detection import MeanAveragePrecision
from transformers import AutoImageProcessor, AutoModelForObjectDetection
from transformers import DetrImageProcessor, DetrForObjectDetection

import my_utils

In [2]:
# Root directory of the Pascal VOC2012 dataset. Set the VOC2012_ROOT
# environment variable to point the notebook at another copy of the data;
# the default is the original local Windows location.
voc_root = os.environ.get('VOC2012_ROOT', 'D:\\datasets\\VOC2012')

# Directories are joined with a trailing '' component so each result ends with
# the platform's path separator, exactly like the previous hard-coded values.
image_sets_path = os.path.join(voc_root, 'ImageSets', 'Main', '')
train_path = os.path.join(image_sets_path, 'train.txt')
val_path = os.path.join(image_sets_path, 'val.txt')

images_path = os.path.join(voc_root, 'JPEGImages', '')
annots_path = os.path.join(voc_root, 'Annotations', '')

In [3]:
# Read the entries listed in the train / val split files.
train_images = my_utils.read_set(train_path)
val_images = my_utils.read_set(val_path)

# Leakage check: no entry may appear in both splits. The intersection used to
# be a bare expression in the middle of the cell, so its result was silently
# discarded; it is now asserted explicitly.
overlap = set(train_images) & set(val_images)
assert not overlap, f'train/val splits share {len(overlap)} entries'

print(f'train: {len(train_images)}')
print(f'val: {len(val_images)}')

train: 5717
val: 5823


In [4]:
# Hugging Face checkpoint under evaluation. A DETR alternative can be loaded
# from "facebook/detr-resnet-50" with DetrImageProcessor /
# DetrForObjectDetection (both imported in the first cell).
checkpoint = "hustvl/yolos-small"

# The image processor and the detection model come from the same checkpoint.
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForObjectDetection.from_pretrained(checkpoint)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [6]:
# Evaluate `model` on the first 100 validation entries. The helper returns a
# pair: the mAP results and the mean number of seconds spent per frame.
eval_kwargs = dict(
    images_path=images_path,
    annots_path=annots_path,
    val_images=val_images[:100],
    model=model,
    proc=processor,
)
res_map, mean_sec_per_frame = my_utils.evaluate_over_voc(**eval_kwargs)

for label, value in (('MAP: ', res_map), ('SEC PER FRAME: ', mean_sec_per_frame)):
    print(label, value)

  0%|          | 0/100 [00:00<?, ?it/s]

MAP:  {'map': tensor(0.5079), 'map_50': tensor(0.6488), 'map_75': tensor(0.5570), 'map_small': tensor(0.3027), 'map_medium': tensor(0.3172), 'map_large': tensor(0.6976), 'mar_1': tensor(0.4670), 'mar_10': tensor(0.5583), 'mar_100': tensor(0.5583), 'mar_small': tensor(0.3676), 'mar_medium': tensor(0.3440), 'mar_large': tensor(0.7267), 'map_per_class': tensor(-1.), 'mar_100_per_class': tensor(-1.), 'classes': tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19], dtype=torch.int32)}
SEC PER FRAME:  1.475


In [8]:
from optimum.bettertransformer import BetterTransformer

In [9]:
# Convert the model to its BetterTransformer implementation and rebind `model`
# to the result. keep_original_model=False is the library default, spelled out
# here because it means no separate copy of the original model is kept.
# NOTE(review): re-running this cell on an already converted model is expected
# to be rejected by optimum - reload the model first (TODO confirm).
model = BetterTransformer.transform(model, keep_original_model=False)

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [10]:
# Run the evaluation on the first 100 validation entries with the current
# `model`, so the printed numbers can be compared with an earlier run of the
# same evaluation.
val_subset = val_images[:100]
results = my_utils.evaluate_over_voc(
    images_path=images_path,
    annots_path=annots_path,
    val_images=val_subset,
    model=model,
    proc=processor,
)
res_map, mean_sec_per_frame = results

print('MAP: ', res_map)
print('SEC PER FRAME: ', mean_sec_per_frame)

  0%|          | 0/100 [00:00<?, ?it/s]

MAP:  {'map': tensor(0.5079), 'map_50': tensor(0.6488), 'map_75': tensor(0.5570), 'map_small': tensor(0.3027), 'map_medium': tensor(0.3172), 'map_large': tensor(0.6976), 'mar_1': tensor(0.4670), 'mar_10': tensor(0.5583), 'mar_100': tensor(0.5583), 'mar_small': tensor(0.3676), 'mar_medium': tensor(0.3440), 'mar_large': tensor(0.7267), 'map_per_class': tensor(-1.), 'mar_100_per_class': tensor(-1.), 'classes': tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19], dtype=torch.int32)}
SEC PER FRAME:  1.098
