In [24]:
# !pip install ultralyticsplus 

### Imports

In [1]:
import os
import requests
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import torch
import torchvision.transforms as T
from PIL import Image, ImageDraw, ImageFont

from PIL import Image, ImageDraw
from ultralytics import YOLO
from ultralyticsplus import render_result


device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [2]:
from object_detection_utils import (
    COLORS,
    preprocess, 
    box_cxcywh_to_xyxy, 
    rescale_bboxes,
    batch_detect,
    detect, 
    plot_results,
    plot_batch_detections,
    plot_batch_detections,
    plot_results_avenue,
    load_images_from_folder,
    list_image_files,
    save_cropped_images,
)

### Load Model

In [3]:
# load model
model = YOLO('./pretrained/yolov8x')

In [4]:
model = model.to(device)

In [5]:
# set model parameters
model.overrides['conf'] = 0.6  # confidence threshold
model.overrides['iou'] = 0.45  # NMS IoU threshold
model.overrides['agnostic_nms'] = False  # NMS class-agnostic
model.overrides['max_det'] = 20  # maximum number of detections per image

AttributeError: 'NoneType' object has no attribute 'overrides'

### Test on 1 Img

In [None]:
transform = T.Compose([
    T.Resize((500, 500)),  # Assuming 800 is the input size for the model
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [None]:
image_tensor, image = preprocess("./assets/demo6.jpg", transform=transform)

In [None]:
# %%timeit -n 100
# # benchmark
# with torch.no_grad():
#     # Run the model on the batch
#     outputs = model(pixel_values=batch_images)

In [None]:
results = model(image)

In [None]:
# observe results
print(results[0].boxes.xyxy.int().tolist())
render = render_result(model=model, image=image, result=results[0])
render.show()

### Avenue Prediction using YOLOv8

In [None]:
train_dir = "./datasets/Avenue Dataset/train__/"
train_video_dirs = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16"]
train_video_paths = []
total_frames = 0
for dir in train_video_dirs:
    cur_dir = os.path.join(train_dir, dir)
    train_video_paths.append(cur_dir)
    jpg_files = [f for f in os.listdir(cur_dir) if f.endswith('.jpg')]
    total_frames += len(jpg_files)
    print(cur_dir, len(jpg_files))
print(f"Total Frames: {total_frames}")

In [None]:
test_dir = "./datasets/Avenue Dataset/test__/"
test_video_dirs = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"]
test_video_paths = []
total_frames = 0
for dir in test_video_dirs:
    cur_dir = os.path.join(test_dir, dir)
    test_video_paths.append(cur_dir)
    jpg_files = [f for f in os.listdir(cur_dir) if f.endswith('.jpg')]
    total_frames += len(jpg_files)
    print(cur_dir, len(jpg_files))
print(f"Total Frames: {total_frames}")

In [None]:
preds_path = "./datasets/Avenue Dataset/predictions/"
train_preds_path = os.path.join(preds_path, "train/")
test_preds_path = os.path.join(preds_path, "test/")
print(train_preds_path, test_preds_path)

In [None]:
objects_path = "./datasets/Avenue Dataset/objects/"
train_objects_path = os.path.join(objects_path, "train/")
test_objects_path = os.path.join(objects_path, "test/")
print(train_objects_path, test_objects_path)

In [None]:
image_names = [img.split('/')[-1] for img in list_image_files(test_video_paths[-1])]
print(image_names)

In [None]:
def plot_results_avenue(pil_img, prob, boxes, im_size=(640, 360), display_img=True, save_path=None, crop_objects=False):
    orig_width, orig_height = pil_img.size
    scale_x = orig_width / im_size[0]
    scale_y = orig_height / im_size[1]

    # Create a copy of the image for cropping
    pil_img_copy = pil_img.copy()

    draw = ImageDraw.Draw(pil_img)
    font = ImageFont.load_default()  # Can be changed to another font

    cropped_images = []  # List to hold cropped objects

    for (xmin, ymin, xmax, ymax), c in zip(boxes, COLORS * 100):
        xmin, xmax = xmin * scale_x, xmax * scale_x
        ymin, ymax = ymin * scale_y, ymax * scale_y

        color = tuple(int(255 * x) for x in c)
        draw.rectangle([xmin, ymin, xmax, ymax], outline=color, width=3)

        if crop_objects:
            # Cropping from the unaltered image copy
            cropped_obj = pil_img_copy.crop((xmin, ymin, xmax, ymax))
            cropped_images.append(cropped_obj)

        # Optionally add class text
        # cl = p.argmax()
        # text = f'{CLASSES[cl]}: {p[cl]:0.2f}'
        # draw.text((xmin, ymin), text, fill=color, font=font)

    if display_img:
        plt.figure(figsize=(16,10))
        plt.imshow(pil_img)
        plt.axis('off')
        plt.show()

    if save_path:
        pil_img.save(save_path)

    return pil_img, cropped_images

### Generate Train Objects and predictions

In [62]:
%%capture
transform = T.Compose([
    T.Resize((640, 360)),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
bbox_temp = {}
video_names = [video.split('/')[-1] for video in train_video_paths]
print(video_names)
# obj_det_dict = {}

for i, video_path in enumerate(train_video_paths):
    print(video_path)
    image_names = [img.split('/')[-1] for img in list_image_files(train_video_paths[i])]

    frame_idx = 0
    for image_name in image_names:
        full_image_path = os.path.join(train_video_paths[i], image_name)
        
        image_tensor, image = preprocess(full_image_path, transform=transform)
        
        # probab, boxes = detect(image, model)
        with torch.no_grad():
            results = model(image)

        # # Process boxes and convert to Python lists with integers
        # boxes_list = []
        # for box in boxes:
        #     int_box = [int(elem.item()) for elem in box]  # Convert each element to an int and then to a Python list
        #     boxes_list.append(int_box)

        boxes_list = results[0].boxes.xyxy.int().tolist()

        # TODO: Save this dict
        bbox_temp[frame_idx] = boxes_list

        frame_idx += 1
    
        image_prediction_path = os.path.join(train_preds_path, test_video_dirs[i], image_name)
        # print(image_prediction_path)
    
        # This function can now also create images with bounding boxes and the cropped objects and save them to a location
        _, cropped_objects = plot_results_avenue(image, 0.5, boxes_list, im_size=(640, 360), display_img=False, save_path=image_prediction_path, crop_objects=True)
    
        objects_prediction_path = os.path.join(train_objects_path, test_video_dirs[i])
        
        save_cropped_images(cropped_objects, objects_prediction_path, image_prefix=image_name.split('.')[0])   
   
    # obj_det_dict[video_names[i]] = bbox_temp




['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16']
./datasets/Avenue Dataset/train__/01


0: 384x640 6 persons, 207.6ms
Speed: 1.4ms preprocess, 207.6ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 86.1ms
Speed: 1.2ms preprocess, 86.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 backpack, 15.7ms
Speed: 0.6ms preprocess, 15.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bird, 1 backpack, 1 handbag, 15.2ms
Speed: 0.6ms preprocess, 15.2ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 birds, 1 backpack, 1 handbag, 15.5ms
Speed: 0.5ms preprocess, 15.5ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 birds, 1 backpack, 1 handbag, 15.4ms
Speed: 0.6ms preprocess, 15.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 bird, 1 backpack, 1 handbag, 15.4ms
Speed: 0.6ms preprocess, 15.4ms inference, 1.0ms postprocess per imag

./datasets/Avenue Dataset/train__/02


0: 384x640 3 persons, 2 birds, 1 handbag, 15.0ms
Speed: 0.6ms preprocess, 15.0ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 bird, 1 handbag, 14.8ms
Speed: 0.4ms preprocess, 14.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 bird, 3 handbags, 14.8ms
Speed: 0.7ms preprocess, 14.8ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 bird, 1 handbag, 14.9ms
Speed: 0.5ms preprocess, 14.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 3 handbags, 14.9ms
Speed: 0.6ms preprocess, 14.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 2 handbags, 15.2ms
Speed: 0.6ms preprocess, 15.2ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 handbag, 14.8ms
Speed: 0.4ms preprocess, 14.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

./datasets/Avenue Dataset/train__/03



0: 384x640 7 persons, 2 birds, 1 handbag, 17.9ms
Speed: 0.9ms preprocess, 17.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 birds, 1 handbag, 17.8ms
Speed: 0.8ms preprocess, 17.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 birds, 17.8ms
Speed: 0.7ms preprocess, 17.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 birds, 1 backpack, 1 handbag, 17.7ms
Speed: 0.7ms preprocess, 17.7ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 birds, 1 handbag, 18.2ms
Speed: 0.8ms preprocess, 18.2ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 2 birds, 1 handbag, 17.7ms
Speed: 0.7ms preprocess, 17.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 2 birds, 1 handbag, 17.7ms
Speed: 0.7ms preprocess, 17.7ms inference, 0.9ms postprocess per im

./datasets/Avenue Dataset/train__/04



0: 384x640 3 persons, 1 backpack, 18.1ms
Speed: 0.6ms preprocess, 18.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 backpack, 18.2ms
Speed: 0.6ms preprocess, 18.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 backpack, 18.2ms
Speed: 0.7ms preprocess, 18.2ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 backpack, 18.2ms
Speed: 0.8ms preprocess, 18.2ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 17.1ms
Speed: 0.7ms preprocess, 17.1ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 15.6ms
Speed: 0.5ms preprocess, 15.6ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 backpack, 15.6ms
Speed: 0.7ms preprocess, 15.6ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 bird, 1 handbag, 15.8ms
Sp

./datasets/Avenue Dataset/train__/05



0: 384x640 11 persons, 1 handbag, 21.7ms
Speed: 0.7ms preprocess, 21.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 backpack, 1 handbag, 22.1ms
Speed: 0.7ms preprocess, 22.1ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 backpack, 1 handbag, 22.6ms
Speed: 0.8ms preprocess, 22.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 backpack, 1 handbag, 23.4ms
Speed: 0.7ms preprocess, 23.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 bird, 1 backpack, 1 handbag, 22.4ms
Speed: 0.9ms preprocess, 22.4ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 backpack, 1 handbag, 23.2ms
Speed: 0.7ms preprocess, 23.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 backpack, 1 handbag, 22.5ms
Speed: 0.8ms preprocess, 22.5ms inference, 1

./datasets/Avenue Dataset/train__/06


0: 384x640 6 persons, 1 handbag, 15.0ms
Speed: 0.5ms preprocess, 15.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 handbag, 15.2ms
Speed: 0.7ms preprocess, 15.2ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bird, 1 handbag, 15.0ms
Speed: 0.6ms preprocess, 15.0ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 bird, 1 handbag, 15.1ms
Speed: 0.5ms preprocess, 15.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bird, 2 handbags, 15.4ms
Speed: 0.7ms preprocess, 15.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bird, 2 handbags, 17.9ms
Speed: 0.5ms preprocess, 17.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 birds, 2 handbags, 18.0ms
Speed: 0.7ms preprocess, 18.0ms inference, 1.0ms postprocess per image at shape (1, 3, 3

./datasets/Avenue Dataset/train__/07



0: 384x640 6 persons, 2 handbags, 25.3ms
Speed: 0.7ms preprocess, 25.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 handbags, 25.3ms
Speed: 0.8ms preprocess, 25.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 handbags, 25.3ms
Speed: 1.0ms preprocess, 25.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 handbags, 25.3ms
Speed: 0.9ms preprocess, 25.3ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 handbag, 25.2ms
Speed: 0.9ms preprocess, 25.2ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 handbags, 25.3ms
Speed: 0.8ms preprocess, 25.3ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 handbags, 25.4ms
Speed: 0.9ms preprocess, 25.4ms inference, 2.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 han

./datasets/Avenue Dataset/train__/08


0: 384x640 13 persons, 3 backpacks, 3 handbags, 29.5ms
Speed: 0.8ms preprocess, 29.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 3 backpacks, 3 handbags, 27.1ms
Speed: 1.3ms preprocess, 27.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 3 backpacks, 3 handbags, 25.8ms
Speed: 0.8ms preprocess, 25.8ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 3 backpacks, 3 handbags, 26.3ms
Speed: 1.0ms preprocess, 26.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 3 backpacks, 3 handbags, 25.9ms
Speed: 0.8ms preprocess, 25.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 3 backpacks, 3 handbags, 25.8ms
Speed: 0.9ms preprocess, 25.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 3 backpacks, 3 handbags, 51.4ms
Speed: 0.8ms preprocess, 51

./datasets/Avenue Dataset/train__/09


0: 384x640 6 persons, 1 backpack, 21.4ms
Speed: 0.8ms preprocess, 21.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 backpack, 1 handbag, 21.3ms
Speed: 0.7ms preprocess, 21.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 backpack, 21.3ms
Speed: 0.7ms preprocess, 21.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 backpack, 21.4ms
Speed: 0.7ms preprocess, 21.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 backpack, 20.0ms
Speed: 0.8ms preprocess, 20.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 20.0ms
Speed: 0.7ms preprocess, 20.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 19.9ms
Speed: 0.7ms preprocess, 19.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 20.0ms
Speed: 0.8m

./datasets/Avenue Dataset/train__/10


0: 384x640 13 persons, 2 backpacks, 1 handbag, 25.7ms
Speed: 0.8ms preprocess, 25.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 1 backpack, 1 handbag, 25.6ms
Speed: 0.8ms preprocess, 25.6ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 2 backpacks, 1 handbag, 1 suitcase, 25.6ms
Speed: 0.7ms preprocess, 25.6ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 1 backpack, 2 handbags, 25.7ms
Speed: 0.7ms preprocess, 25.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 2 backpacks, 2 handbags, 25.6ms
Speed: 0.7ms preprocess, 25.6ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 1 backpack, 1 handbag, 25.6ms
Speed: 0.8ms preprocess, 25.6ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 backpack, 25.6ms
Speed: 0.8ms preprocess, 25.6ms inf

./datasets/Avenue Dataset/train__/11



0: 384x640 14 persons, 25.9ms
Speed: 0.9ms preprocess, 25.9ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 1 bird, 1 handbag, 25.9ms
Speed: 0.7ms preprocess, 25.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 1 bird, 26.0ms
Speed: 0.7ms preprocess, 26.0ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 1 handbag, 25.9ms
Speed: 0.9ms preprocess, 25.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 1 handbag, 25.9ms
Speed: 0.7ms preprocess, 25.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 1 handbag, 25.8ms
Speed: 0.7ms preprocess, 25.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 1 handbag, 25.9ms
Speed: 0.9ms preprocess, 25.9ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 1 handbag

./datasets/Avenue Dataset/train__/12



0: 384x640 13 persons, 4 handbags, 25.8ms
Speed: 1.0ms preprocess, 25.8ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 5 handbags, 25.9ms
Speed: 0.9ms preprocess, 25.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 5 handbags, 26.0ms
Speed: 0.9ms preprocess, 26.0ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 6 handbags, 26.3ms
Speed: 0.7ms preprocess, 26.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 4 handbags, 26.3ms
Speed: 0.9ms preprocess, 26.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 5 handbags, 26.6ms
Speed: 0.8ms preprocess, 26.6ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 6 handbags, 26.5ms
Speed: 0.7ms preprocess, 26.5ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 perso

./datasets/Avenue Dataset/train__/13


0: 384x640 16 persons, 2 backpacks, 2 handbags, 21.8ms
Speed: 0.8ms preprocess, 21.8ms inference, 2.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 2 backpacks, 2 handbags, 21.8ms
Speed: 0.7ms preprocess, 21.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 2 backpacks, 2 handbags, 21.8ms
Speed: 0.8ms preprocess, 21.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 2 backpacks, 2 handbags, 21.8ms
Speed: 0.9ms preprocess, 21.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 2 backpacks, 2 handbags, 21.8ms
Speed: 0.7ms preprocess, 21.8ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 2 backpacks, 2 handbags, 21.8ms
Speed: 0.8ms preprocess, 21.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 3 backpacks, 1 handbag, 21.8ms
Speed: 0.7ms preprocess, 21.

./datasets/Avenue Dataset/train__/14


0: 384x640 12 persons, 2 handbags, 23.8ms
Speed: 0.8ms preprocess, 23.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 handbag, 24.1ms
Speed: 0.9ms preprocess, 24.1ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 24.4ms
Speed: 0.9ms preprocess, 24.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 25.0ms
Speed: 0.8ms preprocess, 25.0ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 handbag, 24.4ms
Speed: 0.7ms preprocess, 24.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 24.4ms
Speed: 0.8ms preprocess, 24.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 handbag, 24.4ms
Speed: 0.8ms preprocess, 24.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 handbag, 24.6ms
Speed: 0.8ms prepr

./datasets/Avenue Dataset/train__/15



0: 384x640 14 persons, 3 handbags, 1 suitcase, 14.6ms
Speed: 0.6ms preprocess, 14.6ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 3 handbags, 1 suitcase, 14.4ms
Speed: 0.7ms preprocess, 14.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 1 backpack, 5 handbags, 1 suitcase, 20.6ms
Speed: 0.6ms preprocess, 20.6ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 5 handbags, 1 suitcase, 14.7ms
Speed: 0.8ms preprocess, 14.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 4 handbags, 1 suitcase, 14.4ms
Speed: 0.7ms preprocess, 14.4ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 5 handbags, 1 suitcase, 14.4ms
Speed: 0.6ms preprocess, 14.4ms inference, 2.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 5 handbags, 1 suitcase, 14.4ms
Speed: 0.5ms preproce

./datasets/Avenue Dataset/train__/16



0: 384x640 17 persons, 1 backpack, 2 handbags, 1 suitcase, 21.8ms
Speed: 0.9ms preprocess, 21.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 18 persons, 1 backpack, 2 handbags, 1 suitcase, 21.6ms
Speed: 0.8ms preprocess, 21.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 18 persons, 1 backpack, 3 handbags, 1 suitcase, 21.4ms
Speed: 0.8ms preprocess, 21.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 18 persons, 1 backpack, 2 handbags, 1 suitcase, 20.5ms
Speed: 0.8ms preprocess, 20.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 17 persons, 1 backpack, 2 handbags, 1 suitcase, 24.7ms
Speed: 0.9ms preprocess, 24.7ms inference, 3.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 19 persons, 1 backpack, 2 handbags, 1 suitcase, 1 book, 22.5ms
Speed: 0.8ms preprocess, 22.5ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x

CPU times: user 13min 10s, sys: 1min 22s, total: 14min 33s
Wall time: 18min 26s


### Generate Test objects and predictions

In [63]:
%%capture
transform = T.Compose([
    T.Resize((640, 360)),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
bbox_temp = {}
video_names = [video.split('/')[-1] for video in test_video_paths]
print(video_names)
obj_det_dict = {}

for i, video_path in enumerate(test_video_paths):
    print(video_path)
    image_names = [img.split('/')[-1] for img in list_image_files(test_video_paths[i])]

    frame_idx = 0
    for image_name in image_names:
        full_image_path = os.path.join(test_video_paths[i], image_name)
        
        image_tensor, image = preprocess(full_image_path, transform=transform)
        
        # probab, boxes = detect(image, model)
        with torch.no_grad():
            results = model(image)

        # # Process boxes and convert to Python lists with integers
        # boxes_list = []
        # for box in boxes:
        #     int_box = [int(elem.item()) for elem in box]  # Convert each element to an int and then to a Python list
        #     boxes_list.append(int_box)

        boxes_list = results[0].boxes.xyxy.int().tolist()

        # TODO: Save this dict
        bbox_temp[frame_idx] = boxes_list

        frame_idx += 1
    
        image_prediction_path = os.path.join(test_preds_path, test_video_dirs[i], image_name)
        # print(image_prediction_path)
    
        # This function can now also create images with bounding boxes and the cropped objects and save them to a location
        _, cropped_objects = plot_results_avenue(image, 0.5, boxes_list, im_size=(640, 360), display_img=False, save_path=image_prediction_path, crop_objects=True)
    
        objects_prediction_path = os.path.join(test_objects_path, test_video_dirs[i])
        
        save_cropped_images(cropped_objects, objects_prediction_path, image_prefix=image_name.split('.')[0])   
   
    obj_det_dict[video_names[i]] = bbox_temp


0: 384x640 6 persons, 1 handbag, 21.9ms
Speed: 0.8ms preprocess, 21.9ms inference, 2.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bird, 1 handbag, 21.6ms
Speed: 0.9ms preprocess, 21.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 birds, 1 handbag, 21.7ms
Speed: 0.8ms preprocess, 21.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)



['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21']
./datasets/Avenue Dataset/test__/01


0: 384x640 6 persons, 2 birds, 1 handbag, 20.7ms
Speed: 0.7ms preprocess, 20.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 birds, 1 handbag, 20.7ms
Speed: 0.7ms preprocess, 20.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 birds, 1 handbag, 21.6ms
Speed: 0.7ms preprocess, 21.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 birds, 1 backpack, 1 handbag, 21.5ms
Speed: 0.8ms preprocess, 21.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 birds, 1 handbag, 19.9ms
Speed: 0.7ms preprocess, 19.9ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 birds, 1 backpack, 1 handbag, 19.1ms
Speed: 0.8ms preprocess, 19.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 birds, 1 handbag, 19.4ms
Speed: 0.9ms preprocess, 19.4ms inference, 1.

./datasets/Avenue Dataset/test__/02



0: 384x640 12 persons, 1 bird, 1 backpack, 3 handbags, 24.2ms
Speed: 0.7ms preprocess, 24.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 birds, 1 backpack, 3 handbags, 24.1ms
Speed: 0.7ms preprocess, 24.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 2 birds, 2 backpacks, 4 handbags, 25.4ms
Speed: 0.7ms preprocess, 25.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 birds, 2 backpacks, 3 handbags, 25.6ms
Speed: 0.7ms preprocess, 25.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 2 birds, 2 backpacks, 5 handbags, 26.2ms
Speed: 0.6ms preprocess, 26.2ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 2 birds, 2 backpacks, 3 handbags, 24.4ms
Speed: 0.8ms preprocess, 24.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 2 birds

./datasets/Avenue Dataset/test__/03



0: 384x640 7 persons, 1 bird, 1 handbag, 18.4ms
Speed: 0.7ms preprocess, 18.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 birds, 2 handbags, 19.0ms
Speed: 0.7ms preprocess, 19.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 birds, 2 handbags, 19.0ms
Speed: 0.8ms preprocess, 19.0ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 3 birds, 1 handbag, 18.4ms
Speed: 0.9ms preprocess, 18.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 birds, 3 handbags, 18.3ms
Speed: 0.8ms preprocess, 18.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 bird, 1 handbag, 19.0ms
Speed: 0.7ms preprocess, 19.0ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 birds, 2 handbags, 19.1ms
Speed: 0.7ms preprocess, 19.1ms inference, 1.0ms postprocess per i

./datasets/Avenue Dataset/test__/04


0: 384x640 9 persons, 2 handbags, 19.2ms
Speed: 0.8ms preprocess, 19.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 bird, 3 handbags, 18.9ms
Speed: 0.7ms preprocess, 18.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 3 handbags, 19.1ms
Speed: 0.6ms preprocess, 19.1ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 2 handbags, 19.1ms
Speed: 0.8ms preprocess, 19.1ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 3 handbags, 19.0ms
Speed: 0.7ms preprocess, 19.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 handbag, 18.3ms
Speed: 0.7ms preprocess, 18.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 2 handbags, 18.4ms
Speed: 0.8ms preprocess, 18.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 p

./datasets/Avenue Dataset/test__/05



0: 384x640 10 persons, 1 bird, 1 handbag, 18.6ms
Speed: 0.8ms preprocess, 18.6ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 bird, 1 handbag, 18.4ms
Speed: 0.7ms preprocess, 18.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 bird, 2 handbags, 31.0ms
Speed: 2.8ms preprocess, 31.0ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 bird, 2 handbags, 18.4ms
Speed: 0.8ms preprocess, 18.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 bird, 1 handbag, 19.0ms
Speed: 0.6ms preprocess, 19.0ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 bird, 1 handbag, 19.0ms
Speed: 0.6ms preprocess, 19.0ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 bird, 1 handbag, 19.1ms
Speed: 0.7ms preprocess, 19.1ms inference, 1.1ms postprocess per imag

./datasets/Avenue Dataset/test__/06



0: 384x640 9 persons, 1 bird, 1 backpack, 2 handbags, 18.1ms
Speed: 0.7ms preprocess, 18.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 2 backpacks, 2 handbags, 18.1ms
Speed: 0.7ms preprocess, 18.1ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 backpack, 1 handbag, 18.2ms
Speed: 0.7ms preprocess, 18.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 bird, 1 backpack, 1 handbag, 18.0ms
Speed: 0.6ms preprocess, 18.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 bird, 1 backpack, 1 handbag, 18.0ms
Speed: 0.6ms preprocess, 18.0ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 bird, 1 backpack, 1 handbag, 18.0ms
Speed: 0.7ms preprocess, 18.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 2 backpacks, 2 handbags, 18.4ms
Speed: 0.8

./datasets/Avenue Dataset/test__/07


0: 384x640 8 persons, 1 bird, 23.1ms
Speed: 0.7ms preprocess, 23.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 bird, 22.9ms
Speed: 0.8ms preprocess, 22.9ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 2 birds, 1 backpack, 23.0ms
Speed: 0.7ms preprocess, 23.0ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 2 birds, 1 backpack, 22.8ms
Speed: 0.7ms preprocess, 22.8ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 handbag, 23.0ms
Speed: 0.7ms preprocess, 23.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 bird, 23.5ms
Speed: 0.7ms preprocess, 23.5ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 bird, 23.0ms
Speed: 0.7ms preprocess, 23.0ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 23.3

./datasets/Avenue Dataset/test__/08



0: 384x640 7 persons, 1 handbag, 20.9ms
Speed: 0.7ms preprocess, 20.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 handbag, 20.9ms
Speed: 0.6ms preprocess, 20.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 handbags, 20.2ms
Speed: 0.7ms preprocess, 20.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 handbags, 19.9ms
Speed: 0.9ms preprocess, 19.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 handbags, 19.7ms
Speed: 0.8ms preprocess, 19.7ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 handbags, 19.8ms
Speed: 0.6ms preprocess, 19.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 handbag, 19.6ms
Speed: 0.7ms preprocess, 19.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 handb

./datasets/Avenue Dataset/test__/09



0: 384x640 4 persons, 15.1ms
Speed: 0.7ms preprocess, 15.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 15.2ms
Speed: 0.4ms preprocess, 15.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 14.4ms
Speed: 0.5ms preprocess, 14.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 14.5ms
Speed: 0.7ms preprocess, 14.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 14.6ms
Speed: 0.5ms preprocess, 14.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 15.1ms
Speed: 0.4ms preprocess, 15.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 14.5ms
Speed: 0.5ms preprocess, 14.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 14.5ms
Speed: 0.7ms preprocess, 14.5ms inference, 0.9ms postprocess per image at shape (

./datasets/Avenue Dataset/test__/10



0: 384x640 7 persons, 1 handbag, 14.6ms
Speed: 0.5ms preprocess, 14.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 15.4ms
Speed: 0.5ms preprocess, 15.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 backpack, 15.3ms
Speed: 0.7ms preprocess, 15.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 backpack, 1 handbag, 14.6ms
Speed: 0.7ms preprocess, 14.6ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 backpack, 1 handbag, 14.6ms
Speed: 0.6ms preprocess, 14.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 backpack, 1 handbag, 14.6ms
Speed: 0.5ms preprocess, 14.6ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 backpack, 1 handbag, 15.2ms
Speed: 0.5ms preprocess, 15.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 6

./datasets/Avenue Dataset/test__/11


0: 384x640 11 persons, 1 backpack, 1 handbag, 15.1ms
Speed: 0.5ms preprocess, 15.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 backpack, 1 handbag, 15.1ms
Speed: 0.6ms preprocess, 15.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 2 backpacks, 1 handbag, 15.1ms
Speed: 0.6ms preprocess, 15.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 2 backpacks, 1 handbag, 15.1ms
Speed: 0.6ms preprocess, 15.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 backpacks, 1 handbag, 15.2ms
Speed: 0.6ms preprocess, 15.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 2 backpacks, 1 handbag, 20.5ms
Speed: 0.5ms preprocess, 20.5ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 backpack, 1 handbag, 15.5ms
Speed: 0.6ms preprocess, 15.5ms infer

./datasets/Avenue Dataset/test__/12


0: 384x640 3 persons, 24.9ms
Speed: 0.9ms preprocess, 24.9ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 24.9ms
Speed: 0.7ms preprocess, 24.9ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 24.9ms
Speed: 0.7ms preprocess, 24.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 21.4ms
Speed: 0.8ms preprocess, 21.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 20.3ms
Speed: 0.7ms preprocess, 20.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 20.4ms
Speed: 0.7ms preprocess, 20.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 20.4ms
Speed: 0.7ms preprocess, 20.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 20.3ms
Speed: 0.8ms preprocess, 20.3ms inference, 1.1ms postprocess per image at shape (1

./datasets/Avenue Dataset/test__/13


0: 384x640 1 person, 14.5ms
Speed: 0.6ms preprocess, 14.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 14.3ms
Speed: 0.5ms preprocess, 14.3ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 14.6ms
Speed: 0.7ms preprocess, 14.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 15.2ms
Speed: 0.6ms preprocess, 15.2ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 14.9ms
Speed: 0.6ms preprocess, 14.9ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 14.3ms
Speed: 0.5ms preprocess, 14.3ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 13.1ms
Speed: 0.6ms preprocess, 13.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 14.0ms
Speed: 0.5ms preprocess, 14.0ms inference, 0.9ms postprocess per image at shape (1, 3, 384

./datasets/Avenue Dataset/test__/14



0: 384x640 10 persons, 1 bird, 2 backpacks, 1 handbag, 18.7ms
Speed: 0.6ms preprocess, 18.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 3 backpacks, 1 handbag, 18.7ms
Speed: 0.8ms preprocess, 18.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 2 backpacks, 1 handbag, 19.5ms
Speed: 0.7ms preprocess, 19.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 3 backpacks, 19.4ms
Speed: 0.7ms preprocess, 19.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 2 backpacks, 1 handbag, 19.7ms
Speed: 0.7ms preprocess, 19.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 bird, 2 backpacks, 1 handbag, 19.6ms
Speed: 0.6ms preprocess, 19.6ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 bird, 2 backpacks, 1 handbag, 19.8ms
Speed: 0.7ms preprocess, 1

./datasets/Avenue Dataset/test__/15


0: 384x640 12 persons, 1 backpack, 1 handbag, 27.4ms
Speed: 0.8ms preprocess, 27.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 backpack, 2 handbags, 27.0ms
Speed: 0.8ms preprocess, 27.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 2 backpacks, 2 handbags, 26.8ms
Speed: 0.7ms preprocess, 26.8ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 handbags, 27.5ms
Speed: 0.9ms preprocess, 27.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 2 handbags, 26.8ms
Speed: 0.7ms preprocess, 26.8ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 handbag, 26.2ms
Speed: 0.7ms preprocess, 26.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 backpack, 1 handbag, 27.2ms
Speed: 0.7ms preprocess, 27.2ms inference, 1.3ms postprocess per image a

./datasets/Avenue Dataset/test__/16



0: 384x640 9 persons, 2 handbags, 18.3ms
Speed: 0.7ms preprocess, 18.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 2 handbags, 18.3ms
Speed: 0.7ms preprocess, 18.3ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 2 handbags, 19.3ms
Speed: 0.7ms preprocess, 19.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 4 handbags, 18.3ms
Speed: 0.8ms preprocess, 18.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 2 handbags, 19.2ms
Speed: 0.7ms preprocess, 19.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 2 handbags, 19.2ms
Speed: 0.7ms preprocess, 19.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 2 handbags, 19.0ms
Speed: 0.8ms preprocess, 19.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 2 ha

./datasets/Avenue Dataset/test__/17



0: 384x640 10 persons, 2 backpacks, 1 handbag, 19.7ms
Speed: 0.7ms preprocess, 19.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 2 backpacks, 2 handbags, 19.8ms
Speed: 0.7ms preprocess, 19.8ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 2 backpacks, 2 handbags, 18.9ms
Speed: 0.8ms preprocess, 18.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 backpacks, 2 handbags, 19.2ms
Speed: 0.8ms preprocess, 19.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 backpacks, 2 handbags, 19.0ms
Speed: 0.7ms preprocess, 19.0ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 backpacks, 2 handbags, 18.9ms
Speed: 0.6ms preprocess, 18.9ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 backpacks, 2 handbags, 19.0ms
Speed: 0.7ms preprocess, 19

./datasets/Avenue Dataset/test__/18



0: 384x640 10 persons, 1 handbag, 22.9ms
Speed: 0.7ms preprocess, 22.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 22.6ms
Speed: 0.7ms preprocess, 22.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 handbag, 22.8ms
Speed: 0.8ms preprocess, 22.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 handbag, 22.7ms
Speed: 0.7ms preprocess, 22.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 bird, 1 handbag, 23.1ms
Speed: 0.9ms preprocess, 23.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 bird, 1 handbag, 23.1ms
Speed: 0.7ms preprocess, 23.1ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 bird, 1 backpack, 1 handbag, 23.2ms
Speed: 0.7ms preprocess, 23.2ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0

./datasets/Avenue Dataset/test__/19



0: 384x640 16 persons, 4 handbags, 23.7ms
Speed: 0.8ms preprocess, 23.7ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 4 handbags, 22.8ms
Speed: 0.7ms preprocess, 22.8ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 5 handbags, 23.8ms
Speed: 0.7ms preprocess, 23.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 4 handbags, 22.7ms
Speed: 0.6ms preprocess, 22.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 4 handbags, 23.7ms
Speed: 0.7ms preprocess, 23.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 3 handbags, 23.8ms
Speed: 0.8ms preprocess, 23.8ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 5 handbags, 22.8ms
Speed: 0.7ms preprocess, 22.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 perso

./datasets/Avenue Dataset/test__/20



0: 384x640 10 persons, 1 bird, 1 handbag, 26.8ms
Speed: 0.7ms preprocess, 26.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 bird, 1 handbag, 26.9ms
Speed: 0.8ms preprocess, 26.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 bird, 1 backpack, 26.9ms
Speed: 0.7ms preprocess, 26.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 bird, 1 handbag, 25.7ms
Speed: 0.8ms preprocess, 25.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 birds, 25.7ms
Speed: 0.7ms preprocess, 25.7ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 bird, 25.8ms
Speed: 0.8ms preprocess, 25.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 bird, 1 handbag, 26.3ms
Speed: 0.9ms preprocess, 26.3ms inference, 1.2ms postprocess per image at shape (1, 3, 3

./datasets/Avenue Dataset/test__/21



0: 384x640 9 persons, 2 birds, 1 handbag, 22.1ms
Speed: 0.9ms preprocess, 22.1ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 bird, 1 handbag, 1 baseball bat, 23.2ms
Speed: 0.7ms preprocess, 23.2ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 2 birds, 1 backpack, 1 handbag, 23.2ms
Speed: 0.8ms preprocess, 23.2ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 2 birds, 2 handbags, 23.1ms
Speed: 0.7ms preprocess, 23.1ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 2 birds, 1 handbag, 23.5ms
Speed: 0.7ms preprocess, 23.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 2 birds, 1 handbag, 23.4ms
Speed: 0.7ms preprocess, 23.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 bird, 1 handbag, 24.4ms
Speed: 0.7ms preprocess, 24.4ms inferenc

CPU times: user 12min 21s, sys: 1min 26s, total: 13min 47s
Wall time: 16min 38s


In [52]:
import pickle
# Save the object to a file
with open('obj_dect_avenue_yolov8', 'wb') as file:
    pickle.dump(obj_det_dict, file)