#### Мотивация

В ходе изучения YOLO object tracking было выяснено, что иногда на кадрах теряются боксы с затреканным покупателем. При обучении такой кадр не попадает в обучающую выборку. Чтобы решить эту проблему, можно алгоритмически дополнять кадры потерянными боксами, а именно: линейно интерполировать координаты между последним затреканным боксом и первым боксом после кадров без боксов (для одного пропущенного кадра это среднее координат).


##### Про conf:

Если при трекинге ставить его сильно низким, например 0.1, то сталкиваемся с новой проблемой: в качестве человека выделяются части полок с товарами в магазине или любые другие объекты вблизи покупателя. При обучении берём с кадра один бокс и по нему вырезаем изображение. Поэтому боксы с мусором только зашумят обучающую выборку.

#### Примеры
[Пример](https://drive.google.com/drive/u/0/folders/16RonDOowTW_iqU8cw37MXOckbxQxqb6F): видео, на котором модель YOLOv8 nano "потеряла" боксы на 387 кадрах; на большей части этих кадров явно есть покупатель и даже прослеживается кража.

In [3]:
import cv2
from ultralytics import YOLO
from PIL import Image
from tqdm import tqdm

In [9]:
# Example clip used throughout this notebook.
# NOTE(review): judging by the name, the first path is a copy of the clip with
# boxes already drawn; it is not used by the cells visible here -- confirm.
example_video_withBoxes_path = (
    '/home/anastasia/Desktop'
    '/MobileSSD_test/test_videos/Shoplifting7.mp4'
)
# Raw footage that the tracking functions below are run on.
example_video_raw = (
    '/home/anastasia/Desktop'
    '/project_dataset/Shoplifting7.mp4'
)

In [12]:
def process_video_with_tracking(model, input_video_path, output_video_path, output_missed_path):
    """Track objects through a video, draw their boxes and count untracked frames.

    Every frame is passed to ``model.track``. When the result carries track
    ids, each box is drawn on the frame together with its id. When it does
    not, the frame is counted as missed and appended to the "missed" video.
    All frames (annotated or not) are written to the main output video.

    Args:
        model: Object exposing ``track(frame, ...)`` with the keyword
            arguments used below (a ``YOLO`` instance elsewhere in this file).
        input_video_path: Path of the video to read.
        output_video_path: Path of the annotated output video.
        output_missed_path: Path of the video that receives only the frames
            for which no track ids were returned.

    Returns:
        Number of frames for which ``results[0].boxes.id`` was ``None``.

    Raises:
        Exception: If the input video cannot be opened.
    """
    box_color = (0, 255, 255)
    missed_boxes_counter = 0

    cap = cv2.VideoCapture(input_video_path)
    out = None
    out_missed = None
    try:
        if not cap.isOpened():
            raise Exception("Error: Could not open video file.")

        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes
        # the output play at a different speed than the input.
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_size = (
            int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
            int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
        )

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps, frame_size)
        out_missed = cv2.VideoWriter(output_missed_path, fourcc, fps, frame_size)

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # NOTE(review): classes=0 presumably restricts tracking to the
            # "person" class of the loaded weights -- confirm.
            results = model.track(
                frame,
                iou=0.4,
                conf=0.25,
                persist=True,
                imgsz=608,
                verbose=False,
                tracker="bytetrack.yaml",
                classes=0,
            )
            tracked = results[0].boxes

            if tracked.id is None:
                # No track ids on this frame: count it and store it in the
                # "missed" video. Writing only here avoids both the unbound
                # variable on a tracked first frame and duplicated stale
                # frames in the missed video.
                missed_boxes_counter += 1
                out_missed.write(frame)
            else:
                boxes = tracked.xyxy.cpu().numpy().astype(int)
                ids = tracked.id.cpu().numpy().astype(int)
                for box, track_id in zip(boxes, ids):
                    # Plain ints keep the OpenCV drawing calls independent of
                    # how a given build handles NumPy scalar types.
                    x_min, y_min, x_max, y_max = (int(v) for v in box)
                    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), box_color, 2)
                    cv2.putText(
                        frame,
                        f"Id {track_id}",
                        (x_min, y_min),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.70,
                        box_color,
                        2,
                    )

            out.write(frame)
    finally:
        # Release handles even when tracking or writing raises.
        cap.release()
        if out is not None:
            out.release()
        if out_missed is not None:
            out_missed.release()

    return missed_boxes_counter

In [13]:
# Load the yolov8n.pt weights and call fuse() on the model before tracking.
model = YOLO('yolov8n.pt')
model.fuse()

# Run the tracker over the raw example clip; the value returned is the number
# of frames for which no track ids were produced.
sum_missed = process_video_with_tracking(
    model,
    example_video_raw,
    "fill_missing_boxes/example.mp4",
    'fill_missing_boxes/missed.mp4',
)
sum_missed

YOLOv8n summary (fused): 168 layers, 3151904 parameters, 0 gradients, 8.7 GFLOPs


387

In [44]:
import cv2
import numpy as np
from PIL import Image

def _interpolate_boxes(gap_length, last_boxes, next_boxes):
    """Linearly interpolate boxes across a gap of ``gap_length`` frames.

    Args:
        gap_length: Number of consecutive frames without tracks (>= 1).
        last_boxes: Mapping ``track_id -> (x_min, y_min, x_max, y_max)`` from
            the last tracked frame before the gap.
        next_boxes: Same mapping from the first tracked frame after the gap.

    Returns:
        Mapping ``track_id -> int array of shape (gap_length, 4)`` holding one
        box per gap frame, only for ids present on both sides of the gap.
    """
    interpolated = {}
    for track_id, start_box in last_boxes.items():
        end_box = next_boxes.get(track_id)
        if end_box is None:
            # The track does not continue after the gap: nothing to bridge.
            continue
        # gap_length + 2 samples include both known boxes; drop the endpoints
        # so exactly one box per missing frame remains.
        interpolated[track_id] = np.linspace(
            start_box, end_box, gap_length + 2, dtype=int
        )[1:-1]
    return interpolated


def _draw_track_box(frame, box, track_id, color):
    """Draw ``box`` and an ``Id <track_id>`` label on ``frame`` in place."""
    # Plain ints keep the OpenCV drawing calls independent of how a given
    # build handles NumPy scalar types.
    x_min, y_min, x_max, y_max = (int(v) for v in box)
    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color, 2)
    cv2.putText(
        frame,
        f"Id {track_id}",
        (x_min, y_min),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.70,
        color,
        2,
    )


def process_video_with_tracking_filling(model, input_video_path, output_video_path):
    """Track objects through a video and fill gaps with interpolated boxes.

    Boxes returned by ``model.track`` are drawn with colour ``(0, 255, 255)``.
    Frames without any track are buffered until the next tracked frame; for
    every id present both before and after the gap, a linearly interpolated
    box is drawn on each buffered frame with colour ``(255, 0, 0)``. Frames
    are always written in their original order. Frames without tracks that
    occur before the first tracked frame, or after the last one, are written
    unchanged because there is nothing to interpolate between.

    Note: all frames of a gap are held in memory until the gap closes.

    Args:
        model: Object exposing ``track(frame, ...)`` with the keyword
            arguments used below (a ``YOLO`` instance elsewhere in this file).
        input_video_path: Path of the video to read.
        output_video_path: Path of the annotated output video.

    Returns:
        None.

    Raises:
        Exception: If the input video cannot be opened.
    """
    detected_color = (0, 255, 255)
    filled_color = (255, 0, 0)

    cap = cv2.VideoCapture(input_video_path)
    out = None
    try:
        if not cap.isOpened():
            raise Exception("Error: Could not open video file.")

        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes
        # the output play at a different speed than the input.
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_size = (
            int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
            int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
        )

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps, frame_size)

        previous_boxes = None  # boxes of the most recent tracked frame
        pending_frames = []    # frames without tracks since that frame

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # NOTE(review): classes=0 presumably restricts tracking to the
            # "person" class of the loaded weights -- confirm.
            results = model.track(
                frame,
                iou=0.4,
                conf=0.25,
                persist=True,
                imgsz=608,
                verbose=False,
                tracker="bytetrack.yaml",
                classes=0,
            )
            tracked = results[0].boxes

            frame_boxes = {}
            if tracked.id is not None:
                boxes = tracked.xyxy.cpu().numpy().astype(int)
                ids = tracked.id.cpu().numpy().astype(int)
                for box, track_id in zip(boxes, ids):
                    frame_boxes[track_id] = box
                    _draw_track_box(frame, box, track_id, detected_color)

            if not frame_boxes:
                if previous_boxes is None:
                    # Nothing tracked yet, so there is no start point for
                    # interpolation; emit immediately to preserve order.
                    out.write(frame)
                else:
                    pending_frames.append(frame)
                continue

            if pending_frames:
                # The gap is closed: bridge it using the boxes on both sides.
                filled = _interpolate_boxes(
                    len(pending_frames), previous_boxes, frame_boxes
                )
                for gap_idx, gap_frame in enumerate(pending_frames):
                    for track_id, gap_boxes in filled.items():
                        _draw_track_box(
                            gap_frame, gap_boxes[gap_idx], track_id, filled_color
                        )
                    out.write(gap_frame)
                pending_frames.clear()

            previous_boxes = frame_boxes
            out.write(frame)

        # A gap that runs to the end of the video has no closing boxes;
        # write its frames unchanged, in their original order.
        for gap_frame in pending_frames:
            out.write(gap_frame)
    finally:
        # Release handles even when tracking or writing raises.
        cap.release()
        if out is not None:
            out.release()

    return

In [43]:
# Produce the annotated copy of the raw example clip with gaps filled in.
process_video_with_tracking_filling(
    model,
    example_video_raw,
    "fill_missing_boxes/example_test_filling.mp4",
)

TO-DO: добавить паддинг

TO-DO: заполнять только если координаты соседних непропущенных боксов не сильно отличаются