In [1]:
import cv2
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator, colors

from collections import defaultdict

In [2]:
# Run instance segmentation + multi-object tracking over a video file, draw each
# tracked instance's mask outline and track id on the frame, show the frame in a
# window, and save the annotated frames to an MJPG-encoded AVI.

# Tunable inputs/outputs for this cell.
VIDEO_PATH = "../videos/home_2.avi"
OUTPUT_PATH = 'instance-segmentation-object-tracking.avi'
WINDOW_NAME = "instance-segmentation-object-tracking"
FALLBACK_FPS = 30  # used only when the capture backend reports no usable frame rate

# Not read anywhere in this cell; kept because other cells may rely on the name.
track_history = defaultdict(list)

model = YOLO("yolov8m-seg.pt")   # segmentation model

cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    # Fail loudly instead of falling through to the "completed" message below
    # and creating a 0x0 output file.
    cap.release()
    raise IOError(f"Could not open video source: {VIDEO_PATH}")

w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))
if fps <= 0:
    # cap.get() yields 0 for properties the backend cannot report; a writer
    # opened with a zero frame rate would not produce a playable file.
    fps = FALLBACK_FPS

out = cv2.VideoWriter(OUTPUT_PATH, cv2.VideoWriter_fourcc(*'MJPG'), fps, (w, h))

try:
    while True:
        ret, im0 = cap.read()
        if not ret:
            print("Video frame is empty or video processing has been successfully completed.")
            break

        # The annotator is built on im0 itself; the frame written and shown
        # below is im0, so the drawing is expected to happen in place.
        annotator = Annotator(im0, line_width=2)

        # persist=True is passed so track ids can carry over between successive calls.
        results = model.track(im0, persist=True)

        # Either attribute can be None on a frame (no ids / no masks); skip drawing then.
        if results[0].boxes.id is not None and results[0].masks is not None:
            masks = results[0].masks.xy
            track_ids = results[0].boxes.id.int().cpu().tolist()

            for mask, track_id in zip(masks, track_ids):
                if len(mask) == 0:
                    # No polygon points for this instance: nothing to outline.
                    continue
                annotator.seg_bbox(mask=mask,
                                   mask_color=colors(track_id, True),
                                   track_label=str(track_id))

        out.write(im0)
        cv2.imshow(WINDOW_NAME, im0)

        # Press 'q' in the preview window to stop early.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the writer/capture and close the window, even when the
    # loop is interrupted or a frame raises, so the output file is finalized.
    out.release()
    cap.release()
    cv2.destroyAllWindows()

Downloading https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8m-seg.pt to 'yolov8m-seg.pt'...


100%|██████████| 52.4M/52.4M [00:03<00:00, 17.7MB/s]



0: 384x640 1 person, 161.2ms
Speed: 2.1ms preprocess, 161.2ms inference, 74.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 15.1ms
Speed: 1.0ms preprocess, 15.1ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 5 chairs, 1 dining table, 13.4ms
Speed: 2.0ms preprocess, 13.4ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 6 chairs, 1 dining table, 13.1ms
Speed: 2.0ms preprocess, 13.1ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 6 chairs, 1 dining table, 14.7ms
Speed: 2.0ms preprocess, 14.7ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 6 chairs, 1 dining table, 13.6ms
Speed: 2.0ms preprocess, 13.6ms inference, 4.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 6 chairs, 1 dining table, 18.2ms
Speed: 2.0ms preprocess, 18.2ms inference, 3.0ms postprocess per image at sha