In [2]:
import cv2
import numpy as np
from ultralytics import YOLO


# defining function for creating a writer (for mp4 videos)
def create_video_writer(video_cap, output_filename):
    # grab the width, height, and fps of the frames in the video stream.
    frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(video_cap.get(cv2.CAP_PROP_FPS))
    # initialize the FourCC and a video writer object
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')
    writer = cv2.VideoWriter(output_filename, fourcc, fps,
                             (frame_width, frame_height))
    return writer

In [3]:
import cv2
import numpy as np
from ultralytics import YOLO


from collections import defaultdict


# object classes
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"
              ]

# Load the YOLOv8 model
model = YOLO('yolov9c.pt')

# Open the video file
video_path = "DJI_0015.MOV"
cap = cv2.VideoCapture(video_path)

# Store the track history
track_history = defaultdict(lambda: [])
# Store the class history
cls_history = defaultdict(lambda: [])

frame_number = 0

writer = create_video_writer(cap, f"{video_path}_annoted.mp4")

# Loop through the video frames
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()

    if success:
        frame_number += 1
        # Run YOLOv8 tracking on the frame, persisting tracks between frames
        # detect only classes=[2,5,7], car, bus, truck
        results = model.track(frame, persist=True, conf=0.002, classes=[0, 2, 5, 7])
        

        # Visualize the results on the frame
        annotated_frame = results[0].plot()
        # Get the boxes and track IDs
        
        if results[0].boxes.id is not None:
        
            boxes = results[0].boxes.xywh.cpu()
            clss = results[0].boxes.cls
            track_ids = results[0].boxes.id.int().cpu().tolist()


            # Plot the tracks
            for box, track_id, cls in zip(boxes, track_ids, clss):
                x, y, w, h = box
                track = track_history[track_id]
                ch = cls_history[track_id]
                track.append((float(x), float(y)))  # x, y center point
                ch.append((classNames[int(cls)], frame_number))
                # class name
                # print("Class name -->", classNames[int(cls)])

                # Draw the tracking lines
                points = np.hstack(track).astype(np.int32).reshape((-1, 1, 2))
                cv2.polylines(annotated_frame, [points], isClosed=False, color=(255, 255, 0), thickness=2)
        writer.write(annotated_frame)
        # Display the annotated frame
        cv2.imshow("YOLOv8 Tracking", cv2.resize(annotated_frame, (1920, 1080)))

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if the end of the video is reached
        break

# Release the video capture object and close the display window
cap.release()
writer.release()
cv2.destroyAllWindows()

OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'



0: 352x640 72 persons, 5 cars, 15 trucks, 1327.4ms
Speed: 6.1ms preprocess, 1327.4ms inference, 479.6ms postprocess per image at shape (1, 3, 352, 640)


qt.qpa.plugin: Could not find the Qt platform plugin "wayland" in "/home/arthur/.local/lib/python3.10/site-packages/cv2/qt/plugins"



0: 352x640 83 persons, 3 cars, 17 trucks, 1237.6ms
Speed: 21.4ms preprocess, 1237.6ms inference, 5.1ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 98 persons, 3 cars, 12 trucks, 350.2ms
Speed: 1.8ms preprocess, 350.2ms inference, 0.8ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 90 persons, 4 cars, 13 trucks, 337.1ms
Speed: 10.0ms preprocess, 337.1ms inference, 1.2ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 85 persons, 4 cars, 14 trucks, 310.5ms
Speed: 1.8ms preprocess, 310.5ms inference, 1.1ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 86 persons, 5 cars, 13 trucks, 283.2ms
Speed: 1.7ms preprocess, 283.2ms inference, 0.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 85 persons, 6 cars, 14 trucks, 298.3ms
Speed: 1.7ms preprocess, 298.3ms inference, 1.3ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 86 persons, 7 cars, 14 trucks, 333.5ms
Speed: 1.9ms preprocess, 333.5ms inference, 1.4ms p

KeyboardInterrupt: 

: 

In [4]:
import cv2
import numpy as np
from ultralytics import YOLO


from collections import defaultdict


# object classes
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"
              ]

# Load the YOLOv8 model
model = YOLO('yolov9c.pt')

# Open the video file
video_path = "DJI_0015.MOV"
cap = cv2.VideoCapture(video_path)

# Store the track history
track_history = defaultdict(lambda: [])
# Store the class history
cls_history = defaultdict(lambda: [])

frame_number = 0

writer = create_video_writer(cap, f"{video_path}_annoted.mp4")

# Loop through the video frames
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()

    if success:
        frame_number += 1
        # Run YOLOv8 tracking on the frame, persisting tracks between frames
        # detect only classes=[2,5,7], car, bus, truck
        results = model.predict(frame, conf=0.04, classes=[0, 2, 5, 7])
        

        # Visualize the results on the frame
        annotated_frame = results[0].plot()
        # Get the boxes and track IDs


        writer.write(annotated_frame)
        # Display the annotated frame
        cv2.imshow("YOLOv8 Tracking", cv2.resize(annotated_frame, (1920, 1080)))

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if the end of the video is reached
        break

# Release the video capture object and close the display window
cap.release()
writer.release()
cv2.destroyAllWindows()

OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'



0: 352x640 7 persons, 4 trucks, 373.3ms
Speed: 2.6ms preprocess, 373.3ms inference, 1.3ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 9 persons, 1 car, 4 trucks, 300.7ms
Speed: 2.1ms preprocess, 300.7ms inference, 1.3ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 16 persons, 4 trucks, 324.5ms
Speed: 2.8ms preprocess, 324.5ms inference, 0.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 14 persons, 1 car, 4 trucks, 299.5ms
Speed: 2.5ms preprocess, 299.5ms inference, 1.3ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 10 persons, 2 cars, 3 trucks, 328.7ms
Speed: 3.8ms preprocess, 328.7ms inference, 0.8ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 9 persons, 2 cars, 3 trucks, 401.0ms
Speed: 2.4ms preprocess, 401.0ms inference, 4.3ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 10 persons, 2 cars, 3 trucks, 365.5ms
Speed: 3.0ms preprocess, 365.5ms inference, 0.8ms postprocess per image at shape (1