In [1]:
from ultralytics import YOLO
import torch
import cv2
import numpy as np

# Load the YOLOv8 model (pre-trained on COCO dataset)
model = YOLO("yolov8x.pt")  # You can use yolov8s, yolov8m, or yolov8l for better accuracy
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define input and output video
video_path = "video.mp4"
output_path = "single_object_detection.avi"

# Set up video capture
cap = cv2.VideoCapture(video_path)
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define the output video writer
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))




In [2]:
def to_tensor(frame):
    # Convert the frame (BGR) to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Convert to tensor and normalize to [0, 1] range
    frame_tensor = torch.tensor(frame_rgb).permute(2, 0, 1).unsqueeze(0).float() / 255.0
    
    # Convert the tensor to uint8 (0-255 range)
    frame_tensor_uint8 = (frame_tensor * 255).clamp(0, 255).byte()
    
    return frame_tensor_uint8

def run_model(frame):
    CONFIDENCE_THRESHOLD_LIMIT = 0.3
    model.to(device)
    batch = [frame]
    with torch.no_grad():
        result = model(batch, device=device)[0]

    bboxes = np.array(result.boxes.xyxy.cpu(), dtype="int")
    classes = np.array(result.boxes.cls.cpu(), dtype="int")
    confidence = np.array(result.boxes.conf.cpu(), dtype="float")
    BOX_COLOUR = (37, 245, 75)
    for cls, bbox, conf in zip(classes, bboxes, confidence):
        (x, y, x2, y2) = bbox
        object_name = model.names[cls]
        if conf < CONFIDENCE_THRESHOLD_LIMIT:
            continue
        if object_name != "truck" and object_name != "car": 
            continue
        centroid_x = (x + x2) // 2
        centroid_y = (y + y2) // 2
        cv2.circle(frame, (centroid_x, centroid_y), 5, (0, 0, 255), -1)
        cv2.rectangle(frame, (x, y), (x2, y2), BOX_COLOUR, 2)

    out.write(frame)
    return


In [3]:

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    run_model(frame)
    if cv2.waitKey(1) == ord("q"):
        break
cap.release()
out.release()
cv2.destroyAllWindows()

print("Video processing complete. Output saved at:", output_path)


0: 640x384 1 car, 1 truck, 131.8ms
Speed: 5.4ms preprocess, 131.8ms inference, 61.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 car, 1 truck, 10.6ms
Speed: 1.2ms preprocess, 10.6ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 car, 1 truck, 9.2ms
Speed: 1.2ms preprocess, 9.2ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 car, 1 truck, 9.0ms
Speed: 1.1ms preprocess, 9.0ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 car, 1 truck, 9.0ms
Speed: 1.1ms preprocess, 9.0ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 car, 9.0ms
Speed: 1.1ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 car, 9.0ms
Speed: 1.2ms preprocess, 9.0ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 car, 1 truck, 9.1ms
Speed: 1.1ms preprocess, 9.1ms inference, 0.9ms postprocess per ima