In [None]:
from ultralytics import YOLO
import torch
import cv2
from PIL import Image
import numpy as np
from shapely.geometry import Polygon, Point

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def train(weights="yolov8x.pt", data="./VisDrone.yaml", project='path/to/model',
          epochs=100, imgsz=640, save_period=10):
    """Train a YOLO model and return whatever ``model.train`` returns.

    Calling ``train()`` with no arguments runs the same configuration as
    before; every setting can now be overridden by keyword.

    Args:
        weights: Checkpoint passed to ``YOLO(...)``. Any other YOLOv8
            checkpoint (e.g. yolov8s, yolov8m, yolov8l) can be passed instead.
        data: Path to the dataset YAML file.
        project: Directory under which the training run is saved.
        epochs: Number of training epochs.
        imgsz: Training image size.
        save_period: Save a checkpoint every this many epochs.

    Returns:
        The object returned by ``model.train`` (previously it was discarded).
    """
    detector = YOLO(weights)
    # Uses the module-level `device` chosen at the top of the notebook.
    return detector.train(
        data=data,
        project=project,
        epochs=epochs,
        imgsz=imgsz,
        save=True,
        save_period=save_period,
        device=device,
    )
# train()
model = YOLO("path/to/model")  # NOTE: placeholder path; point this at the trained weights file
# Define input and output video
video_path = "multipleobjects.mp4"
output_path = "multipleobjectsdetection.avi"

# Set up video capture
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail here with a clear message rather than with a ZeroDivisionError below.
    raise IOError(f"Could not open input video: {video_path}")

# Keep the frame rate as reported (e.g. 29.97); truncating it to an int would
# change the playback speed of the output and the computed duration.
fps = cap.get(cv2.CAP_PROP_FPS)
if fps <= 0:
    cap.release()
    raise ValueError(f"Input video reports an invalid frame rate ({fps}): {video_path}")
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
T = frame_count / fps  # duration in seconds


# Define the output video writer
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
if not out.isOpened():
    # Without this check, frames written later would be dropped silently.
    cap.release()
    raise IOError(f"Could not open output video for writing: {output_path}")



In [5]:
### code for counting settings

# The frame is split by a horizontal line at 45% of its height: the part below
# the line is "outbound", the part above it is "inbound". Each entry pairs a
# region name with its polygon corners, listed as (tl, bl, br, tr).
counting_regions = [
    {
        "name": region_name,
        "polygon": Polygon(corner_points),
        "counts": 0,
        "dragging": False,
        "region_color": (37, 255, 225),  # BGR Value
        "text_color": (0, 0, 0),  # Region Text Color
    }
    for region_name, corner_points in (
        ("outbound", [(0, height*0.45), (0, height), (width, height), (width, height*0.45)]),
        ("inbound", [(0, 0), (0, height*0.45), (width, height*0.45), (width, 0)]),
    )
]

In [6]:
def to_tensor(frame):
    """Convert a BGR image array into a batched RGB uint8 tensor.

    Args:
        frame: Image array in BGR channel order, laid out as
            (height, width, channels).

    Returns:
        A ``torch.uint8`` tensor laid out as (1, channels, height, width)
        in RGB channel order, with values in the 0-255 range.
    """
    # Convert the frame (BGR) to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # HWC -> CHW, then add a leading batch axis.
    frame_tensor = torch.from_numpy(frame_rgb).permute(2, 0, 1).unsqueeze(0)

    # Go straight to uint8. Scaling to [0, 1] floats and back can at best
    # reproduce the same pixel values, so that round trip is skipped.
    return frame_tensor.clamp(0, 255).byte()

def run_model(frame):
    """Track objects in one frame, annotate the frame, and write it to the output video.

    Relies on the module-level ``model``, ``device``, ``counting_regions``,
    ``T`` and ``out``.

    Args:
        frame: Image array as returned by ``cap.read()``; it is drawn on in place.

    Returns:
        None. The annotated frame is appended to ``out``.
    """
    CONFIDENCE_THRESHOLD_LIMIT = 0.3
    BOX_COLOUR = (37, 245, 75)

    model.to(device)
    batch = [frame]
    with torch.no_grad():
        result = model.track(
            batch,
            persist=True,
            # tracker="custom_tracker.yaml",
            iou=0.5,
            agnostic_nms=True,
            device=device)[0]

    bboxes = np.array(result.boxes.xyxy.cpu(), dtype="int")
    classes = np.array(result.boxes.cls.cpu(), dtype="int")
    confidence = np.array(result.boxes.conf.cpu(), dtype="float")

    for cls, bbox, conf in zip(classes, bboxes, confidence):
        if conf < CONFIDENCE_THRESHOLD_LIMIT:
            continue
        # Plain Python ints keep the OpenCV drawing calls independent of how
        # NumPy scalar types are handled.
        x, y, x2, y2 = (int(v) for v in bbox)
        object_name = model.names[int(cls)]
        # if object_name != "truck" and object_name != "car":
        #     continue
        label = f"{object_name}"
        centroid_x = (x + x2) // 2
        centroid_y = (y + y2) // 2
        cv2.circle(frame, (centroid_x, centroid_y), 5, (0, 0, 255), -1)
        cv2.rectangle(frame, (x, y), (x2, y2), BOX_COLOUR, 2)
        cv2.putText(frame, label, (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.8, thickness = 2, color=(0, 0, 255))

        # A detection counts towards every region whose polygon contains its centroid.
        centroid = Point((centroid_x, centroid_y))
        for region in counting_regions:
            if region["polygon"].contains(centroid):
                region["counts"] += 1

    for region in counting_regions:
        polygon_coords = np.array(region["polygon"].exterior.coords, dtype=np.int32)
        # NOTE(review): "counts" only holds this frame's detections (it is reset
        # below), so the figure shown is the per-frame count divided by the
        # whole video duration T. Confirm this is the intended metric.
        # T can be non-positive when the container reports no frame count.
        rate = region["counts"] / T if T > 0 else 0.0
        label = f"{region['name']}: {rate:.2f} vehicles/second"
        x, y = (int(v) for v in polygon_coords[1])
        # Outline each region in its own colour (previously the first region's
        # colour was used for all of them).
        cv2.polylines(frame, [polygon_coords], isClosed=True, color=region["region_color"], thickness=5)
        # NOTE(review): region["text_color"] is defined but the label uses a
        # fixed colour; confirm which one is intended.
        cv2.putText(frame, label, (x + 15, y - 15), cv2.FONT_HERSHEY_SIMPLEX, 0.8, thickness = 2, color=(255, 0, 0))
        region["counts"] = 0
    # pil_image = Image.fromarray(frame)
    # pil_image.show()
    out.write(frame)


In [7]:
# Read the input video frame by frame, annotate each frame with run_model,
# and stop at the end of the stream or when "q" is pressed.
cnt = 0  # number of frames processed so far
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        run_model(frame)
        if cv2.waitKey(1) == ord("q"):
            break
        cnt += 1
        # if cnt == 10:
        #     break
finally:
    # Always release the capture and writer, even if a frame fails, so the
    # output file is finalised and the handles are not leaked in the kernel.
    cap.release()
    out.release()
    cv2.destroyAllWindows()

print("Video processing complete. Output saved at:", output_path)




0: 384x640 9 cars, 1 van, 2 trucks, 9.2ms
Speed: 1.3ms preprocess, 9.2ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 2 trucks, 9.1ms
Speed: 1.1ms preprocess, 9.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 1 van, 2 trucks, 9.1ms
Speed: 1.2ms preprocess, 9.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 1 van, 2 trucks, 9.1ms
Speed: 1.1ms preprocess, 9.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 1 van, 2 trucks, 9.1ms
Speed: 1.2ms preprocess, 9.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 1 van, 2 trucks, 9.1ms
Speed: 1.1ms preprocess, 9.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 cars, 1 van, 2 trucks, 9.1ms
Speed: 1.2ms preprocess, 9.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 cars, 2 trucks, 9.1m