In [6]:
from ultralytics import YOLO
import torch
import cv2
import numpy as np
from filterpy.kalman import KalmanFilter
from PIL import Image

# Load the YOLOv8 model (pre-trained on COCO dataset)
model = YOLO("yolov8x.pt")  # You can use yolov8s, yolov8m, or yolov8l for better accuracy
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define input and output video
video_path = "video.mp4"
output_path = "output_kaman_filter.avi"

# Set up video capture
cap = cv2.VideoCapture(video_path)
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define the output video writer
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))




In [7]:
def KalManInitialize():
    kf = KalmanFilter(dim_x=4, dim_z=2)
    # kf.x = np.array([0, 0, 0, 0])
    dt = 1 / 30  
    kf.F = np.array([[1, 0, dt, 0],
                    [0, 1, 0, dt],
                    [0, 0, 1, 0],
                    [0, 0, 0, 1]])
    kf.H = np.array([[1, 0, 0, 0],
                    [0, 1, 0, 0]])
    np.eye(2, dtype=np.float32) * 0.5
    kf.Q = np.eye(4, dtype=np.float32) * 0.03 
    return kf

In [8]:
def to_tensor(frame):
    # Convert the frame (BGR) to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Convert to tensor and normalize to [0, 1] range
    frame_tensor = torch.tensor(frame_rgb).permute(2, 0, 1).unsqueeze(0).float() / 255.0
    
    # Convert the tensor to uint8 (0-255 range)
    frame_tensor_uint8 = (frame_tensor * 255).clamp(0, 255).byte()
    
    return frame_tensor_uint8

def run_model(frame) -> tuple:
    CONFIDENCE_THRESHOLD_LIMIT = 0.3
    model.to(device)
    batch = [frame]
    centroid_x = None
    centroid_y = None
    with torch.no_grad():
        result = model(batch, device=device)[0]

    bboxes = np.array(result.boxes.xyxy.cpu(), dtype="int")
    classes = np.array(result.boxes.cls.cpu(), dtype="int")
    confidence = np.array(result.boxes.conf.cpu(), dtype="float")
    BOX_COLOUR = (37, 245, 75)
    for cls, bbox, conf in zip(classes, bboxes, confidence):
        (x, y, x2, y2) = bbox
        object_name = model.names[cls]
        if conf < CONFIDENCE_THRESHOLD_LIMIT:
            continue
        if object_name != "truck" and object_name != "car": 
            continue
        centroid_x = (x + x2) // 2
        centroid_y = (y + y2) // 2
        cv2.circle(frame, (centroid_x, centroid_y), 5, BOX_COLOUR, -1)
        cv2.rectangle(frame, (x, y), (x2, y2), BOX_COLOUR, 2)

    return centroid_x, centroid_y 

In [10]:
kf = KalManInitialize()

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    if kf.x is not None:
        predicted = kf.predict()
        predicted_x = int(kf.x[0])
        predicted_y = int(kf.x[1])
        predicted_dx, predicted_dy = kf.x[2:]  # Predicted velocity
        print(predicted_x, predicted_y)
        print(f"Predicted velocity: (dx: {predicted_dx}, dy: {predicted_dy})")

    updated_frame = run_model(frame)
    
    
    if updated_frame != (None, None):
        measured_x, measured_y = updated_frame
        kf.update(np.array([[np.float32(measured_x)], [np.float32(measured_y)]]))
        cv2.circle(frame, (measured_x, measured_y), 6, (0, 255, 0), 2)
    cv2.circle(frame, (predicted_x, predicted_y), 8, (0, 0, 255), 2) 
    out.write(frame)
    if cv2.waitKey(1) == ord("q"):
        break
cap.release()
out.release()
cv2.destroyAllWindows()

print("Video processing complete. Output saved at:", output_path)

  predicted_x = int(kf.x[0])
  predicted_y = int(kf.x[1])


0 0
Predicted velocity: (dx: [          0], dy: [          0])

0: 640x384 1 car, 1 truck, 9.2ms
Speed: 2.3ms preprocess, 9.2ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)
186 411
Predicted velocity: (dx: [     6.0066], dy: [     13.293])

0: 640x384 1 car, 1 truck, 9.0ms
Speed: 1.3ms preprocess, 9.0ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)
249 551
Predicted velocity: (dx: [     11.936], dy: [     26.383])

0: 640x384 1 car, 1 truck, 8.9ms
Speed: 1.2ms preprocess, 8.9ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)
282 625
Predicted velocity: (dx: [     17.676], dy: [     39.203])

0: 640x384 1 car, 1 truck, 8.9ms
Speed: 1.2ms preprocess, 8.9ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)
303 670
Predicted velocity: (dx: [     23.184], dy: [     51.228])

0: 640x384 1 car, 8.9ms
Speed: 1.2ms preprocess, 8.9ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)
317 703
Predicted velocity: 