<a href="https://colab.research.google.com/github/thegregbeyond/FreeFuse-AI-Calbright-Project/blob/main/Object_Detection_%26_Tracking_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **A) Combined YOLO + DeepSORT**



In [4]:
# A) Combine YOLO + DeepSORT
# === Install required packages (run once) ===
!pip install ultralytics deep-sort-realtime opencv-python pandas

from google.colab import drive
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import cv2
import os
import json
import pandas as pd
from pathlib import Path
import numpy as np

# === Parameters ===
# Input videos are read from VIDEO_FOLDER; annotated videos and the annotation
# CSV are written to OUTPUT_FOLDER. Both live on the mounted Google Drive.
VIDEO_FOLDER         = Path("/content/drive/MyDrive/FreeFuse_Project/Videos/Input")
OUTPUT_FOLDER        = Path("/content/drive/MyDrive/FreeFuse_Project/Videos/Output")
CONFIDENCE_THRESHOLD = 0.5   # detections scoring below this are discarded
DETECTION_INTERVAL   = 5     # analyze every Nth frame
# NOTE(review): the tracker is only updated on analyzed frames (every
# DETECTION_INTERVAL-th frame), so the two values below presumably count
# tracker updates rather than raw video frames -- confirm against the
# deep-sort-realtime documentation for max_age / n_init.
MAX_TRACK_AGE        = 30     # passed to DeepSort as max_age (how long a lost track is kept)
MIN_HITS             = 3      # passed to DeepSort as n_init (detections before confirming a track)

# drawing settings
MASK_COLOR           = (0, 255, 0)    # BGR mask outline color
MASK_THICKNESS       = 2              # mask polygon line thickness
TEXT_COLOR           = (255, 255, 255)# BGR text color
TEXT_FONT            = cv2.FONT_HERSHEY_SIMPLEX
TEXT_SCALE           = 0.6
TEXT_THICKNESS       = 2

# === 1) Mount Google Drive ===
# Must run before VIDEO_FOLDER / OUTPUT_FOLDER are accessed, since both paths
# are under /content/drive.
drive.mount('/content/drive')

# === 2) Load YOLOv8-nano segmentation & DeepSORT ===
model   = YOLO('yolov8n-seg')           # auto-downloads nano-segmentation weights
# A single tracker instance is created here at module level and used by the
# per-video processing loop below.
tracker = DeepSort(max_age=MAX_TRACK_AGE, n_init=MIN_HITS)

# utility to compute IoU between two boxes

def compute_iou(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Each box is a 4-item sequence ``(x1, y1, x2, y2)`` giving two opposite
    corners. The result is ``intersection / union``; when the union area is
    not positive the function returns ``0`` instead of dividing.
    """
    a_left, a_top, a_right, a_bottom = boxA
    b_left, b_top, b_right, b_bottom = boxB

    # Extent of the overlapping rectangle, clamped to zero when the boxes
    # do not overlap along an axis.
    overlap_w = max(0, min(a_right, b_right) - max(a_left, b_left))
    overlap_h = max(0, min(a_bottom, b_bottom) - max(a_top, b_top))
    overlap_area = overlap_w * overlap_h

    area_a = (a_right - a_left) * (a_bottom - a_top)
    area_b = (b_right - b_left) * (b_bottom - b_top)
    combined_area = area_a + area_b - overlap_area

    if combined_area > 0:
        return overlap_area / combined_area
    return 0

annotations = []

# Fixed column order of the annotation CSV. Supplying it to the DataFrame keeps
# the header row present even when no object was tracked in any video.
ANNOTATION_COLUMNS = [
    "video_filename", "frame_id", "track_id", "object_id", "timestamp_sec",
    "image_width_px", "image_height_px", "MID", "object_name",
    "object_category", "x_min", "y_min", "x_max", "y_max",
    "segmentation_mask", "confidence", "interaction_score",
]

# ensure the output folder (annotated videos + CSV) exists
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# sorted() makes the processing order, and therefore the CSV row order,
# reproducible between runs (os.listdir order is arbitrary).
for video_file in sorted(os.listdir(VIDEO_FOLDER)):
    if not video_file.lower().endswith(('.mp4','.mov','.avi')):
        continue

    cap    = cv2.VideoCapture(str(VIDEO_FOLDER/video_file))
    writer = None
    try:
        fps    = cap.get(cv2.CAP_PROP_FPS)
        width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # An unreadable file reports fps == 0, which would otherwise crash the
        # timestamp division below and produce an invalid VideoWriter.
        if not cap.isOpened() or not fps > 0:
            print(f"Skipping {video_file}: cannot open video or read its frame rate")
            continue

        frame_num  = 0
        video_name = Path(video_file).stem

        # The tracker is shared across videos; drop tracks left over from the
        # previous video so they cannot be matched against this one.
        tracker.delete_all_tracks()

        # prepare video writer
        output_path = OUTPUT_FOLDER / video_file
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        writer = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))
        if not writer.isOpened():
            print(f"Skipping {video_file}: cannot open output video {output_path}")
            continue

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Detection, tracking and drawing only happen on every
            # DETECTION_INTERVAL-th frame; all frames are written out.
            if frame_num % DETECTION_INTERVAL == 0:
                # NOTE(review): the timestamp is truncated to whole seconds, so
                # several analyzed frames within the same second share one
                # frame_id (and object_id). Kept as-is to preserve the CSV schema.
                timestamp_sec = int(frame_num / fps)
                frame_id      = f"{video_name}_{timestamp_sec:04d}"

                # YOLOv8 segmentation inference
                results = model(frame)[0]

                # masks can be None when the model returns no segmentation
                mask_polys = results.masks.xy if results.masks is not None else []

                dets_for_tracker = []
                det_meta = []
                for idx, (box, score, cls) in enumerate(zip(
                        results.boxes.xyxy, results.boxes.conf, results.boxes.cls)):
                    conf = float(score)
                    if conf < CONFIDENCE_THRESHOLD:
                        continue

                    x1,y1,x2,y2 = box.cpu().numpy().astype(int)
                    cls_id      = int(cls.cpu().numpy())
                    name        = model.names[cls_id]

                    # polygon in original image scale (masks.xy is already
                    # scaled); fall back to an empty polygon if it is missing
                    if idx < len(mask_polys):
                        poly = np.array(mask_polys[idx], dtype=np.int32).reshape(-1,2)
                    else:
                        poly = np.empty((0,2), dtype=np.int32)

                    # tracker input format: ([left, top, width, height], conf, class)
                    dets_for_tracker.append([[x1,y1,x2-x1,y2-y1], conf, name])
                    det_meta.append({
                        "bbox": (x1,y1,x2,y2),
                        "MID": f"/m/{cls_id:07d}",
                        "object_name": name,
                        "object_category": "unknown",
                        "mask_poly": poly.tolist(),
                        "confidence": conf,
                    })

                # update tracker (also called with no detections so that
                # existing tracks keep ageing)
                tracks = tracker.update_tracks(dets_for_tracker, frame=frame)

                if det_meta:
                    for trk in tracks:
                        if not trk.is_confirmed():
                            continue
                        tx1,ty1,tx2,ty2 = trk.to_tlbr()
                        track_id = trk.track_id

                        # match the track to the detection with the highest IoU
                        best_iou, best = max(
                            ((compute_iou((tx1,ty1,tx2,ty2), m["bbox"]), m) for m in det_meta),
                            key=lambda x: x[0]
                        )
                        if best_iou <= 0.3:
                            continue

                        # draw mask outline using original-scale polygon
                        pts = np.array(best["mask_poly"], np.int32)
                        if pts.size:
                            cv2.polylines(frame, [pts], isClosed=True, color=MASK_COLOR, thickness=MASK_THICKNESS)
                            # place label at first vertex; plain ints keep
                            # OpenCV's point parsing happy
                            label_pos = tuple(int(v) for v in pts[0])
                            cv2.putText(frame, best["object_name"], label_pos, TEXT_FONT,
                                        TEXT_SCALE, TEXT_COLOR, TEXT_THICKNESS, cv2.LINE_AA)

                        # record annotation
                        annotations.append({
                            "video_filename":    video_file,
                            "frame_id":          frame_id,
                            "track_id":          f"{video_name}_{track_id}",
                            "object_id":         f"{frame_id}_obj{track_id}",
                            "timestamp_sec":     timestamp_sec,
                            "image_width_px":    width,
                            "image_height_px":   height,
                            "MID":               best["MID"],
                            "object_name":       best["object_name"],
                            "object_category":   best["object_category"],
                            "x_min":             int(tx1),
                            "y_min":             int(ty1),
                            "x_max":             int(tx2),
                            "y_max":             int(ty2),
                            "segmentation_mask": json.dumps([best["mask_poly"]]),
                            "confidence":        best["confidence"],
                            "interaction_score": 0.0
                        })

            # write frame (with masks) to output
            writer.write(frame)
            frame_num += 1
    finally:
        # release handles even if inference or drawing raised mid-video
        cap.release()
        if writer is not None:
            writer.release()

# write CSV of annotations
out_csv = OUTPUT_FOLDER / "draft_annotations.csv"
pd.DataFrame(annotations, columns=ANNOTATION_COLUMNS).to_csv(out_csv, index=False)
print(f"Saved annotated video(s) to {OUTPUT_FOLDER}")
print(f"Saved annotations to {out_csv}")

Mounted at /content/drive
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n-seg.pt to 'yolov8n-seg.pt'...


100%|██████████| 6.74M/6.74M [00:00<00:00, 107MB/s]



0: 384x640 (no detections), 420.7ms
Speed: 11.7ms preprocess, 420.7ms inference, 9.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 252.3ms
Speed: 4.6ms preprocess, 252.3ms inference, 30.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 motorcycle, 186.9ms
Speed: 4.8ms preprocess, 186.9ms inference, 18.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 200.1ms
Speed: 4.6ms preprocess, 200.1ms inference, 7.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 191.5ms
Speed: 5.4ms preprocess, 191.5ms inference, 14.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 210.4ms
Speed: 4.2ms preprocess, 210.4ms inference, 12.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 263.5ms
Speed: 7.4ms preprocess, 263.5ms inference, 19.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 307.1ms
Speed: 4.3ms preprocess, 307.1ms inferenc