<a href="https://colab.research.google.com/github/vnot01/object-detection-using-yolo-and-sam2/blob/main/YOLO%2BSAM2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the later cells read and write paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/AI_Class/PROJECT_PUB
!git clone https://github.com/facebookresearch/sam2.git
%cd sam2
!pip install -e .

%cd /content/drive/MyDrive/PROJECT_PUB/sam2/checkpoints
!chmod +x ./download_ckpts.sh

!./download_ckpts.sh
%cd ..
%cd /content/drive/MyDrive/PROJECT_PUB/sam2

In [None]:
import torch
from sam2.build_sam import build_sam2_video_predictor

# The checkpoint path is relative, so it depends on the current working directory
# (the setup cell finishes with a %cd into the sam2 repository root).
checkpoint = "./checkpoints/sam2.1_hiera_tiny.pt"
# NOTE(review): presumably resolved by sam2's own config loader rather than the cwd - confirm.
model_cfg = "configs/sam2.1/sam2.1_hiera_t.yaml"
# Video predictor shared by every prompting / propagation cell below.
predictor = build_sam2_video_predictor(model_cfg, checkpoint)

In [None]:
def show_mask(mask, ax, obj_id=None, random_color=False):
    """Draw ``mask`` on ``ax`` as a semi-transparent (alpha 0.6) colour overlay.

    Args:
        mask: Array whose last two dimensions are (height, width); it is reshaped
            to (height, width, 1) before being multiplied by the colour.
        ax: Object exposing ``imshow`` (e.g. a matplotlib Axes) that receives the overlay.
        obj_id: Index into the "tab10" colormap; ``None`` selects index 0.
            Ignored when ``random_color`` is true.
        random_color: When true, use a random RGB colour instead of the colormap.
    """
    if not random_color:
        palette = plt.get_cmap("tab10")
        palette_idx = obj_id if obj_id is not None else 0
        rgba = np.array([*palette(palette_idx)[:3], 0.6])
    else:
        rgba = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)

    height, width = mask.shape[-2:]
    # Broadcasting (H, W, 1) * (1, 1, 4) yields an RGBA image that is zero outside the mask.
    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
    ax.imshow(overlay)


def show_points(coords, labels, ax, marker_size=200):
    """Scatter prompt points on ``ax``: label 1 as green stars, label 0 as red stars.

    Args:
        coords: Array indexed as ``coords[:, 0]`` (x) and ``coords[:, 1]`` (y).
        labels: Array aligned with ``coords``; rows where it equals 1 / 0 are selected.
        ax: Object exposing ``scatter`` (e.g. a matplotlib Axes).
        marker_size: Value passed as the scatter ``s`` argument.
    """
    positives, negatives = coords[labels == 1], coords[labels == 0]
    star_style = dict(marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    for pts, colour in ((positives, 'green'), (negatives, 'red')):
        ax.scatter(pts[:, 0], pts[:, 1], color=colour, **star_style)


def show_box(box, ax):
    """Outline ``box`` on ``ax`` with a green, unfilled rectangle.

    Args:
        box: Indexable as ``(x0, y0, x1, y1)``; width and height are derived
            as ``x1 - x0`` and ``y1 - y0``.
        ax: Object exposing ``add_patch`` (e.g. a matplotlib Axes).
    """
    left, top = box[0], box[1]
    outline = plt.Rectangle(
        (left, top),
        box[2] - box[0],
        box[3] - box[1],
        edgecolor='green',
        facecolor=(0, 0, 0, 0),  # fully transparent fill
        lw=2,
    )
    ax.add_patch(outline)

In [None]:
import os
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
# Enter a bfloat16 CUDA autocast context and never exit it, so it stays active for the
# remaining cells of this kernel session.
torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
# turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
# Device 0 is queried directly, so this cell requires a CUDA runtime.
if torch.cuda.get_device_properties(0).major >= 8:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

In [None]:
#%cd /content/drive/MyDrive/PROJECT_PUB/sam2/custom_dataset/Video
from supervision.assets import download_assets, VideoAssets
import supervision as sv
# Fetch the sample basketball clip bundled with supervision.
# Note: the frame-extraction cell below reassigns SOURCE_VIDEO to a video stored on Drive.
SOURCE_VIDEO = download_assets(VideoAssets.BASKETBALL)
#SOURCE_VIDEO = '/content/drive/MyDrive/PROJECT_PUB/sam2/custom_dataset/Video/video70.mp4'
# Last expression of the cell: displays the video's metadata.
sv.VideoInfo.from_video_path(SOURCE_VIDEO)

**Generate the frame images from the given video (only needed the first time)**

In [None]:
# Frame-extraction settings.
SCALE_FACTOR = 0.5   # every extracted frame is resized by this factor
START_IDX = 0        # first video frame to extract
END_IDX = 476        # frame index at which extraction stops

from pathlib import Path

# Directory that receives the extracted frames (created if missing).
SOURCE_FRAMES = Path('/content/drive/MyDrive/PROJECT_PUB/sam2/custom_dataset/images1')
SOURCE_FRAMES.mkdir(parents=True, exist_ok=True)
SOURCE_VIDEO ='/content/drive/MyDrive/PROJECT_PUB/sam2/custom_dataset/Video/video70.mp4'

# supervision names these keyword arguments `start` / `end`; `start_idx` / `end_idx`
# are not accepted and raise a TypeError.
frames_generator = sv.get_video_frames_generator(SOURCE_VIDEO, start=START_IDX, end=END_IDX)

# overwrite=True clears any frames left in the target directory by a previous run.
# Zero-padded names keep lexicographic order equal to frame order.
images_sink = sv.ImageSink(
    target_dir_path=SOURCE_FRAMES.as_posix(),
    overwrite=True,
    image_name_pattern="{:05d}.jpeg"
)

with images_sink:
    for frame in frames_generator:
        frame = sv.scale_image(frame, SCALE_FACTOR)
        images_sink.save_image(frame)

# Output video name and the sorted list of frame files used by the propagation cells.
TARGET_VIDEO = f"{Path(SOURCE_VIDEO).stem}-result.mp4"
SOURCE_FRAME_PATHS = sorted(sv.list_files_with_extensions(SOURCE_FRAMES.as_posix(), extensions=["jpeg"]))

If the images have already been generated, we can use the following code instead.

In [None]:
from pathlib import Path

# This cell replaces the frame-extraction cell when the frames already exist on Drive,
# so it must define every name the later cells read - including SCALE_FACTOR, which the
# propagation cells use to size the output video. Keep it equal to the factor that was
# used when the frames were extracted.
SCALE_FACTOR = 0.5
SOURCE_FRAMES = Path('/content/drive/MyDrive/PROJECT_PUB/sam2/custom_dataset/images')
#SOURCE_FRAMES.mkdir(parents=True, exist_ok=True)
SOURCE_VIDEO ='/content/drive/MyDrive/PROJECT_PUB/sam2/custom_dataset/Video/basketball-1.mp4'
# Only the stem of the path is used, so this evaluates to "masked_video-masked.mp4"
# relative to the current working directory.
# NOTE(review): if the video is meant to be written inside that Drive folder, build the full path instead - confirm intent.
TARGET_VIDEO = f"{Path('/content/drive/MyDrive/PROJECT_PUB/sam2/custom_dataset/masked_video').stem}-masked.mp4"
# Sorted so that list position matches the frame index encoded in the file name.
SOURCE_FRAME_PATHS = sorted(sv.list_files_with_extensions(SOURCE_FRAMES.as_posix(), extensions=["jpeg"]))

In [None]:
# The variable VIDEO_FRAMES_DIRECTORY_PATH should be a string.
# Derive it from SOURCE_FRAMES (set by whichever frame cell was run above) instead of
# hard-coding a folder: the propagation cells look frames up through SOURCE_FRAME_PATHS,
# so the inference state must be built from that same directory or the masks are drawn
# on the wrong images.
VIDEO_FRAMES_DIRECTORY_PATH = Path(SOURCE_FRAMES).as_posix()
inference_state = predictor.init_state(VIDEO_FRAMES_DIRECTORY_PATH)

In [None]:
# Reset the predictor's per-video state before adding prompts.
# NOTE(review): presumably clears previously added prompts and tracking results - confirm against the SAM2 API.
predictor.reset_state(inference_state)

**Now it is time to prompt on the images. Let's look at an easy way to create the prompts with a UI widget.**

In [None]:
import cv2
import torch
import base64

import numpy as np
import supervision as sv

from pathlib import Path
from supervision.assets import download_assets, VideoAssets
from sam2.build_sam import build_sam2_video_predictor

# Set to False when running outside Google Colab; the google.colab import below is
# only attempted when this flag is true.
IS_COLAB = True

if IS_COLAB:
    # Colab needs its custom widget manager enabled before third-party widgets are used.
    from google.colab import output
    output.enable_custom_widget_manager()

from jupyter_bbox_widget import BBoxWidget

In [None]:
def encode_image(filepath):
    """Return the image stored at ``filepath`` as a base64 ``data:`` URI.

    Args:
        filepath: Path (``str`` or path-like) of the image file to embed.

    Returns:
        str: ``"data:<mime-type>;base64,<payload>"``. The MIME type is guessed from
        the file extension and falls back to ``image/jpeg`` when it cannot be
        determined or is not an image type.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Local import keeps this cell self-contained.
    import mimetypes

    # "image/jpg" is not a registered MIME type (the JPEG type is "image/jpeg"), and a
    # hard-coded type is wrong for PNG frames, so derive it from the extension.
    mime_type, _ = mimetypes.guess_type(str(filepath))
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    with open(filepath, 'rb') as f:
        image_bytes = f.read()
    encoded = base64.b64encode(image_bytes).decode('utf-8')
    return f"data:{mime_type};base64,{encoded}"

**NOTE:** SAM2 allows tracking multiple objects at once. Update the `OBJECTS` list if you want to change the list of tracked objects.

In [None]:
# Labels offered by the annotation widget. The prompting cell enumerates this list
# starting at 1 to derive the object id passed to SAM2 for each label.
OBJECTS = ['ball', 'player-1', 'player-2']


**NOTE:** Let's choose the index of the reference frame that we will use to annotate the objects we are looking for.

In [None]:
# Index of the reference frame to annotate; also passed to SAM2 as `frame_idx`.
FRAME_IDX = 100
# Reuse SOURCE_FRAMES from the frame cell above rather than overwriting it with a
# hard-coded string: the annotated frame must come from the same directory the
# inference state and SOURCE_FRAME_PATHS were built from, otherwise the clicked
# coordinates refer to a different image than the one SAM2 segments.
FRAME_PATH = Path(SOURCE_FRAMES) / f"{FRAME_IDX:05d}.jpeg"

widget = BBoxWidget(classes=OBJECTS)
widget.image = encode_image(FRAME_PATH)
# Last expression of the cell: renders the interactive widget.
widget

In [None]:
# Display the annotations currently drawn in the widget (read again by the prompting cell below).
widget.bboxes

**NOTE:** The widget we are using stores annotations in a format that is inconsistent with SAM2's requirements. We parse them and then pass them to SAM2 via the `add_new_points` method. Each of the objects we track must be passed via a separate `add_new_points` call. It is important to specify `frame_idx` each time - the index of the frame to which the annotations relate, and `obj_id` - the ID of the object to which the annotations relate.

In [None]:
# Fallback point prompts (zero-sized boxes) used when nothing was drawn in the widget.
default_box = [
    {'x': 705, 'y': 302, 'width': 0, 'height': 0, 'label': 'ball'},
    {'x': 587, 'y': 300, 'width': 0, 'height': 0, 'label': 'player-1'},
    {'x': 753, 'y': 267, 'width': 0, 'height': 0, 'label': 'player-2'}
]

# Select the prompt source once. The loop below must filter THIS list; filtering
# widget.bboxes directly would silently discard the fallback and add no prompts at all.
prompt_boxes = widget.bboxes if widget.bboxes else default_box

# One SAM2 call per tracked object; object ids start at 1 and follow the order of OBJECTS.
for object_id, label in enumerate(OBJECTS, start=1):
    boxes = [box for box in prompt_boxes if box['label'] == label]

    # No annotation for this label: nothing to send.
    if len(boxes) == 0:
        continue

    # Each annotation contributes its (x, y) position as one point prompt.
    points = np.array([[box['x'], box['y']] for box in boxes], dtype=np.float32)
    # Label 1 marks every point as a positive (foreground) click.
    labels = np.ones(len(points), dtype=np.int32)

    _, object_ids, mask_logits = predictor.add_new_points(
        inference_state=inference_state,
        frame_idx=FRAME_IDX,
        obj_id=object_id,
        points=points,
        labels=labels,
    )

### Video inference

**NOTE:** To apply our point prompts to all video frames, we use the `propagate_in_video` generator. Each call returns `frame_idx` - the index of the current frame, `object_ids` - IDs of objects detected in the frame, and `mask_logits` - corresponding `object_ids` logit values, which we can convert to masks using thresholding.

*Here SAM2 goes through the given video and saves the masked frames to the specified output.*

In [None]:
# Output video metadata: same fps as the source, resolution scaled like the extracted frames.
video_info = sv.VideoInfo.from_video_path(str(SOURCE_VIDEO))
video_info.width = int(video_info.width * SCALE_FACTOR)
video_info.height = int(video_info.height * SCALE_FACTOR)
# Not used in this cell; kept because it is part of the notebook's shared state.
annotated_frames_dir = '/content/drive/MyDrive/AI_Class/PROJECT_PUB/sam2/custom_dataset/Masked_images'
COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700']
# Colour is looked up by class_id, which is set to the SAM2 object id below.
mask_annotator = sv.MaskAnnotator(
    color=sv.ColorPalette.from_hex(COLORS),
    color_lookup=sv.ColorLookup.CLASS)

frame_sample = []   # annotated frames kept in memory for the preview grid
frame_paths = []    # source frame path of every processed frame
with sv.VideoSink(Path(TARGET_VIDEO).as_posix(), video_info=video_info) as sink:
    for frame_idx, object_ids, mask_logits in predictor.propagate_in_video(inference_state):
        frame_path = SOURCE_FRAME_PATHS[frame_idx]
        frame_paths.append(frame_path)
        # SOURCE_FRAME_PATHS holds Path objects; str() keeps cv2.imread working on
        # OpenCV builds that only accept string file names.
        frame = cv2.imread(str(frame_path))
        # Logits > 0 are foreground.
        masks = (mask_logits > 0.0).cpu().numpy()
        # The logits come as (num_objects, 1, H, W). Drop only the channel axis: a bare
        # np.squeeze would also drop the object axis when a single object is tracked,
        # leaving an (H, W) array that the calls below cannot treat as a stack of masks.
        masks = np.squeeze(masks, axis=1).astype(bool)

        detections = sv.Detections(
            xyxy=sv.mask_to_xyxy(masks=masks),
            mask=masks,
            class_id=np.array(object_ids)
        )

        annotated_frame = mask_annotator.annotate(scene=frame.copy(), detections=detections)

        sink.write_frame(annotated_frame)
        # Every annotated frame is kept; to keep roughly one per second instead, append
        # only when `frame_idx % video_info.fps == 0`.
        frame_sample.append(annotated_frame)

Now it is time to integrate the YOLO model with the SAM2 model. Here we get the bounding boxes of the objects from YOLO for each image, and SAM2 then tries to segment the objects based on the given boxes.

In [None]:
import cv2
import os
from ultralytics import YOLO

def extract_detection_info(frame_folder, model_path="yolov8s.pt"):
    """
    Extract detection information using YOLO model without visualization

    Args:
        frame_folder (str): Path to folder containing image frames
        model_path (str): Path to YOLO model weights

    Returns:
        list: One dictionary per detected box, in frame-name order, with keys
            "frame_id" (file name without extension), "object_id" (the YOLO class
            id), "bbox" ([x1, y1, x2, y2] as a list of floats) and "confidence".
    """
    # Initialize YOLO model
    yolo = YOLO(model_path)

    all_detections = []  # Store all detections

    # os.listdir() returns entries in arbitrary order; sort so that detections are
    # produced in frame order on every run.
    for frame_file in sorted(os.listdir(frame_folder)):
        # Skip anything that is not an image file
        if not frame_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        frame_path = os.path.join(frame_folder, frame_file)
        frame_id = os.path.splitext(frame_file)[0]

        # cv2.imread returns None instead of raising when the file cannot be decoded
        frame = cv2.imread(frame_path)
        if frame is None:
            print(f"Could not load frame: {frame_path}")
            continue

        # Perform detection
        results = yolo(frame)

        for result in results:
            for box in result.boxes:
                # Box coordinates in (x1, y1, x2, y2) format
                bbox = box.xyxy[0].cpu().numpy()

                all_detections.append({
                    "frame_id": frame_id,
                    # The class id is reused as the object id, so two objects of the
                    # same class in one frame share an id.
                    "object_id": int(box.cls[0].item()),
                    "bbox": bbox.tolist(),  # plain list for easier handling
                    "confidence": float(box.conf[0].item()),
                })

    return all_detections

# Usage example
if __name__ == "__main__":
    # Detect on the same frames the SAM2 inference state refers to: `frame_idx` below is
    # derived from the file name, so both must index the same image sequence.
    frame_folder = Path(SOURCE_FRAMES).as_posix()
    detections = extract_detection_info(frame_folder, "yolov10s.pt")

    # Start from a clean state so the earlier prompts and their tracking results do
    # not leak into this experiment.
    predictor.reset_state(inference_state)

    # Print all detections
    print(f"\nTotal number of detections across all frames: {len(detections)}")
    for detection in detections:
        print("\nDetection:")
        print(detection)

        # detection['bbox'] is already [x1, y1, x2, y2], which is the format passed as
        # `box`; no xywh -> xyxy conversion is needed.
        _, object_ids, mask_logits = predictor.add_new_points_or_box(
            inference_state=inference_state,
            # Frame files are named by their zero-padded index (e.g. "00012")
            frame_idx=int(detection['frame_id']),
            obj_id=detection['object_id'],
            box=detection['bbox'],
        )



The problem with this method is that when a single image contains two different objects of the same class, both get the same object id (the class id), and SAM2 will mask only one of them. We have different bounding boxes but the same object id, which is a problem for SAM2 because it needs a unique object id to track and mask each object.

In [None]:
import cv2
import os
from ultralytics import YOLO

def extract_detection_info(frame_folder, model_path="yolov8s.pt"):
    """
    Extract detection information using YOLO model without visualization

    Note: this cell repeats the definition from the previous YOLO cell.

    Args:
        frame_folder (str): Path to folder containing image frames
        model_path (str): Path to YOLO model weights

    Returns:
        list: One dictionary per detected box, in frame-name order, with keys
            "frame_id" (file name without extension), "object_id" (the YOLO class
            id), "bbox" ([x1, y1, x2, y2] as a list of floats) and "confidence".
    """
    # Initialize YOLO model
    yolo = YOLO(model_path)

    all_detections = []  # Store all detections

    # os.listdir() returns entries in arbitrary order; sort so that detections are
    # produced in frame order on every run.
    for frame_file in sorted(os.listdir(frame_folder)):
        # Skip anything that is not an image file
        if not frame_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        frame_path = os.path.join(frame_folder, frame_file)
        frame_id = os.path.splitext(frame_file)[0]

        # cv2.imread returns None instead of raising when the file cannot be decoded
        frame = cv2.imread(frame_path)
        if frame is None:
            print(f"Could not load frame: {frame_path}")
            continue

        # Perform detection
        results = yolo(frame)

        for result in results:
            for box in result.boxes:
                # Box coordinates in (x1, y1, x2, y2) format
                bbox = box.xyxy[0].cpu().numpy()

                all_detections.append({
                    "frame_id": frame_id,
                    # The class id is reused as the object id, so two objects of the
                    # same class in one frame share an id.
                    "object_id": int(box.cls[0].item()),
                    "bbox": bbox.tolist(),  # plain list for easier handling
                    "confidence": float(box.conf[0].item()),
                })

    return all_detections

# Usage example
if __name__ == "__main__":
    # Detect on the same frames the SAM2 inference state refers to: `frame_idx` below is
    # derived from the file name, so both must index the same image sequence.
    frame_folder = Path(SOURCE_FRAMES).as_posix()
    detections = extract_detection_info(frame_folder, "yolov10s.pt")

    # Start from a clean state so the earlier prompts and their tracking results do
    # not leak into this experiment.
    predictor.reset_state(inference_state)

    # Print all detections
    print(f"\nTotal number of detections across all frames: {len(detections)}")
    for detection in detections:
        print("\nDetection:")
        print(detection)

        # detection['bbox'] is already [x1, y1, x2, y2], which is the format passed as
        # `box`; no xywh -> xyxy conversion is needed.
        _, object_ids, mask_logits = predictor.add_new_points_or_box(
            inference_state=inference_state,
            # Frame files are named by their zero-padded index (e.g. "00012")
            frame_idx=int(detection['frame_id']),
            obj_id=detection['object_id'],
            box=detection['bbox'],
        )



In [None]:
# Inputs / outputs for the box-prompt experiment.
SCALE_FACTOR = 0.5
SOURCE_VIDEO = Path('/content/drive/MyDrive/PROJECT_PUB/sam2/custom_dataset/Video/basketball-1.mp4')
TARGET_VIDEO = Path(f"{Path(SOURCE_VIDEO).stem}-boxInput.mp4")
SOURCE_FRAMES = Path('/content/drive/MyDrive/PROJECT_PUB/sam2/custom_dataset/images')
# Sorted so that list position matches the frame index encoded in the file name.
SOURCE_FRAME_PATHS = sorted(sv.list_files_with_extensions(SOURCE_FRAMES.as_posix(), extensions=["jpeg"]))

import cv2
# Pass a plain string: the video path ends up in OpenCV, and not every OpenCV build
# accepts pathlib.Path objects.
video_info = sv.VideoInfo.from_video_path(SOURCE_VIDEO.as_posix())
video_info.width = int(video_info.width * SCALE_FACTOR)
video_info.height = int(video_info.height * SCALE_FACTOR)

COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700']
# Colour is looked up by class_id, which is set to the SAM2 object id below.
mask_annotator = sv.MaskAnnotator(
    color=sv.ColorPalette.from_hex(COLORS),
    color_lookup=sv.ColorLookup.CLASS)

frame_sample = []   # roughly one annotated frame per second, for the preview grid

with sv.VideoSink(TARGET_VIDEO.as_posix(), video_info=video_info) as sink:
    for frame_idx, object_ids, mask_logits in predictor.propagate_in_video(inference_state):
        frame_path = SOURCE_FRAME_PATHS[frame_idx]
        # Same reason as above: hand OpenCV a string, not a Path.
        frame = cv2.imread(str(frame_path))
        # Logits > 0 are foreground.
        masks = (mask_logits > 0.0).cpu().numpy()
        # The logits come as (num_objects, 1, H, W). Drop only the channel axis: a bare
        # np.squeeze would also drop the object axis when a single object is tracked,
        # leaving an (H, W) array that the calls below cannot treat as a stack of masks.
        masks = np.squeeze(masks, axis=1).astype(bool)

        detections = sv.Detections(
            xyxy=sv.mask_to_xyxy(masks=masks),
            mask=masks,
            class_id=np.array(object_ids)
        )

        annotated_frame = mask_annotator.annotate(scene=frame.copy(), detections=detections)

        sink.write_frame(annotated_frame)
        if frame_idx % video_info.fps == 0:
            frame_sample.append(annotated_frame)

Now we are going to solve this problem. If two or more objects of the same class appear in an image, we have to give each of them a unique id before we feed the boxes to SAM2 to be tracked through the video.

In [None]:
import cv2
import os
from ultralytics import YOLO
from collections import defaultdict

def extract_detection_info(frame_folder, model_path="yolov10s.pt"):
    """
    Extract detection information using YOLO model with tracking

    Args:
        frame_folder (str): Path to folder containing image frames
        model_path (str): Path to YOLO model weights

    Returns:
        list: One dictionary per detected box, in frame-name order, with keys
            "frame_id" (file name without extension), "class_id", "track_id"
            (the tracker's id for the box, or None when the tracker has not
            assigned one), "bbox" ([x1, y1, x2, y2] as a list of floats) and
            "confidence".
    """
    # Initialize YOLO model with tracking
    yolo = YOLO(model_path)

    all_detections = []  # Store all detections

    # Sorted so frames reach the tracker in temporal order
    for frame_file in sorted(os.listdir(frame_folder)):
        if not frame_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        frame_path = os.path.join(frame_folder, frame_file)
        frame_id = os.path.splitext(frame_file)[0]

        # cv2.imread returns None instead of raising when the file cannot be decoded
        frame = cv2.imread(frame_path)
        if frame is None:
            print(f"Could not load frame: {frame_path}")
            continue

        # persist=True keeps the tracker state between successive calls
        results = yolo.track(frame, persist=True, tracker="botsort.yaml")

        for result in results or []:
            boxes = getattr(result, 'boxes', None)
            if boxes is None:
                continue

            for box in boxes:
                # Box coordinates in (x1, y1, x2, y2) format
                bbox = box.xyxy[0].cpu().numpy()

                # Class information
                cls = int(box.cls[0].item())
                conf = float(box.conf[0].item())

                # The `id` attribute always exists on a box but is None while the
                # tracker has not assigned an id, so testing with hasattr() is not
                # enough: calling .item() on None raises AttributeError.
                box_id = getattr(box, 'id', None)
                track_id = int(box_id.item()) if box_id is not None else None

                all_detections.append({
                    "frame_id": frame_id,
                    "class_id": cls,
                    "track_id": track_id,  # unique per tracked instance, or None
                    "bbox": bbox.tolist(),
                    "confidence": conf,
                })

    return all_detections

# Usage example
if __name__ == "__main__":
    # Detect on the same frames the SAM2 inference state refers to: `frame_idx` below is
    # derived from the file name, so both must index the same image sequence.
    frame_folder = Path(SOURCE_FRAMES).as_posix()
    detections = extract_detection_info(frame_folder, "yolov10s.pt")

    # Start from a clean state: the previous experiment registered objects under their
    # class ids, which must not be mixed with the track ids used here.
    predictor.reset_state(inference_state)

    # Print all detections
    print(f"\nTotal number of detections across all frames: {len(detections)}")

    # Group detections by frame to see multiple instances in each frame
    frame_detections = defaultdict(list)
    for detection in detections:
        frame_detections[detection['frame_id']].append(detection)

    for frame_id, frame_dets in frame_detections.items():
        print(f"\nFrame {frame_id}:")
        for det in frame_dets:
            print(f"  Class {det['class_id']}, Track ID {det['track_id']}, Confidence {det['confidence']:.2f}")

        # Use with predictor
        for detection in frame_dets:
            # Use track_id as the object identifier if available, otherwise use class_id.
            # NOTE(review): a class_id fallback can coincide with another object's
            # track_id; consider skipping untracked detections instead - confirm intent.
            obj_id = detection['track_id'] if detection['track_id'] is not None else detection['class_id']

            _, object_ids, mask_logits = predictor.add_new_points_or_box(
                inference_state=inference_state,
                # Frame files are named by their zero-padded index (e.g. "00012")
                frame_idx=int(detection['frame_id']),
                obj_id=obj_id,
                box=detection['bbox'],  # already [x1, y1, x2, y2]
            )

In [None]:
import os
# Pass a plain string: SOURCE_VIDEO may be a pathlib.Path at this point, and the path
# ends up in OpenCV, which does not accept Path objects on every build.
video_info = sv.VideoInfo.from_video_path(str(SOURCE_VIDEO))
video_info.width = int(video_info.width * SCALE_FACTOR)
video_info.height = int(video_info.height * SCALE_FACTOR)
annotated_frames_dir = '/content/drive/MyDrive/AI_Class/PROJECT_PUB/sam2/custom_dataset/Masked_images'
# cv2.imwrite does not create missing directories - it just returns False - so make
# sure the output folder exists before the loop.
os.makedirs(annotated_frames_dir, exist_ok=True)
COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700']
# Colour is looked up by class_id, which is set to the SAM2 object id below.
mask_annotator = sv.MaskAnnotator(
    color=sv.ColorPalette.from_hex(COLORS),
    color_lookup=sv.ColorLookup.CLASS)

frame_sample = []   # annotated frames kept in memory for the preview grid
frame_paths = []    # for every frame: the source path, then the annotated output path
with sv.VideoSink(Path(TARGET_VIDEO).as_posix(), video_info=video_info) as sink:
    for frame_idx, object_ids, mask_logits in predictor.propagate_in_video(inference_state):
        source_frame_path = SOURCE_FRAME_PATHS[frame_idx]
        frame_paths.append(source_frame_path)
        frame = cv2.imread(str(source_frame_path))
        # Logits > 0 are foreground.
        masks = (mask_logits > 0.0).cpu().numpy()
        # The logits come as (num_objects, 1, H, W). Drop only the channel axis: a bare
        # np.squeeze would also drop the object axis when a single object is tracked,
        # leaving an (H, W) array that the calls below cannot treat as a stack of masks.
        masks = np.squeeze(masks, axis=1).astype(bool)

        detections = sv.Detections(
            xyxy=sv.mask_to_xyxy(masks=masks),
            mask=masks,
            class_id=np.array(object_ids)
        )

        annotated_frame = mask_annotator.annotate(scene=frame.copy(), detections=detections)

        # Save the annotated frame to the output directory, named by frame index
        annotated_path = os.path.join(annotated_frames_dir, f"{frame_idx:05d}.jpg")
        cv2.imwrite(annotated_path, annotated_frame)
        frame_paths.append(annotated_path)

        # Write the annotated frame to the video sink
        sink.write_frame(annotated_frame)
        # Every annotated frame is kept; to keep roughly one per second instead, append
        # only when `frame_idx % video_info.fps == 0`.
        frame_sample.append(annotated_frame)


Visualize the result

In [None]:
# Preview the first 20 annotated frames collected by the propagation cell in a 5x5 grid.
sv.plot_images_grid(
    images=frame_sample[:20],
    grid_size=(5, 5)
)