In [None]:
!pip install -qU ultralytics

## Initialize SAM Model

In [1]:
from ultralytics import SAM
import matplotlib.pyplot as plt

# load the model
# Builds a SAM wrapper from the 'sam2.1_b.pt' weights file; `model` is reused
# by the image-segmentation cells below.
model = SAM('sam2.1_b.pt')

# display model info
# Prints a one-line summary (layers / parameters / gradients); being the last
# expression of the cell, the returned tuple is also echoed as the cell output.
model.info()

Model summary: 403 layers, 80,850,178 parameters, 80,850,178 gradients


(403, 80850178, 80850178, 0.0)

In [2]:
# url - https://ultralytics.com/images/bus.jpg

## Segment Image

In [2]:
# Image to segment (path relative to the notebook's working directory)
image_path = 'test_image.jpg'

# One box prompt; presumably [x1, y1, x2, y2] in pixel coordinates -- confirm
bboxes = [[55, 400, 230, 900]]

# Run SAM on the image, prompted with the box above
results = model(source=image_path, bboxes=bboxes)


image 1/1 D:\notebooks\temp projects\youtube\Image & Video Segmentation using SAM2.1\test_image.jpg: 1024x1024 1 0, 340.4ms
Speed: 34.1ms preprocess, 340.4ms inference, 13.1ms postprocess per image at shape (1, 3, 1024, 1024)


In [10]:
# Walk over the results returned by the box-prompted call and call show()
# on each one (presumably renders the image with the predicted mask -- confirm).
for result in results:
    result.show()

In [11]:
# One click prompt as an [x, y] pair; its label is passed alongside
# (label 1 presumably marks a foreground/positive point -- confirm)
points = [[350, 370]]
results = model(source=image_path, labels=[1], points=points)


image 1/1 D:\notebooks\temp projects\youtube\Image & Video Segmentation using SAM2.1\test_image.jpg: 1024x1024 1 0, 388.6ms
Speed: 8.1ms preprocess, 388.6ms inference, 0.4ms postprocess per image at shape (1, 3, 1024, 1024)


In [12]:
# Call show() on every result of the single-point prompt run.
for result in results:
    result.show()

In [3]:
# Two click prompts, each paired with its own label (1 and 0)
# NOTE(review): 1/0 presumably mean include/exclude point -- confirm in the SAM docs
points = [[350, 370], [100, 650]]
results = model(source=image_path, labels=[1, 0], points=points)


image 1/1 D:\notebooks\temp projects\youtube\Image & Video Segmentation using SAM2.1\test_image.jpg: 1024x1024 1 0, 1 1, 319.8ms
Speed: 12.7ms preprocess, 319.8ms inference, 0.4ms postprocess per image at shape (1, 3, 1024, 1024)


In [14]:
# Call show() on every result of the multi-point prompt run.
for result in results:
    result.show()

## Extract BBox Image from the Original Image

In [4]:
import cv2
import torch
import numpy as np

In [7]:
# Inspect the predicted boxes (xyxy tensor) of the first result.
# Indexing `results` directly avoids depending on a `result` loop variable
# left over from whichever cell happened to run last.
results[0].boxes.xyxy

tensor([[ 17., 232., 800., 726.],
        [ 57., 401., 205., 896.]], device='cuda:0')

In [6]:
# Crop every predicted bounding box out of the original image and show each
# crop in its own OpenCV window (press any key to advance to the next one).

# cv2.imread does not raise on a missing/unreadable file -- it returns None,
# which would only surface later as a confusing slicing error.
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

img_h, img_w = image.shape[:2]

try:
    for i, result in enumerate(results):
        # skip results that carry no boxes (attribute missing or set to None)
        if getattr(result, 'boxes', None) is None:
            continue

        xyxy = result.boxes.xyxy
        # tensors may live on the GPU; move to host memory before converting
        boxes = xyxy.cpu().numpy() if isinstance(xyxy, torch.Tensor) else np.array(xyxy)

        # iterate through the bounding boxes
        for j, box in enumerate(boxes):
            x1, y1, x2, y2 = map(int, box[:4])

            # clamp to the image: a negative index would silently wrap around
            # in NumPy slicing and produce a wrong (or empty) crop
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, img_w), min(y2, img_h)
            if x2 <= x1 or y2 <= y1:
                # degenerate box -> empty crop, which cv2.imshow rejects
                continue

            cropped_img = image[y1:y2, x1:x2]

            # show the image
            cv2.imshow(f"Cropped Image {i}_{j}", cropped_img)
            cv2.waitKey(0)
finally:
    # always close the windows, even if the loop is interrupted or fails
    cv2.destroyAllWindows()

## Segment Video

In [8]:
from ultralytics.models.sam import SAM2VideoPredictor

# Settings handed to the video predictor
overrides = {
    'conf': 0.25,
    'task': 'segment',
    'mode': 'predict',
    'imgsz': 1024,
    'model': 'sam2.1_b.pt',
}

# Build the SAM2 video predictor from the settings above
predictor = SAM2VideoPredictor(overrides=overrides)

In [9]:
# Input clip (path relative to the notebook's working directory).
video_path = 'test_video.mp4'

# Run the video predictor on the clip with a single point prompt and one label.
# NOTE(review): the log captured for this cell includes a warning that points
# at `stream=True` for large sources / long videos; without it `results` is
# presumably a fully materialised list of per-frame results -- confirm before
# using this on longer clips.
results = predictor(source=video_path, points=[900, 820], labels=[1])


Ultralytics 8.3.91  Python-3.12.3 torch-2.5.1 CUDA:0 (NVIDIA GeForce RTX 4070, 12282MiB)

errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

video 1/1 (frame 1/58) D:\notebooks\temp projects\youtube\Image & Video Segmentation using SAM2.1\test_video.mp4: 1024x1024 1 0, 178.3ms
video 1/1 (frame 2/58) D:\notebooks\temp projects\youtube\Image & Video Segmentation using SAM2.1\test_video.mp4: 1024x1024 1 0, 133.1ms
video 1/1 (frame 3/58) D:\notebooks\temp projects\youtube\Image & Video Segmentation using SAM2.1\test_video.mp4: 1024x1024 1 0, 120.7ms
video 1/1 (frame 4/58) D:\notebooks\temp projects\youtube\Image & Video 