In [3]:
# from camera
#
# Live tracking demo: opens the ZED camera, lets the user pick a detection id,
# then shows the tracked box (left widget) and all detections (right widget)
# until the cell is interrupted.
import cv2
import ipywidgets.widgets as widgets
import pyzed.sl as sl
from IPython.display import display

import track_object

image_display = widgets.Image(format="jpeg", width="45%")
full_display = widgets.Image(format="jpeg", width="45%")
display(widgets.HBox([image_display, full_display]))

location_coords_display = widgets.Label()
display(location_coords_display)

camera = sl.Camera()
camera_params = sl.InitParameters()
camera_params.camera_resolution = sl.RESOLUTION.VGA
camera_params.depth_mode = sl.DEPTH_MODE.ULTRA
camera_params.coordinate_units = sl.UNIT.MILLIMETER

camera_status = camera.open(camera_params)
if camera_status != sl.ERROR_CODE.SUCCESS:
    print("camera error")
    print(camera_status)
    camera.close()
    # Raise instead of calling exit(): inside a notebook exit() targets the
    # kernel rather than just aborting this cell.
    raise RuntimeError("could not open camera: " + str(camera_status))

# Everything below runs with the camera open; the finally clause guarantees it
# is released even when the cell is stopped with an interrupt or fails.
try:
    # initialize model
    m = track_object.Model()

    # get initial image and choose object to track
    image_mat = sl.Mat()
    started_tracking = False
    while not started_tracking:
        err = camera.grab()
        if err != sl.ERROR_CODE.SUCCESS:
            continue

        camera.retrieve_image(image_mat)
        image = image_mat.get_data()
        image_display.value = m.show_all_boxes(image)

        user_input = input("enter id to track (or leave blank to skip):").strip()
        if user_input == "":
            # blank input: grab a fresh frame and ask again
            continue
        try:
            m.tracked_id = int(user_input)
        except ValueError:
            # keep the camera session alive on a typo instead of crashing
            print("not a valid id: " + user_input)
            continue
        print("tracking object id " + str(user_input))
        started_tracking = True

    # start tracking (runs until the cell is interrupted)
    running = True
    while running:
        err = camera.grab()
        if err != sl.ERROR_CODE.SUCCESS:
            continue

        camera.retrieve_image(image_mat)
        image = image_mat.get_data()

        tracked_box = m.track(image)
        if tracked_box is False:
            image_display.value = bytes(cv2.imencode('.jpg', image)[1])
        else:
            # cv2.rectangle draws in place, so draw on a copy: the untouched
            # frame is still needed for show_all_boxes() below.
            image_rect = cv2.rectangle(
                image.copy(),
                (int(tracked_box[0]), int(tracked_box[1])),
                (int(tracked_box[2]), int(tracked_box[3])),
                (255, 0, 0),
                4
            )
            image_display.value = bytes(cv2.imencode('.jpg', image_rect)[1])

        # NOTE(review): if track_object.Model.show_all_boxes runs the tracker
        # itself, each frame is fed to the tracker twice - confirm.
        full_display.value = m.show_all_boxes(image)
finally:
    camera.close()

HBox(children=(Image(value=b'', format='jpeg', width='45%'), Image(value=b'', format='jpeg', width='45%')))

Label(value='')

[2025-03-26 10:11:49 UTC][ZED][INFO] Logging level INFO
[2025-03-26 10:11:49 UTC][ZED][INFO] Logging level INFO
[2025-03-26 10:11:50 UTC][ZED][INFO] Logging level INFO
[2025-03-26 10:11:50 UTC][ZED][INFO] [Init]  Depth mode: ULTRA
[2025-03-26 10:11:51 UTC][ZED][INFO] [Init]  Camera successfully opened.
[2025-03-26 10:11:51 UTC][ZED][INFO] [Init]  Camera FW version: 1523
[2025-03-26 10:11:51 UTC][ZED][INFO] [Init]  Video mode: VGA@100
[2025-03-26 10:11:51 UTC][ZED][INFO] [Init]  Serial Number: S/N 34032459
Loading yolo11l_half.engine for TensorRT inference...
[03/26/2025-10:11:52] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
[03/26/2025-10:11:52] [TRT] [I] Loaded engine size: 52 MiB
[03/26/2025-10:11:52] [TRT] [W] Using an engine plan file across different models of devices is not recommended and is likely t

enter id to track (or leave blank to skip): 1


tracking object id 1
new box id: tensor(18.)
    coords:  tensor([534.3984,  19.0826, 671.9000, 272.2451])
last known:  tensor([446.5999,   1.3329, 645.8844, 241.3712])
[tensor(False), tensor(True), tensor(True), tensor(True)]
3



KeyboardInterrupt: 

In [None]:
# from images
#
# Offline tracking demo: same flow as the camera cell, but replays the saved
# frames img/frame99.jpg .. img/frame365.jpg.
import cv2
import ipywidgets.widgets as widgets
from IPython.display import display

import track_object

image_display = widgets.Image(format="jpeg", width="45%")
full_display = widgets.Image(format="jpeg", width="45%")
display(widgets.HBox([image_display, full_display]))

location_coords_display = widgets.Label()
display(location_coords_display)

# The notebook only ever imports the module, so the class has to be reached
# through it (a bare Model() is a NameError on a fresh kernel).
m = track_object.Model()

first_frame = cv2.imread("img/frame99.jpg")
if first_frame is None:
    # cv2.imread returns None instead of raising when a file cannot be read
    raise FileNotFoundError("could not read img/frame99.jpg")
image_display.value = m.show_all_boxes(first_frame)

# Ask until a numeric id is given; there is no later frame to skip to here.
while True:
    user_input = input("enter id to track (or leave blank to skip):").strip()
    try:
        m.tracked_id = int(user_input)
        break
    except ValueError:
        print("please enter a numeric id")
print("tracking object id " + str(user_input))

for i in range(100, 365):
    filename = "img/frame" + str(i + 1) + ".jpg"
    image = cv2.imread(filename)
    if image is None:
        # tolerate gaps in the frame dump rather than crashing mid-sequence
        print("skipping unreadable frame " + filename)
        continue

    tracked_box = m.track(image)
    if tracked_box is False:
        image_display.value = bytes(cv2.imencode('.jpg', image)[1])
    else:
        # draw on a copy so show_all_boxes() below gets the unmodified frame
        image_rect = cv2.rectangle(
            image.copy(),
            (int(tracked_box[0]), int(tracked_box[1])),
            (int(tracked_box[2]), int(tracked_box[3])),
            (255, 0, 0),
            4
        )
        image_display.value = bytes(cv2.imencode('.jpg', image_rect)[1])

    full_display.value = m.show_all_boxes(image)

In [2]:
# One-off conversion of the segmentation checkpoint to a half-precision
# TensorRT engine.
from ultralytics import YOLO

seg_model = YOLO(model="yolo11l-seg.pt")
seg_model.export(
    half=True,
    format="engine",
)

In [14]:
from ultralytics import YOLO
import cv2
import numpy as np

class SegModel:
    """
    Tracks one detected object across frames and can re-acquire it after it was lost.
    Also wraps a segmentation model to fetch the mask outline of an object.

    attributes
    tracked_id : id of the object to follow, must be set before calling track()
    last_position : xyxy box of the tracked object in the last frame it was seen
    lost (bool) : True while the tracked object is not visible
    ignore_ids (list) : ids seen together with the tracked object, never used to re-acquire it
    redetect_within : max distance per box coordinate for a new box to count as "close"
    """

    def __init__(self):
        self.yolo_model = YOLO("yolo11l_half.engine")
        self.seg_model = YOLO("yolo11l-seg.onnx")
        
        self.classes = [0] # humans
        self.tracked_id = None # remember to set before running self.track()
        self.last_position = None
        self.lost = False
        self.ignore_ids = []

        self.redetect_within = 50

    def show_all_boxes(self, image):
        """
        Get a jpg image of the frame with all detected bounding boxes with their ids
        To be used at the start to select the object to track

        params
        image (np.array) : BGR(A) array of image to analyze (From pyzed.sl.Mat().get_data())

        returns
        (bytes) jpg image with bounding boxes + ids
        """
        image = image[:, :, :3] # remove A channel from frame
        result = self.yolo_model.track(image, persist=True, classes=self.classes, verbose=False)[0]
        return self.np_to_jpeg(result.plot())


    def track(self, image):
        """
        Run the model on an image to keep track of and locate the tracked object
        If the tracked object disappeared from frame, will change mode to "lost" and will analyze new frames for any new object that:
        - Was not seen at the same time as the original object
        - Has a bounding box with similar location to the original object's last known position
        (if the tracked id was never seen, only that same id can start the tracking)

        The model is run exactly once per call.

        params
        image (np.array) : BGR(A) array of image to analyze (From pyzed.sl.Mat().get_data())

        returns
        xyxy box of the tracked object (row of result.boxes.xyxy), or False while the object is lost

        raises
        ValueError if self.tracked_id has not been set
        """
        if self.tracked_id is None:
            raise ValueError("tracked_id must be set before calling track()")

        self.result = self.yolo_model.track(image[:, :, :3], persist=True, classes=self.classes, verbose=False)[0]

        if not self.lost:
            tracked_index = self._get_tracked_index()
            if tracked_index is False:
                # change mode to lost if tracked id not found
                self.lost = True
                return False
            return self._lock_on(tracked_index)

        # if lost, wait for object with similar position to reappear
        current_ids = self._current_ids()
        if not current_ids:
            return False

        if self.last_position is None:
            # the tracked id was never seen, so there is no position to compare
            # against: only the id itself can start the tracking
            tracked_index = self._get_tracked_index()
            if tracked_index is False:
                return False
            self.lost = False
            return self._lock_on(tracked_index)

        for index, object_id in enumerate(current_ids):
            # ignore objects that appeared at the same time as original tracked object
            if object_id in self.ignore_ids:
                continue

            coords = self.result.boxes.xyxy[index]
            close_coords = sum(
                bool(abs(new - old) <= self.redetect_within)
                for new, old in zip(coords, self.last_position)
            )

            # assume the object is the original if at least 3 corners are close to the last known position
            if close_coords >= 3:
                self.lost = False
                self.tracked_id = object_id
                # reuse the result of this frame instead of running the model a second time
                return self._lock_on(index)

        # if no suitable objects found
        return False
            
    def segment(self, image):
        """
        Run the segmentation model on an image and keep the result for get_seg_masks()

        params
        image (np.array) : BGR(A) array of image to analyze

        returns
        annotated image as returned by the result's plot()
        """
        self.seg_result = self.seg_model.track(image[:, :, :3], persist=True, classes=self.classes, verbose=False)[0]
        return self.seg_result.plot()

    def get_seg_masks(self, seg_id):
        """
        Get the mask outline of one object from the last segment() call

        params
        seg_id : id of the object as assigned by the segmentation tracker

        returns
        entry of masks.xy for that object, or None if there is no such id / no ids / no masks
        """
        boxes = self.seg_result.boxes
        masks = self.seg_result.masks
        if boxes is None or boxes.id is None or masks is None:
            return None

        for index, object_id in enumerate(boxes.id.tolist()):
            if object_id == seg_id:
                return masks.xy[index]
        return None

    def _current_ids(self):
        """Ids of the boxes in self.result as a list of ints (empty if the result has no ids)"""
        ids = self.result.boxes.id
        if ids is None:
            return []
        # tolist() avoids numpy(), which only works for tensors held in CPU memory
        return [int(object_id) for object_id in ids.tolist()]

    def _lock_on(self, tracked_index):
        """Remember the box at tracked_index as last known position and ignore all other current ids"""
        tracked_id = int(self.tracked_id)
        for object_id in self._current_ids():
            # store each id once, as int, so the list does not grow on every frame
            if object_id != tracked_id and object_id not in self.ignore_ids:
                self.ignore_ids.append(object_id)

        self.last_position = self.result.boxes.xyxy[tracked_index]
        return self.last_position

    def _get_tracked_index(self):
        """Index of the tracked id within self.result.boxes, or False if it is not present"""
        if self.tracked_id is None:
            return False
        current_ids = self._current_ids()
        tracked_id = int(self.tracked_id)
        if tracked_id in current_ids:
            return current_ids.index(tracked_id)
        return False
    
    def np_to_jpeg(self, data):
        """Encode an image array as jpg bytes"""
        return bytes(cv2.imencode('.jpg', data)[1])



# Segmentation demo: plot the segmentation of one saved frame (left widget),
# then mark the mask outline points of a user-chosen id on it (right widget).
import ipywidgets.widgets as widgets
from IPython.display import display
display_1 = widgets.Image(format="jpeg", width="45%")
display_2 = widgets.Image(format="jpeg", width="45%")
display(widgets.HBox([display_1, display_2]))

import cv2

ms = SegModel()

frame = cv2.imread("img/frame100.jpg")
if frame is None:
    # cv2.imread returns None instead of raising when a file cannot be read
    raise FileNotFoundError("could not read img/frame100.jpg")

img = ms.segment(frame)
display_1.value = ms.np_to_jpeg(img)

seg_id = int(input("id :"))
points = ms.get_seg_masks(seg_id)
if points is None:
    # get_seg_masks() yields None when the id is not among the segmented objects
    print("no segmentation mask found for id " + str(seg_id))
else:
    # each point is indexed as (x, y); draw a small red dot on every outline point
    for i in points:
        img = cv2.circle(
            img,
            (int(i[0]), int(i[1])),
            1,
            (0, 0, 255),
            2
        )
    display_2.value = ms.np_to_jpeg(img)

HBox(children=(Image(value=b'', format='jpeg', width='45%'), Image(value=b'', format='jpeg', width='45%')))

Loading yolo11l-seg.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider


[0;93m2025-03-26 12:03:26.873848247 [W:onnxruntime:, transformer_memcpy.cc:74 ApplyImpl] 4 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m


id : 1


In [12]:
# Debug inspection: first component of whatever the name `i` currently holds.
# NOTE(review): presumably the last mask point left in the loop variable `i` by
# the segmentation demo cell; relies on that cell having run first - confirm.
i[0]

507.15

Loading yolo11l-seg.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider


[0;93m2025-03-26 11:53:51.191936871 [W:onnxruntime:, transformer_memcpy.cc:74 ApplyImpl] 4 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m


AttributeError: 'Results' object has no attribute 'plot_im'. See valid attributes below.

    A class for storing and manipulating inference results.

    This class encapsulates the functionality for handling detection, segmentation, pose estimation,
    and classification results from YOLO models.

    Attributes:
        orig_img (numpy.ndarray): Original image as a numpy array.
        orig_shape (Tuple[int, int]): Original image shape in (height, width) format.
        boxes (Boxes | None): Object containing detection bounding boxes.
        masks (Masks | None): Object containing detection masks.
        probs (Probs | None): Object containing class probabilities for classification tasks.
        keypoints (Keypoints | None): Object containing detected keypoints for each object.
        obb (OBB | None): Object containing oriented bounding boxes.
        speed (Dict[str, float | None]): Dictionary of preprocess, inference, and postprocess speeds.
        names (Dict[int, str]): Dictionary mapping class IDs to class names.
        path (str): Path to the image file.
        _keys (Tuple[str, ...]): Tuple of attribute names for internal use.

    Methods:
        update: Updates object attributes with new detection results.
        cpu: Returns a copy of the Results object with all tensors on CPU memory.
        numpy: Returns a copy of the Results object with all tensors as numpy arrays.
        cuda: Returns a copy of the Results object with all tensors on GPU memory.
        to: Returns a copy of the Results object with tensors on a specified device and dtype.
        new: Returns a new Results object with the same image, path, and names.
        plot: Plots detection results on an input image, returning an annotated image.
        show: Shows annotated results on screen.
        save: Saves annotated results to file.
        verbose: Returns a log string for each task, detailing detections and classifications.
        save_txt: Saves detection results to a text file.
        save_crop: Saves cropped detection images.
        tojson: Converts detection results to JSON format.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> for result in results:
        ...     print(result.boxes)  # Print detection boxes
        ...     result.show()  # Display the annotated image
        ...     result.save(filename="result.jpg")  # Save annotated image
    