In [2]:
from ultralytics import YOLO
import cv2
import numpy as np

class Model:

    def __init__(self):
        self.yolo_model = YOLO("yolo11l_half.engine")

        self.classes = [0] # humans
        self.tracked_id = None
        self.last_position = None
        self.lost = False
        self.ignore_ids = []

        self.redetect_within = 50

    def show_all_boxes(self, image):
        image = image[:, :, :3] # remove A channel from frame
        result = self.yolo_model.track(image, persist=True, classes=self.classes, verbose=False)[0]
        return self.np_to_jpeg(result.plot())


    def track(self, image, return_type="xcentre"):
        """Track objects in next frame of video feed

        parameters
        image (np.Array) : next frame of video (from zl.Mat().get_data())
        return_type (String) :
            "centre"  to return tuple of (xcentre, ycentre) coordinates
            "corners" to return tuple of (x1, y1, x2, y2) coordinates of bounding box corners
            "xcentre" to return single int for horizontal centre coordinate of bounding box
        """
        self.result = self.yolo_model.track(image[:, :, :3], persist=True, classes=self.classes, verbose=False)[0]
        if not self.lost:
            tracked_index = self._get_tracked_index()
            
            if tracked_index is False:
                # set as lost if tracked id not found
                self.lost = True
                return False
            else:
                # add other detected objects to list of ids to ignore
                for i in self.result.boxes.id:
                    if i != self.tracked_id:
                        self.ignore_ids.append(i)
                
                self.last_position = self.result.boxes.xyxy[tracked_index]

                # check return type
                if return_type == "centre":
                    return (
                        int((self.last_position[0] + self.last_position[2]) / 2),
                        int((self.last_position[1] + self.last_position[3]) / 2)
                    )
                elif return_type == "corners":
                    return self.last_position
                elif return_type == "xcentre":
                    return int((self.last_position[0] + self.last_position[2]) / 2)
        else:
            # if lost, wait for object with similar position to reappear
            if self.result.boxes.id is None:
                return False
            
            for i in range(len(self.result.boxes.id)):

                # ignore objects that appeared at the same time as original tracked object
                if self.result.boxes.id[i] not in self.ignore_ids:

                    coords = self.result.boxes.xyxy[i]
                    print("new box id: " + str(self.result.boxes.id[i]))
                    print("    coords: ", coords)
                    print("last known: ", self.last_position)
                    # TODO: not redetecting, make it more lenient
                    corners = [
                        abs(coords[0] - self.last_position[0]) <= self.redetect_within,
                        abs(coords[1] - self.last_position[1]) <= self.redetect_within,
                        abs(coords[2] - self.last_position[2]) <= self.redetect_within,
                        abs(coords[3] - self.last_position[3]) <= self.redetect_within
                    ]
                    print(corners)
                    print(corners.count(True))
                    print()
                    if corners.count(True) >= 3:
                        self.lost = False
                        self.tracked_id = self.result.boxes.id[i]
                        return self.track(image)

            # if no suitable objects found
            return False

        

    def _get_tracked_index(self):
        if self.result.boxes.id is None:
            return False
        if self.tracked_id in self.result.boxes.id:
            return np.where(self.result.boxes.id.numpy() == self.tracked_id)[0][0]
        else:
            return False
    
    def np_to_jpeg(self, data):
        return bytes(cv2.imencode('.jpg', data)[1])








In [7]:
import ipywidgets.widgets as widgets
from IPython.display import display
image_display = widgets.Image(format="jpeg", width="45%")
full_display = widgets.Image(format="jpeg", width="45%")
display(widgets.HBox([image_display, full_display]))

location_coords_display = widgets.Label()
display(location_coords_display)

import pyzed.sl as sl
camera = sl.Camera()
camera_params = sl.InitParameters()
camera_params.camera_resolution = sl.RESOLUTION.VGA
camera_params.depth_mode = sl.DEPTH_MODE.ULTRA
camera_params.coordinate_units = sl.UNIT.MILLIMETER

camera_status = camera.open(camera_params)
if camera_status != sl.ERROR_CODE.SUCCESS:
    print("camera error")
    print(camera_status)
    camera.close()
    exit()

# initialize model
m = Model()

# get initial image and choose object to track
image_mat = sl.Mat()
started_tracking = False
while not started_tracking:
    err = camera.grab()
    if err == sl.ERROR_CODE.SUCCESS:
        camera.retrieve_image(image_mat)
        image = image_mat.get_data()
        
        image_display.value = m.show_all_boxes(image)
    
        user_input = input("enter id to track (or leave blank to skip):")
        if user_input == "":
            continue
        else:
            m.tracked_id = int(user_input)
            print("tracking object id " + str(user_input))
            started_tracking = True

# start tracking
running = True
while running:
    err = camera.grab()
    if err == sl.ERROR_CODE.SUCCESS:
        camera.retrieve_image(image_mat)
        image = image_mat.get_data()

        tracked_box = m.track(image, return_type="centre")
        print(tracked_box)
        if tracked_box is False:
            image_display.value = bytes(cv2.imencode('.jpg', image)[1])
        else:
            image_rect = cv2.rectangle(
                image,
                (int(tracked_box[0]), int(tracked_box[1])),
                (int(tracked_box[2]), int(tracked_box[3])),
                (255, 0, 0),
                4
            )
            image_display.value = bytes(cv2.imencode('.jpg', image_rect)[1])

        full_display.value = m.show_all_boxes(image)
    

camera.close()

HBox(children=(Image(value=b'', format='jpeg', width='45%'), Image(value=b'', format='jpeg', width='45%')))

Label(value='')

[2025-04-30 09:02:42 UTC][ZED][INFO] Logging level INFO
[2025-04-30 09:02:42 UTC][ZED][INFO] Logging level INFO
[2025-04-30 09:02:42 UTC][ZED][INFO] Logging level INFO
[2025-04-30 09:02:43 UTC][ZED][INFO] [Init]  Depth mode: ULTRA
[2025-04-30 09:02:44 UTC][ZED][INFO] [Init]  Camera successfully opened.
[2025-04-30 09:02:44 UTC][ZED][INFO] [Init]  Camera FW version: 1523
[2025-04-30 09:02:44 UTC][ZED][INFO] [Init]  Video mode: VGA@100
[2025-04-30 09:02:44 UTC][ZED][INFO] [Init]  Serial Number: S/N 32565960
Loading yolo11l_half.engine for TensorRT inference...
[04/30/2025-10:02:44] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
[04/30/2025-10:02:44] [TRT] [I] Loaded engine size: 52 MiB
[04/30/2025-10:02:44] [TRT] [W] Using an engine plan file across different models of devices is not recommended and is likely t

enter id to track (or leave blank to skip): 1


tracking object id 1
(398, 117)


IndexError: tuple index out of range