In [63]:
from ultralytics import YOLO
import cv2
import numpy as np

class Model:

    def __init__(self):
        self.yolo_model = YOLO("yolo11l_half.engine")

        self.classes = [0] # humans
        self.tracked_id = None
        self.last_position = None
        self.lost = False
        self.ignore_ids = []

        self.redetect_within = 50

    def show_all_boxes(self, image):
        image = image[:, :, :-1] # remove A channel from frame
        result = self.yolo_model.track(image, persist=True, classes=self.classes, verbose=False)[0]
        return self.np_to_jpeg(result.plot())


    def track(self, image):
        self.result = self.yolo_model.track(image[:, :, :-1], persist=True, classes=self.classes, verbose=False)[0]
        if not self.lost:
            tracked_index = self._get_tracked_index()
            
            if tracked_index is False:
                # set as lost if tracked id not found
                self.lost = True
                return False
            else:
                # add other detected objects to list of ids to ignore
                for i in self.result.boxes.id:
                    if i != self.tracked_id:
                        self.ignore_ids.append(i)
                
                self.last_position = self.result.boxes.xyxy[tracked_index]
                return self.last_position
        else:
            # if lost, wait for object with similar position to reappear
            if self.result.boxes.id is None:
                return False
            
            for i in range(len(self.result.boxes.id)):

                # ignore objects that appeared at the same time as original tracked object
                if self.result.boxes.id[i] not in self.ignore_ids:

                    coords = self.result.boxes.xyxy[i]
                    print("new box", coords, self.last_position)
                    # TODO: not redetecting, make it more lenient
                    if (abs(coords[0] - self.last_position[0]) <= self.redetect_within
                        and abs(coords[1] - self.last_position[1]) <= self.redetect_within
                        and abs(coords[2] - self.last_position[2]) <= self.redetect_within
                        and abs(coords[3] - self.last_position[3]) <= self.redetect_within
                    ):
                        self.lost = False
                        return self.track(image)

            # if no suitable objects found
            return False

        

    def _get_tracked_index(self):
        if self.result.boxes.id is None:
            return False
        if self.tracked_id in self.result.boxes.id:
            return np.where(self.result.boxes.id.numpy() == self.tracked_id)[0][0]
        else:
            return False
    
    def np_to_jpeg(self, data):
        return bytes(cv2.imencode('.jpg', data)[1])








In [64]:
import ipywidgets.widgets as widgets
from IPython.display import display
image_display = widgets.Image(format="jpeg", width="45%")
full_display = widgets.Image(format="jpeg", width="45%")
display(widgets.HBox([image_display, full_display]))

location_coords_display = widgets.Label()
display(location_coords_display)

import pyzed.sl as sl
camera = sl.Camera()
camera_params = sl.InitParameters()
camera_params.camera_resolution = sl.RESOLUTION.VGA
camera_params.depth_mode = sl.DEPTH_MODE.ULTRA
camera_params.coordinate_units = sl.UNIT.MILLIMETER

camera_status = camera.open(camera_params)
if camera_status != sl.ERROR_CODE.SUCCESS:
    print("camera error")
    print(camera_status)
    camera.close()
    exit()

# initialize model
m = Model()

# get initial image and choose object to track
image_mat = sl.Mat()
started_tracking = False
while not started_tracking:
    err = camera.grab()
    if err == sl.ERROR_CODE.SUCCESS:
        camera.retrieve_image(image_mat)
        image = image_mat.get_data()
        
        image_display.value = m.show_all_boxes(image)
    
        user_input = input("enter id to track (or leave blank to skip):")
        if user_input == "":
            continue
        else:
            m.tracked_id = int(user_input)
            print("tracking object id " + str(user_input))
            started_tracking = True

# start tracking
running = True
while running:
    err = camera.grab()
    if err == sl.ERROR_CODE.SUCCESS:
        camera.retrieve_image(image_mat)
        image = image_mat.get_data()

        tracked_box = m.track(image)
        if tracked_box is False:
            image_display.value = bytes(cv2.imencode('.jpg', image)[1])
        else:
            image_rect = cv2.rectangle(
                image,
                (int(tracked_box[0]), int(tracked_box[1])),
                (int(tracked_box[2]), int(tracked_box[3])),
                (255, 0, 0),
                4
            )
            image_display.value = bytes(cv2.imencode('.jpg', image_rect)[1])

        full_display.value = m.show_all_boxes(image)
    

camera.close()

HBox(children=(Image(value=b'', format='jpeg', width='45%'), Image(value=b'', format='jpeg', width='45%')))

Label(value='')

[2025-03-19 12:38:34 UTC][ZED][INFO] Logging level INFO
[2025-03-19 12:38:34 UTC][ZED][INFO] Logging level INFO
[2025-03-19 12:38:34 UTC][ZED][INFO] Logging level INFO
[2025-03-19 12:38:35 UTC][ZED][INFO] [Init]  Depth mode: ULTRA
[2025-03-19 12:38:36 UTC][ZED][INFO] [Init]  Camera successfully opened.
[2025-03-19 12:38:36 UTC][ZED][INFO] [Init]  Camera FW version: 1523
[2025-03-19 12:38:36 UTC][ZED][INFO] [Init]  Video mode: VGA@100
[2025-03-19 12:38:36 UTC][ZED][INFO] [Init]  Serial Number: S/N 37413003
Loading yolo11l_half.engine for TensorRT inference...
[03/19/2025-12:38:36] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
[03/19/2025-12:38:36] [TRT] [I] Loaded engine size: 52 MiB
[03/19/2025-12:38:36] [TRT] [W] Using an engine plan file across different models of devices is not recommended and is likely t

enter id to track (or leave blank to skip): 1


tracking object id 1
new box tensor([ 11.1339, 116.2869,  45.6882, 138.8621]) tensor([  0.0000,   0.5776,  78.6857, 256.5381])
new box tensor([ 11.3944, 116.2871,  46.3835, 138.8620]) tensor([  0.0000,   0.5776,  78.6857, 256.5381])
new box tensor([ 11.1372, 116.3819,  46.0622, 138.7714]) tensor([  0.0000,   0.5776,  78.6857, 256.5381])
new box tensor([ 11.0498, 116.3794,  45.8683, 138.7720]) tensor([  0.0000,   0.5776,  78.6857, 256.5381])
new box tensor([ 10.8375, 116.3586,  45.4590, 138.7888]) tensor([  0.0000,   0.5776,  78.6857, 256.5381])
new box tensor([  0.3832,   0.5751, 186.0770, 275.5199]) tensor([  0.0000,   0.5776,  78.6857, 256.5381])
new box tensor([  0.5353,   0.9048, 214.1363, 277.5294]) tensor([  0.0000,   0.5776,  78.6857, 256.5381])
new box tensor([  0.5027,   1.1196, 244.9250, 278.4709]) tensor([  0.0000,   0.5776,  78.6857, 256.5381])
new box tensor([  0.3911,   0.6284, 289.6224, 276.8437]) tensor([  0.0000,   0.5776,  78.6857, 256.5381])
new box tensor([ 10.7241,

KeyboardInterrupt: 