# Tutorial 5 -- Human Detection 
In this tutorial, we will use multithreading to capture both color and depth images. We will then employ the YOLOv11 neural network architecture for human detection. To ensure real-time performance, we will utilize the TensorRT deep learning framework. For more details, visit the following website: https://docs.ultralytics.com/guides/nvidia-jetson/#convert-model-to-tensorrt-and-run-inference

In [None]:
# I have uploaded two YOLO TensorRT models to the LEARN page: the YOLO11N FP32 version and the YOLO11L FP16 version. The first model, 'yolo11n.engine', runs faster but has limited detection accuracy. The second model, 'yolo11l_half.engine', is the FP16 version, as indicated by 'half' in its name.
# If you need other versions, please refer to the following link:
# https://docs.ultralytics.com/modes/export/#arguments

# Below is the code I used to convert the YOLO11L FP16 model
# from ultralytics import YOLO
# model = YOLO("yolo11l.pt")  
# model.export(format="engine",half=True)  # FP16
# Note: Generating the 'yolo11l.engine' file may take a long time.

### Step 1 Run YOLOV11 on recorded video data

Before running the program, download the pre-trained detection model and the video file 'color_video.avi', then place them in the current folder.

In [None]:
import cv2
from ultralytics import YOLO
import ipywidgets.widgets as widgets
from IPython.display import display


# Create two JPEG image widgets and show them side by side in the notebook.
# In this cell the left widget (display_color) receives the fully annotated YOLO
# frame and the right widget (display_depth, despite its name) receives the frame
# with only the confident person boxes drawn on it.
display_color = widgets.Image(format='jpeg', width='45%') 
display_depth = widgets.Image(format='jpeg', width='45%')  
layout=widgets.Layout(width='100%')  # let the container span the full cell width

sidebyside = widgets.HBox([display_color, display_depth],layout=layout) # horizontal container holding both images
display(sidebyside) # render the container in the cell output

#Convert a NumPy array to JPEG-encoded data for display
def bgr8_to_jpeg(value):
    """Encode an image array as JPEG and return the encoded data as bytes."""
    # cv2.imencode returns (success_flag, encoded_buffer); only the buffer is used.
    _retval, jpeg_buffer = cv2.imencode('.jpg', value)
    return bytes(jpeg_buffer)

# Load the YOLO model (the TensorRT engine file must be in the current folder)
model = YOLO("yolo11l_half.engine")
# model = YOLO("yolo11n.engine")

# Open the video file
video_path = "color_video.avi"
cap = cv2.VideoCapture(video_path)

# Set the video codec and save the processed video.
fourcc = cv2.VideoWriter_fourcc(*'XVID')  # XVID codec, stored in an .avi container
# Size the writer from the input video so the written frames always match the
# writer's frame size; fall back to VGA (672x376) if the capture reports 0.
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) or 672
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) or 376
fps = 30
color_file = cv2.VideoWriter('color_video_processed.avi', fourcc, fps, (width, height))

# Loop-invariant settings.
# Confidence threshold used to filter out unconfident boxes
# https://docs.ultralytics.com/modes/predict/#boxes
conf_threshold = 0.5
scale = 0.3  # down-scaling factor applied before sending images to the widgets

try:
    # Loop through the video frames
    while cap.isOpened():
        # Read a frame from the video
        success, frame = cap.read()
        if not success:
            # Break the loop if the end of the video is reached
            break

        t1 = cv2.getTickCount()  # start tick, for the optional timing below
        # Run YOLO inference on the frame
        results = model(frame, verbose=False)

        # Visualize all detections on the frame and append it to the output video
        annotated_frame = results[0].plot()
        color_file.write(annotated_frame)

        # Second view: draw only confident detections of class 0 (person)
        framed = frame.copy()
        for result in results:
            boxes = result.boxes
            for cls_id, conf, bbox in zip(boxes.cls, boxes.conf, boxes.xyxy):
                if cls_id == 0 and conf > conf_threshold:
                    x1, y1, x2, y2 = (int(v) for v in bbox)
                    cv2.rectangle(framed, (x1, y1), (x2, y2), (255, 0, 0), 2)

        resized_image = cv2.resize(annotated_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
        resized_image2 = cv2.resize(framed, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
        display_color.value = bgr8_to_jpeg(resized_image)
        display_depth.value = bgr8_to_jpeg(resized_image2)

        # total_time = (cv2.getTickCount() - t1) / cv2.getTickFrequency()
finally:
    # Always release the capture and finalise the output file, even if the
    # cell is interrupted or inference raises part-way through the video.
    cap.release()
    color_file.release()

### Step 2 Start the ZED2i Camera system

In [None]:
# You will need to load the YOLO model if you skip the first code block.
# from ultralytics import YOLO
# model = YOLO("yolo11l_half.engine")

#Start the camera system
import traitlets
import cv2
import numpy as np
import pyzed.sl as sl
import math
import numpy as np
import sys
import math
import threading
from traitlets.config.configurable import SingletonConfigurable

# Define a Camera class that inherits from SingletonConfigurable
class Camera(SingletonConfigurable):
    """ZED camera wrapper that grabs color and depth frames on a background thread.

    Attributes written by the capture thread:
        color_value:      latest left image converted to 3-channel BGR. This is a
                          traitlet, so assigning it notifies registered observers.
        color_value_BGRA: latest left image as returned by the SDK (4 channels).
        depth_image:      latest depth map, aligned on the left image, expressed
                          in millimetres (see ``coordinate_units`` below).

    Call ``start()`` to launch the capture thread and ``stop()`` to end it.
    """

    color_value = traitlets.Any()  # observed trait: changes trigger callbacks

    def __init__(self):
        super(Camera, self).__init__()

        self.zed = sl.Camera()
        # Create a InitParameters object and set configuration parameters
        init_params = sl.InitParameters()
        init_params.camera_resolution = sl.RESOLUTION.VGA  # VGA(672*376), HD720(1280*720), HD1080 (1920*1080) or ...
        init_params.depth_mode = sl.DEPTH_MODE.ULTRA  # Use ULTRA depth mode
        init_params.coordinate_units = sl.UNIT.MILLIMETER  # Depth measurements are reported in millimetres

        # Open the camera
        status = self.zed.open(init_params)
        if status != sl.ERROR_CODE.SUCCESS:  # Ensure the camera has opened successfully
            print("Camera Open : "+repr(status)+". Exit program.")
            self.zed.close()
            exit(1)

        # Create and set RuntimeParameters after opening the camera
        self.runtime = sl.RuntimeParameters()

        # Flag that keeps the capture loop alive (attribute name kept as-is
        # because code outside this class may read it)
        self.thread_runnning_flag = False

        # Get the height and width of the configured resolution
        camera_info = self.zed.get_camera_information()
        self.width = camera_info.camera_configuration.resolution.width
        self.height = camera_info.camera_configuration.resolution.height

        # Buffers the SDK fills on every grab
        self.image = sl.Mat(self.width, self.height, sl.MAT_TYPE.U8_C4, sl.MEM.CPU)
        self.depth = sl.Mat(self.width, self.height, sl.MAT_TYPE.F32_C1, sl.MEM.CPU)
        self.point_cloud = sl.Mat(self.width, self.height, sl.MAT_TYPE.F32_C4, sl.MEM.CPU)

    def _capture_frames(self):
        """Capture loop run by the background thread (data capturing only)."""
        while self.thread_runnning_flag:  # continue until stop() clears the flag
            if self.zed.grab(self.runtime) != sl.ERROR_CODE.SUCCESS:
                continue

            # Retrieve Left image
            self.zed.retrieve_image(self.image, sl.VIEW.LEFT)
            # Retrieve depth map. Depth is aligned on the left image
            self.zed.retrieve_measure(self.depth, sl.MEASURE.DEPTH)

            # Publish the depth map BEFORE the color frame: assigning
            # color_value notifies observers, and they must see the depth map
            # that belongs to the same grab (not the previous one, or none at
            # all on the very first frame).
            self.depth_image = np.asanyarray(self.depth.get_data())

            self.color_value_BGRA = self.image.get_data()
            self.color_value = cv2.cvtColor(self.color_value_BGRA, cv2.COLOR_BGRA2BGR)

    def start(self):
        """Start the data capture thread; does nothing if it is already running."""
        if not self.thread_runnning_flag:
            self.thread_runnning_flag = True  # lets _capture_frames keep looping
            self.thread = threading.Thread(target=self._capture_frames)
            self.thread.start()

    def stop(self):
        """Stop the data capture thread and wait for it to exit."""
        if self.thread_runnning_flag:
            self.thread_runnning_flag = False  # makes _capture_frames leave its loop
            self.thread.join()  # wait for the thread to finish

def bgr8_to_jpeg(value):
    """Convert an image array to JPEG-coded bytes for display in a widget."""
    encoded = cv2.imencode('.jpg', value)  # (success_flag, encoded_buffer)
    return bytes(encoded[1])

# Create the camera object (opens the ZED camera in Camera.__init__)
camera = Camera()
camera.start() # launch the background thread that keeps grabbing color and depth frames

### Step 3 Perform object detection on live video data

In [None]:
import time
import motors
import cv2
import numpy as np
import ipywidgets.widgets as widgets
from IPython.display import display
from collections import deque

# Initialize the MotorsYukon class for motor control
# NOTE(review): mecanum=False presumably selects a non-mecanum drive; confirm in the motors module.
robot = motors.MotorsYukon(mecanum=False)

# Initialize tracking history: a bounded queue that keeps at most the 5 most
# recently appended bounding boxes (older entries are discarded automatically)
tracking_history = deque(maxlen=5)  

def move_forward(speed=0.8):
    """Announce the requested speed, then command the robot forward at it."""
    announcement = f"Moving forward at speed {speed}"
    print(announcement)
    robot.forward(speed)

def stop_robot():
    """Announce the stop, then command the robot to stop."""
    announcement = "Stopping robot"
    print(announcement)
    robot.stop()

def turn_left(speed=0.5):
    """Announce the requested speed, then command a left turn at it."""
    announcement = f"Turning left at speed {speed}"
    print(announcement)
    robot.left(speed)

def turn_right(speed=0.5):
    """Announce the requested speed, then command a right turn at it."""
    announcement = f"Turning right at speed {speed}"
    print(announcement)
    robot.right(speed)

# Create widgets for displaying the image: color frame on the left,
# color-mapped depth on the right, inside one full-width horizontal box
display_color = widgets.Image(format='jpeg', width='50%')
display_depth = widgets.Image(format='jpeg', width='50%')
layout = widgets.Layout(width='100%')

sidebyside = widgets.HBox([display_color, display_depth], layout=layout)
display(sidebyside)

# Module-level tracking state shared with the frame callback
selected_person = None  # index (within a frame's detections) of the person being followed; None until one is chosen
previous_action = None  # not read by the visible tracking logic
previous_time = time.time()  # not read by the visible tracking logic
previous_depth_value = 1500  # last usable depth reading, substituted when a reading is NaN

# Thresholds (distances are compared against depth readings, which the camera
# is configured to report in millimetres)
target_distance = 1700  # Move forward if farther than 1.7 m
safe_distance = 500  # Stop if closer than 0.5 m
turn_threshold = 100  # Horizontal offset from the image centre allowed before turning
conf_threshold = 0.4  # Confidence threshold for detection

# Function to ensure bounding box tracking is consistent
def iou(box1, box2):
    """Return the intersection-over-union of two axis-aligned boxes.

    Args:
        box1: Box as (x1, y1, x2, y2), i.e. two opposite corners with
            x1 <= x2 and y1 <= y2.
        box2: Second box in the same format.

    Returns:
        Overlap area divided by the area of the union of both boxes,
        or 0 when the union has no area.
    """
    x1, y1, x2, y2 = box1
    x1b, y1b, x2b, y2b = box2

    # Corners of the intersection rectangle
    xi1 = max(x1, x1b)
    yi1 = max(y1, y1b)
    xi2 = min(x2, x2b)
    yi2 = min(y2, y2b)

    # Clamp at 0 so disjoint boxes contribute no overlap. The height must be
    # measured from the intersection's own top edge (yi1), not from box1's.
    inter_width = max(0, xi2 - xi1)
    inter_height = max(0, yi2 - yi1)
    inter_area = inter_width * inter_height

    box1_area = (x2 - x1) * (y2 - y1)
    box2_area = (x2b - x1b) * (y2b - y1b)
    union_area = box1_area + box2_area - inter_area

    return inter_area / union_area if union_area else 0

# Convert image to JPEG format for widgets
def bgr8_to_jpeg(value):
    """Return the JPEG-encoded bytes of an image array, ready for an image widget."""
    _success, jpeg_data = cv2.imencode('.jpg', value)  # second element is the encoded buffer
    return bytes(jpeg_data)

# Callback function for human tracking and obstacle avoidance
def func(change):
    """Follow the selected person based on the newest color frame.

    Observer callback for ``camera.color_value``. It runs YOLO on the new frame,
    looks up the depth at the centre of every detected person, keeps following
    one person across frames, issues motor commands, and refreshes the color and
    depth widgets.

    Args:
        change: Change notification dict; ``change['new']`` is the new BGR frame.
    """
    global selected_person, previous_depth_value
    frame = change['new']
    results = model(frame, verbose=False)
    boxes = results[0].boxes  # one input image, so only the first result is relevant
    depth_map = camera.depth_image  # read once so every lookup uses the same map

    print(f"📡 Detected objects: {len(boxes.cls)}")  

    closest_person = None
    min_depth = float('inf')
    person_info = {}  # detection index -> (x_center, depth) for each detected person

    for i in range(len(boxes.cls)):
        if boxes.cls[i] != 0:  # class 0 is a human; skip everything else
            continue

        x_center, depth_value = _person_center_depth(depth_map, boxes.xyxy[i], previous_depth_value)
        previous_depth_value = depth_value
        person_info[i] = (x_center, depth_value)

        print(f"Human detected at index {i} with confidence {boxes.conf[i]} at depth {depth_value} mm")

        # Track the closest person
        if depth_value < min_depth and boxes.conf[i] > conf_threshold:
            min_depth = depth_value
            closest_person = i

    # Assign the closest detected person as the selected person
    if selected_person is None and closest_person is not None:
        selected_person = closest_person
        print(f"First person detected! Tracking person ID: {selected_person}")

    # If the tracked index no longer refers to a person in this frame,
    # reselect the person whose box best overlaps the recent tracking history
    if selected_person is not None and selected_person not in person_info:
        print(" Lost tracked person! Selecting a new closest person.")
        best_match = None
        best_iou = 0

        if tracking_history:
            for i in person_info:
                new_bbox = boxes.xyxy[i]
                avg_iou = sum(iou(new_bbox, old_bbox) for old_bbox in tracking_history) / len(tracking_history)
                if avg_iou > best_iou:
                    best_iou = avg_iou
                    best_match = i

        selected_person = best_match if best_match is not None else closest_person

    if selected_person is not None:
        # Remember where the tracked person is now, so a later reselection
        # compares against recent positions rather than only the first one.
        tracking_history.append(boxes.xyxy[selected_person])

        # Use the tracked person's own centre and depth, not the values left
        # over from the last detection visited in the loop above.
        x_center, depth_value = person_info[selected_person]

        print(f"Tracking person depth: {depth_value} mm")

        if depth_value < safe_distance or np.isnan(depth_value):  
            print("Obstacle detected! Stopping to avoid collision.")
            robot.stop()
        else:
            if depth_value > target_distance:
                print("Moving forward towards target")
                robot.forward(0.8)  

            frame_center = camera.width // 2
            if x_center < frame_center - turn_threshold:
                print("Turning left to follow target.")
                robot.left(0.5)  

            elif x_center > frame_center + turn_threshold:
                print("Turning right to follow target.")
                robot.right(0.5)  
    # NOTE(review): when no person is tracked, no motor command is issued, so the
    # robot keeps executing its previous command — confirm this is intended.

    # Ensure valid depth values for display (convert NaN and +/-inf to 0)
    depth_image = depth_map.copy()
    depth_image[~np.isfinite(depth_image)] = 0
    depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_image, alpha=0.03), cv2.COLORMAP_JET)

    # Update the widgets
    scale = 0.3
    resized_color = cv2.resize(frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
    resized_depth = cv2.resize(depth_colormap, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)

    display_color.value = bgr8_to_jpeg(resized_color)
    display_depth.value = bgr8_to_jpeg(resized_depth)


def _person_center_depth(depth_map, bbox, last_valid_depth):
    """Return (x_center, depth) for the centre pixel of an xyxy bounding box.

    A centre outside the depth map yields ``target_distance``; a NaN reading
    is replaced by ``last_valid_depth``.
    """
    x_center = int((bbox[0] + bbox[2]) / 2)
    y_center = int((bbox[1] + bbox[3]) / 2)

    rows, cols = depth_map.shape[:2]
    if 0 <= y_center < rows and 0 <= x_center < cols:
        depth_value = depth_map[y_center, x_center]
    else:
        depth_value = target_distance  

    if depth_value is None or np.isnan(depth_value):
        print("⚠️ Invalid depth value detected! Using last known depth.")
        depth_value = last_valid_depth  

    return x_center, depth_value

# Register func as an observer of the camera's color_value trait, so it is
# called with a change dict each time a new color frame is assigned.
camera.observe(func, names=['color_value'])