# Tutorial 5 -- Human Detection 
In this tutorial, we will use multithreading to capture both color and depth images. We will then employ the YOLOv11 neural network architecture for human detection. To ensure real-time performance, we will utilize the TensorRT deep learning framework. For more details, visit the following website: https://docs.ultralytics.com/guides/nvidia-jetson/#convert-model-to-tensorrt-and-run-inference

In [None]:
# I have uploaded two YOLO TensorRT models to the LEARN page: the YOLO11N FP32 version and the YOLO11L FP16 version. The first model, 'yolo11n.engine', runs faster but has limited detection accuracy. The second model, 'yolo11l_half.engine', is the FP16 version, as indicated by 'half' in its name.
# If you need other versions, please refer to the following link:
# https://docs.ultralytics.com/modes/export/#arguments

# Below is the code I used to convert the YOLO11L FP16 model
# from ultralytics import YOLO
# model = YOLO("yolo11l.pt")  
# model.export(format="engine",half=True)  # FP16
# Note: Generating the 'yolo11l.engine' file may take a long time.

### Step 1 Run YOLOV11 on recorded video data

Before running the program, download the pre-trained detection model and the video file 'color_video.avi', then place them in the current folder.

In [None]:
import cv2
from ultralytics import YOLO
import ipywidgets.widgets as widgets
from IPython.display import display


# Build two JPEG image widgets and show them next to each other:
# the left one shows YOLO's own annotated frame, the right one shows the
# frame with only the confident human boxes drawn.
display_color = widgets.Image(width='45%', format='jpeg')
display_depth = widgets.Image(width='45%', format='jpeg')
layout = widgets.Layout(width='100%')

# Horizontal container holding both images
sidebyside = widgets.HBox(children=[display_color, display_depth], layout=layout)
display(sidebyside)  # render the container in the notebook output

#Convert a NumPy array to JPEG-encoded data for display
def bgr8_to_jpeg(value):
    """Encode an image array as JPEG and return the encoded bytes.

    Args:
        value: image array accepted by ``cv2.imencode`` (the callers in this
            notebook pass BGR frames).

    Returns:
        bytes: the JPEG-encoded image, suitable for ``widgets.Image.value``.
    """
    # imencode returns (success_flag, encoded_buffer); only the buffer is used.
    _retval, encoded = cv2.imencode('.jpg', value)
    return bytes(encoded)

# Load the YOLO model (TensorRT engine file placed in the current folder).
model = YOLO("yolo11l_half.engine")
# model = YOLO("yolo11n.engine")

# Open the video file
video_path = "color_video.avi"
cap = cv2.VideoCapture(video_path)

# Set the video codec and save the processed video.
fourcc = cv2.VideoWriter_fourcc(*'XVID')  # XVID codec, written into an .avi container
# The writer's frame size must match the frames passed to write(), so take it
# from the input video. Fall back to VGA (672x376) if the capture reports 0,
# e.g. because the file could not be opened.
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) or 672
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) or 376
fps = 30
color_file = cv2.VideoWriter('color_video_processed.avi', fourcc, fps, (width, height))

# Loop-invariant settings
# Confidence threshold used to filter out unconfident boxes
# https://docs.ultralytics.com/modes/predict/#boxes
conf_threshold = 0.5
scale = 0.3  # down-scaling factor for the notebook preview images

try:
    # Loop through the video frames
    while cap.isOpened():
        # Read a frame from the video
        success, frame = cap.read()
        if not success:
            # Break the loop if the end of the video is reached
            break

        # Start tick for optional timing (see the commented line at the end of the loop)
        t1 = cv2.getTickCount()

        # Run YOLO inference on the frame
        results = model(frame, verbose=False)

        # Visualize the results on the frame and append it to the output video
        annotated_frame = results[0].plot()
        color_file.write(annotated_frame)

        # Second view: draw only confident human detections on a copy of the frame
        framed = frame.copy()
        for result in results:
            boxes = result.boxes
            for i in range(len(boxes.cls)):
                # class 0 is the human subject; skip everything else
                if boxes.cls[i] == 0 and boxes.conf[i] > conf_threshold:
                    bbox = boxes.xyxy[i]  # corner format: x1, y1, x2, y2
                    cv2.rectangle(framed, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)

        # Shrink both views before pushing them to the widgets
        resized_image = cv2.resize(annotated_frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
        resized_image2 = cv2.resize(framed, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
        display_color.value = bgr8_to_jpeg(resized_image)
        display_depth.value = bgr8_to_jpeg(resized_image2)

        # total_time = (cv2.getTickCount() - t1) / cv2.getTickFrequency()
finally:
    # Release the capture and the writer even if the loop is interrupted
    # (e.g. the cell is stopped) or inference raises.
    cap.release()
    color_file.release()

### Step 2 Start the ZED2i Camera system

In [None]:
# You will need to load the YOLO model if you skip the first code block.
# from ultralytics import YOLO
# model = YOLO("yolo11l_half.engine")

#Start the camera system
import traitlets
import cv2
import numpy as np
import pyzed.sl as sl
import math
import numpy as np
import sys
import math
import threading
from traitlets.config.configurable import SingletonConfigurable

# Define a Camera class that inherits from SingletonConfigurable
class Camera(SingletonConfigurable):
    """ZED camera wrapper that captures color and depth frames on a background thread.

    Each successfully grabbed frame is published as:
      * ``depth_image``      -- depth map from ``sl.MEASURE.DEPTH`` (units are
                                millimetres, as configured in ``__init__``)
      * ``color_value_BGRA`` -- raw left image as returned by the SDK
      * ``color_value``      -- left image converted to BGR; this is a traitlets
                                trait, so ``camera.observe(..., names=['color_value'])``
                                callbacks fire on every new frame.
    """

    color_value = traitlets.Any()  # monitor the color_value variable

    def __init__(self):
        """Open the ZED camera and allocate the image/depth buffers.

        Prints the SDK status and calls ``exit(1)`` if the camera cannot be opened.
        """
        super(Camera, self).__init__()

        self.zed = sl.Camera()
        # Create a InitParameters object and set configuration parameters
        init_params = sl.InitParameters()
        init_params.camera_resolution = sl.RESOLUTION.VGA  # VGA(672*376), HD720(1280*720), HD1080 (1920*1080) or ...
        init_params.depth_mode = sl.DEPTH_MODE.ULTRA  # Use ULTRA depth mode
        init_params.coordinate_units = sl.UNIT.MILLIMETER  # Use millimeter units (for depth measurements)

        # Open the camera
        status = self.zed.open(init_params)
        if status != sl.ERROR_CODE.SUCCESS:  # Ensure the camera has opened successfully
            print("Camera Open : "+repr(status)+". Exit program.")
            self.zed.close()
            exit(1)

        # Create and set RuntimeParameters after opening the camera
        self.runtime = sl.RuntimeParameters()

        # Flag to control the capture thread (attribute name kept as-is for compatibility)
        self.thread_runnning_flag = False

        # Get the height and width reported by the opened camera
        camera_info = self.zed.get_camera_information()
        self.width = camera_info.camera_configuration.resolution.width
        self.height = camera_info.camera_configuration.resolution.height

        # Pre-allocated CPU buffers that the SDK fills on every frame
        self.image = sl.Mat(self.width, self.height, sl.MAT_TYPE.U8_C4, sl.MEM.CPU)
        self.depth = sl.Mat(self.width, self.height, sl.MAT_TYPE.F32_C1, sl.MEM.CPU)
        self.point_cloud = sl.Mat(self.width, self.height, sl.MAT_TYPE.F32_C4, sl.MEM.CPU)

    def _capture_frames(self):  # For data capturing only
        """Thread body: grab frames until ``thread_runnning_flag`` is cleared."""
        while self.thread_runnning_flag:
            if self.zed.grab(self.runtime) != sl.ERROR_CODE.SUCCESS:
                continue  # no new frame available; try again

            # Retrieve Left image
            self.zed.retrieve_image(self.image, sl.VIEW.LEFT)
            # Retrieve depth map. Depth is aligned on the left image
            self.zed.retrieve_measure(self.depth, sl.MEASURE.DEPTH)

            # Publish the depth map BEFORE assigning color_value: observers of
            # color_value run synchronously inside that assignment and read
            # depth_image, so it must already exist and belong to this frame.
            self.depth_image = np.asanyarray(self.depth.get_data())

            self.color_value_BGRA = self.image.get_data()
            # Assigning the trait notifies the registered observers.
            self.color_value = cv2.cvtColor(self.color_value_BGRA, cv2.COLOR_BGRA2BGR)

    def start(self):  # start the data capture thread
        """Start the capture thread; does nothing if it is already running."""
        if not self.thread_runnning_flag:  # only process if no thread is running yet
            self.thread_runnning_flag = True  # keeps the loop in _capture_frames alive
            self.thread = threading.Thread(target=self._capture_frames)  # link thread with the function
            self.thread.start()  # start the thread

    def stop(self):  # stop the data capture thread
        """Signal the capture thread to finish and wait for it to exit."""
        if self.thread_runnning_flag:
            self.thread_runnning_flag = False  # exit the while loop in the _capture_frames
            self.thread.join()  # wait the exiting of the thread

def bgr8_to_jpeg(value):
    """Convert an image array to JPEG-encoded bytes for display in an image widget.

    Args:
        value: image array accepted by ``cv2.imencode``.

    Returns:
        bytes: the JPEG-encoded data.
    """
    # Second element of imencode's result is the encoded buffer.
    encode_result = cv2.imencode('.jpg', value)
    jpeg_buffer = encode_result[1]
    return bytes(jpeg_buffer)

# Create the camera object (Camera.__init__ opens the ZED camera)
camera = Camera()
camera.start() # start the background thread that captures color and depth data

In [None]:
class Robot:
    """Small motion interface on top of the JetBot driver.

    When the ``jetbot`` package cannot be imported, ``self.bot`` is ``None``
    and every command only prints its status line (simulation mode).
    """

    def __init__(self):
        # Initialize the robot's motor control system
        try:
            from jetbot import Robot as _JetBotRobot
            self.bot = _JetBotRobot()
        except ImportError:
            # Non-JetBot system: no hardware driver, just simulate actions
            self.bot = None

    def _dispatch(self, command, message, *args):
        """Call ``self.bot.<command>(*args)`` if a driver exists, then print ``message``."""
        driver = self.bot
        if driver:
            getattr(driver, command)(*args)
        print(message)

    def move_forward(self, speed=0.4):
        """Drive forward at ``speed``."""
        self._dispatch("forward", "Robot moving forward.", speed)

    def stop(self):
        """Stop the motors."""
        self._dispatch("stop", "Robot stopping.")

    def turn_left(self, speed=0.3):
        """Turn left at ``speed``."""
        self._dispatch("left", "Robot turning left.", speed)

    def turn_right(self, speed=0.3):
        """Turn right at ``speed``."""
        self._dispatch("right", "Robot turning right.", speed)

# Create a global robot instance; the Step 3 callback `func` drives the robot through it
robot = Robot()

### Step 3 Perform object detection on live video data

In [None]:
import cv2
import ipywidgets.widgets as widgets
from IPython.display import display
import time

# Image widgets for the live view: color frame on the left, depth colormap on the right
display_color = widgets.Image(width='45%', format='jpeg')
display_depth = widgets.Image(width='45%', format='jpeg')
layout = widgets.Layout(width='100%')

# Horizontal container, rendered once; the callback only updates the image values
sidebyside = widgets.HBox(children=[display_color, display_depth], layout=layout)
display(sidebyside)

# Follow state shared with the callback through `global`:
# no person selected yet, and no movement command issued yet
selected_person = previous_action = None
previous_time = time.time()

# Thresholds (distances are compared against depth values, which the camera
# is configured to report in millimetres)
target_distance = 1_500  # Move forward if farther than 1.5m
safe_distance = 1_000  # Stop if closer than 1m
turn_threshold = 80  # Horizontal pixel margin around the frame center before turning

# Callback function, invoked when traitlets detect a change in camera frame
def func(change):
    """Detect people in a new camera frame and steer the robot towards the selected one.

    Intended to be registered as a traitlets observer on ``camera.color_value``.
    Reads the module-level ``model``, ``camera``, ``robot`` and the threshold
    constants, and updates the follow state (``selected_person``,
    ``previous_action``, ``previous_time``). Also refreshes the two display widgets.

    Args:
        change: traitlets change dictionary; ``change['new']`` is the new BGR frame.
            The selected person's bounding box is drawn onto this frame in place.
    """
    global selected_person, previous_action, previous_time
    frame = change['new']
    results = model(frame, verbose=False)  # Run YOLO detection

    conf_threshold = 0.5  # Minimum confidence for detection

    for result in results:  # Process only one frame at a time
        boxes = result.boxes
        for i in range(len(boxes.cls)):
            if boxes.cls[i] != 0:  # class 0 is the human subject; ignore other classes
                continue
            # Ignore unconfident detections so they cannot steer the robot
            # (same filter as the recorded-video loop in Step 1).
            if not (boxes.conf[i] > conf_threshold):
                continue

            # NOTE(review): selected_person stores the box index within a frame,
            # not a stable identity; the same index may refer to a different
            # person in a later frame. A tracker would be needed for a true ID.
            if selected_person is None:
                selected_person = i  # Auto-assign first detected person
                print(f"Auto-selected person ID: {selected_person}")

            if i != selected_person:
                continue

            bbox = boxes.xyxy[i]  # corner format: x1, y1, x2, y2
            x_center = int((bbox[0] + bbox[2]) / 2)
            y_center = int((bbox[1] + bbox[3]) / 2)

            # Ensure depth value index is within bounds
            if 0 <= y_center < camera.depth_image.shape[0] and 0 <= x_center < camera.depth_image.shape[1]:
                depth_value = camera.depth_image[y_center, x_center]
            else:
                # Depth unknown: use target_distance, which triggers neither
                # the "move forward" nor the "stop" branch below.
                depth_value = target_distance

            current_time = time.time()
            time_diff = current_time - previous_time
            movement_action = None

            # A command is (re)issued when the action changes or when more
            # than 0.5 s have passed since the last recorded action.

            # Move forward if too far
            if depth_value > target_distance:
                movement_action = "move_forward"
                if previous_action != movement_action or time_diff > 0.5:
                    print("Moving forward towards target")
                    robot.move_forward()

            # Stop if too close
            elif depth_value < safe_distance:
                movement_action = "stop"
                if previous_action != movement_action or time_diff > 0.5:
                    print("Stopping to maintain safe distance")
                    robot.stop()

            # Adjust direction based on target position. A turn decided here
            # replaces the forward/stop action recorded for this frame.
            frame_center = camera.width // 2
            if x_center < frame_center - turn_threshold:
                movement_action = "turn_left"
                if previous_action != movement_action or time_diff > 0.5:
                    print("Turning left to follow target.")
                    robot.turn_left()
            elif x_center > frame_center + turn_threshold:
                movement_action = "turn_right"
                if previous_action != movement_action or time_diff > 0.5:
                    print("Turning right to follow target.")
                    robot.turn_right()

            # Update previous action and time
            if movement_action:
                previous_action = movement_action
                previous_time = current_time

            # Draw bounding box around the selected person
            x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)  # Green for selected person

    # NOTE(review): when no person is detected no command is sent, so the robot
    # keeps executing its last command; see the collision-avoidance task.

    # Scaling is necessary for real-time data display.
    scale = 0.1
    resized_image = cv2.resize(frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
    display_color.value = bgr8_to_jpeg(resized_image)

    # Depth map visualization
    depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(camera.depth_image, alpha=0.03), cv2.COLORMAP_JET)
    resized_depth_colormap = cv2.resize(depth_colormap, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
    display_depth.value = bgr8_to_jpeg(resized_depth_colormap)

# Register func as an observer of the color_value trait, so it is called
# each time the capture thread assigns a new color frame.
camera.observe(func, names=['color_value'])

## Tasks

1. Please try to calculate the distance between the detected human and the robot using the depth image. (Note: You can refer to Tutorial 2 to obtain the depth information for a specific point.) 

2. Please try to add a collision avoidance function to this program to protect the robot.  

3. Think about how to control the robot so that it moves towards the detected human. 