In [1]:
import cv2
import numpy as np
import torch
from PIL import Image, ImageDraw
from ultralytics import YOLO

In [3]:
# Instantiate both Ultralytics YOLO models from their weight files in one pass:
# the first entry is the object detector, the second the pose estimator.
detector, pose_model = (
    YOLO(weights_file) for weights_file in ('yolo11n.pt', 'yolo11n-pose.pt')
)

In [4]:
def preprocess_for_detector(frame, target_size=640):
    """
    Convert a BGR image into a batched float tensor for the object detector.

    The image is resized to a ``target_size`` x ``target_size`` square (the
    aspect ratio is not preserved), converted from BGR to RGB, reordered from
    HWC to CHW, divided by 255 and given a leading batch dimension.

    Args:
        frame: Image array in OpenCV BGR channel order, shape (H, W, 3).
            Dividing by 255 yields values in [0, 1] for 8-bit input.
        target_size: Side length, in pixels, of the square output.

    Returns:
        A float32 ``torch.Tensor`` of shape (1, 3, target_size, target_size).

    Raises:
        ValueError: If ``frame`` is ``None`` or contains no pixels, e.g. when
            an image read or a video capture failed upstream.
    """
    # Fail early with a readable message; OpenCV would otherwise abort inside
    # resize() with an opaque assertion about an empty source size.
    if frame is None or np.size(frame) == 0:
        raise ValueError("preprocess_for_detector: frame is None or empty")

    resized = cv2.resize(frame, (target_size, target_size))
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    # HWC -> CHW, then scale and add the batch axis.
    chw = torch.from_numpy(rgb).permute(2, 0, 1)
    return (chw.float() / 255.0).unsqueeze(0)

In [5]:
def preprocess_for_pose(roi, target_size=256):
    """
    Convert a cropped BGR region into a batched float tensor for the pose model.

    The crop is resized to a ``target_size`` x ``target_size`` square (the
    aspect ratio is not preserved), converted from BGR to RGB, reordered from
    HWC to CHW, divided by 255 and given a leading batch dimension.

    Args:
        roi: Cropped image array in OpenCV BGR channel order, shape (H, W, 3).
            Dividing by 255 yields values in [0, 1] for 8-bit input.
        target_size: Side length, in pixels, of the square output.

    Returns:
        A float32 ``torch.Tensor`` of shape (1, 3, target_size, target_size).

    Raises:
        ValueError: If ``roi`` is ``None`` or contains no pixels, which happens
            when the crop was taken with a degenerate or out-of-frame box.
    """
    # An empty slice is a valid NumPy array but makes OpenCV's resize() abort
    # with an opaque assertion, so reject it here with a clear message.
    if roi is None or np.size(roi) == 0:
        raise ValueError("preprocess_for_pose: roi is None or empty")

    resized = cv2.resize(roi, (target_size, target_size))
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    # HWC -> CHW, then scale and add the batch axis.
    chw = torch.from_numpy(rgb).permute(2, 0, 1)
    return (chw.float() / 255.0).unsqueeze(0)

In [6]:
def _to_numpy(values):
    """Return ``values`` as a NumPy array, moving torch tensors to the CPU first."""
    if isinstance(values, torch.Tensor):
        return values.detach().cpu().numpy()
    return np.asarray(values)


def _split_detector_output(outputs):
    """Pull (boxes, scores, labels) out of any supported detector output layout."""
    if isinstance(outputs, (list, tuple)):
        if not outputs:
            return (), (), ()
        # One entry per input image; the input here is a batch of one.
        outputs = outputs[0]

    # Ultralytics-style result object: boxes live on `.boxes` as xyxy/conf/cls.
    result_boxes = getattr(outputs, 'boxes', None)
    if result_boxes is not None and hasattr(result_boxes, 'xyxy'):
        return result_boxes.xyxy, result_boxes.conf, result_boxes.cls

    # Original contract: a mapping keyed by 'boxes', 'scores' and 'labels'.
    return outputs['boxes'], outputs['scores'], outputs['labels']


def run_detector(model, input_tensor, conf_threshold=0.5):
    """
    Run the object detector and keep only confident human detections.

    Supported model outputs:
      - a dictionary with keys 'boxes', 'scores' and 'labels';
      - a list/tuple whose first element is such a dictionary;
      - an Ultralytics-style result (or list of them) exposing
        ``.boxes.xyxy``, ``.boxes.conf`` and ``.boxes.cls``.
    Only the first image of a batched output is used.

    Args:
        model: Callable detector; invoked as ``model(input_tensor)``.
        input_tensor: Preprocessed input batch passed to the model unchanged.
        conf_threshold: Minimum score (inclusive) a detection must reach.

    Returns:
        A list of dicts ``{'bbox', 'score', 'class'}``. 'bbox' is a NumPy row
        ``[x1, y1, x2, y2]`` in the coordinate space the model reports,
        'score' a NumPy scalar, and 'class' is always 'human'. Class index 0
        is assumed to be the human class.
    """
    with torch.no_grad():
        outputs = model(input_tensor)

    boxes, scores, labels = (
        _to_numpy(part) for part in _split_detector_output(outputs)
    )

    return [
        {'bbox': bbox, 'score': score, 'class': 'human'}
        for bbox, score, label in zip(boxes, scores, labels)
        if label == 0 and score >= conf_threshold  # human class only
    ]

In [7]:
def run_pose_model(model, input_tensor):
    """
    Run the pose model and return the keypoints of the first detected instance.

    Supported model outputs:
      - a dictionary whose 'keypoints' entry has shape (1, num_keypoints, 2);
      - a list/tuple whose first element is such a dictionary;
      - an Ultralytics-style result (or list of them) exposing
        ``.keypoints.xy`` with shape (num_instances, num_keypoints, 2).
    Only the first image of a batched output is used.

    Args:
        model: Callable pose model; invoked as ``model(input_tensor)``.
        input_tensor: Preprocessed input batch passed to the model unchanged.

    Returns:
        A NumPy array of shape (num_keypoints, 2) holding the coordinates the
        model reports for the first instance, or an empty array of shape
        (0, 2) when the model produced no instances.
    """
    with torch.no_grad():
        outputs = model(input_tensor)

    if isinstance(outputs, (list, tuple)):
        if not outputs:
            return np.empty((0, 2), dtype=np.float32)
        # One entry per input image; the input here is a batch of one.
        outputs = outputs[0]

    # Prefer the Ultralytics-style attribute layout, else the dict contract.
    result_keypoints = getattr(outputs, 'keypoints', None)
    if result_keypoints is not None and hasattr(result_keypoints, 'xy'):
        coords = result_keypoints.xy
    else:
        coords = outputs['keypoints']

    if isinstance(coords, torch.Tensor):
        coords = coords.detach().cpu().numpy()
    else:
        coords = np.asarray(coords)

    # Nothing detected: return an empty keypoint set instead of an IndexError.
    if coords.size == 0:
        return np.empty((0, 2), dtype=coords.dtype)
    return coords[0]