In [1]:
!pip install transformers torch torchvision pillow requests numpy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
!pip install rtmlib opencv-python

Collecting rtmlib
  Downloading rtmlib-0.0.13-py3-none-any.whl.metadata (17 kB)
Downloading rtmlib-0.0.13-py3-none-any.whl (48 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.3/48.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rtmlib
Successfully installed rtmlib-0.0.13


In [5]:
!pip install onnxruntime-gpu

Collecting onnxruntime-gpu
  Downloading onnxruntime_gpu-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting coloredlogs (from onnxruntime-gpu)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime-gpu)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime_gpu-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (283.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.2/283.2 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m4.7 MB/s[0m eta [36m0:

### **For Images**

In [13]:
import os
import json
import numpy as np
from PIL import Image
import cv2
from rtmlib import Body, draw_skeleton

# Configuration: where images are read from, where results go, and which
# inference runtime rtmlib should use.
INPUT_DIR = '/kaggle/input/demo/other/default/1'  # Directory containing input images
OUTPUT_DIR = "/kaggle/working/output_results"  # Directory to save results
device = 'cuda'  # 'cpu', 'cuda', or 'mps'
backend = 'onnxruntime'  # 'opencv', 'onnxruntime', 'openvino'

# Create the output tree up front: the results root plus the images/ and data/
# subfolders that save_results() writes into.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "images"), exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "data"), exist_ok=True)

# Build the rtmlib Body pipeline in 'balanced' mode.
# NOTE(review): the run log printed under this cell shows this configuration
# loading a YOLOX-m detector checkpoint and an RTMPose-m pose checkpoint, not an
# RTMO model.
print("Loading pose estimation model...")
body_estimator = Body(
    mode='balanced',  # one of 'performance', 'lightweight', 'balanced'
    backend=backend,
    device=device
)

# COCO keypoint names (17 keypoints); list position is used as the keypoint id
# when labelling points in the text report.
keypoint_labels = [
    "nose", "left_eye", "right_eye", "left_ear", "right_ear",
    "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
    "left_wrist", "right_wrist", "left_hip", "right_hip",
    "left_knee", "right_knee", "left_ankle", "right_ankle"
]

# Reporting thresholds: save_results() omits keypoints scoring below
# point_threshold and persons whose box score is below box_threshold.
point_threshold = 0.3
box_threshold = 0.3

def process_image(image_path):
    """Run pose estimation on one image file.

    The image is read with OpenCV and passed to ``body_estimator``. For every
    detected person a bounding box is derived from the keypoints whose score
    exceeds 0.1, and the box score is the mean of those keypoint scores.

    Args:
        image_path: Path of the image to read.

    Returns:
        ``None`` when the image cannot be read; otherwise a dict with
        ``image_shape``, ``predicted_boxes`` (``[xmin, ymin, xmax, ymax, score]``
        per person), ``predicted_points`` (``[x, y, score]`` per keypoint) and
        the unfiltered ``raw_keypoints`` / ``raw_scores``.
    """
    print(f"Processing: {image_path}")

    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Could not load image {image_path}")
        return None

    keypoints, scores = body_estimator(image)

    predicted_boxes = []
    predicted_points = []

    for idx, kpts in enumerate(keypoints):
        confs = scores[idx]

        # Keypoints scoring above 0.1 count as visible for the box extent.
        is_visible = confs > 0.1
        visible_xy = kpts[is_visible]
        if len(visible_xy) == 0:
            # A person with no visible keypoint gets neither a box nor points.
            continue

        xs = visible_xy[:, 0]
        ys = visible_xy[:, 1]
        mean_conf = float(np.mean(confs[is_visible]))
        predicted_boxes.append([
            float(np.min(xs)),
            float(np.min(ys)),
            float(np.max(xs)),
            float(np.max(ys)),
            mean_conf,
        ])

        # All keypoints are kept here (not only the visible ones), as plain floats.
        predicted_points.append(
            [[float(x), float(y), float(confs[j])] for j, (x, y) in enumerate(kpts)]
        )

    raw_keypoints = keypoints.tolist() if isinstance(keypoints, np.ndarray) else keypoints
    raw_scores = scores.tolist() if isinstance(scores, np.ndarray) else scores

    return {
        'image_shape': image.shape,
        'predicted_boxes': predicted_boxes,
        'predicted_points': predicted_points,
        'raw_keypoints': raw_keypoints,
        'raw_scores': raw_scores
    }

def save_results(image_path, results, filename_base):
    """Write the JSON dump, text report and skeleton overlay for one image.

    Args:
        image_path: Path of the source image; it is re-read for the overlay.
        results: Dict produced by ``process_image``.
        filename_base: File-name stem used for all three output files.

    Persons scoring below ``box_threshold`` and keypoints scoring below
    ``point_threshold`` are left out of the text report (and of the console
    echo); the JSON file always contains the full ``results`` dict.
    """
    data_dir = os.path.join(OUTPUT_DIR, "data")

    # Full machine-readable dump.
    json_path = os.path.join(data_dir, f"{filename_base}_pose_data.json")
    with open(json_path, 'w') as json_file:
        json.dump(results, json_file, indent=2)

    # Human-readable report, echoed to stdout line by line.
    txt_path = os.path.join(data_dir, f"{filename_base}_pose_results.txt")
    with open(txt_path, 'w') as report:
        report.write(f"Pose Estimation Results for: {os.path.basename(image_path)}\n")
        report.write("=" * 60 + "\n\n")

        for person_idx, box in enumerate(results['predicted_boxes']):
            if len(box) != 5:
                continue
            xmin, ymin, xmax, ymax, box_score = box
            if box_score < box_threshold:
                continue

            x1, y1, x2, y2 = (round(v, 2) for v in (xmin, ymin, xmax, ymax))
            result_line = f"Found person at [{x1}, {y1}, {x2}, {y2}] with score {box_score:.3f}\n"
            print(result_line.strip())
            report.write(result_line)

            per_person_points = results['predicted_points']
            if person_idx >= len(per_person_points):
                continue

            for point_id, point in enumerate(per_person_points[person_idx]):
                if len(point) < 3:
                    continue
                x, y, point_score = point[:3]
                if point_score < point_threshold:
                    continue

                if point_id < len(keypoint_labels):
                    label = keypoint_labels[point_id]
                else:
                    label = f"point_{point_id}"

                point_line = f"  - {label}: ({round(x, 2)}, {round(y, 2)}) with score {point_score:.3f}\n"
                print(point_line.strip())
                report.write(point_line)
            report.write("\n")

    # Skeleton overlay drawn on a fresh copy of the image read from disk.
    canvas = cv2.imread(image_path)
    img_with_pose = draw_skeleton(
        canvas,
        np.array(results['raw_keypoints']),
        np.array(results['raw_scores']),
        kpt_thr=0.3,
    )

    output_image_path = os.path.join(OUTPUT_DIR, "images", f"{filename_base}_pose_estimation.jpg")
    cv2.imwrite(output_image_path, img_with_pose)

    print("Results saved:")
    print(f"  - JSON data: {json_path}")
    print(f"  - Text results: {txt_path}")
    print(f"  - Visualization: {output_image_path}")

def main():
    """Process every supported image found directly inside ``INPUT_DIR``.

    If ``INPUT_DIR`` does not exist the function tries to create it, tells the
    user to add images, and returns. Otherwise each regular file with a
    supported extension is run through ``process_image`` and ``save_results``
    in sorted name order. A failure on one image is reported and does not stop
    the remaining images.
    """
    if not os.path.exists(INPUT_DIR):
        print(f"Creating input directory: {INPUT_DIR}")
        try:
            os.makedirs(INPUT_DIR)
        except OSError as e:
            # Creation can fail (for example on a read-only mount); report it
            # instead of aborting with a traceback.
            print(f"Could not create '{INPUT_DIR}': {e}")
            return
        print(f"Please place your images in the '{INPUT_DIR}' directory and run again.")
        return

    # A tuple keeps the printed order stable and can be passed to str.endswith.
    supported_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp')

    # os.listdir returns names in arbitrary order; sort for reproducible runs,
    # and ignore directories that merely have an image-like name.
    image_files = sorted(
        name for name in os.listdir(INPUT_DIR)
        if name.lower().endswith(supported_extensions)
        and os.path.isfile(os.path.join(INPUT_DIR, name))
    )

    if not image_files:
        print(f"No supported image files found in '{INPUT_DIR}' directory.")
        print(f"Supported formats: {', '.join(supported_extensions)}")
        return

    print(f"Found {len(image_files)} image(s) to process:")
    for img_file in image_files:
        print(f"  - {img_file}")
    print()

    for image_file in image_files:
        image_path = os.path.join(INPUT_DIR, image_file)
        filename_base = os.path.splitext(image_file)[0]

        try:
            results = process_image(image_path)

            if results is not None:
                save_results(image_path, results, filename_base)
                print("-" * 60)
            else:
                print(f"Failed to process {image_file}")

        except Exception as e:
            # Best effort: one bad image must not abort the whole batch.
            print(f"Error processing {image_file}: {str(e)}")
            continue

    print("\nProcessing complete!")
    print(f"Check the '{OUTPUT_DIR}' directory for results:")
    print(f"  - '{OUTPUT_DIR}/images/' contains pose estimation visualizations")
    print(f"  - '{OUTPUT_DIR}/data/' contains JSON data and text results")

# Entry point: run the image pipeline when this cell (or the file as a script)
# is executed directly.
if __name__ == "__main__":
    main()

Loading pose estimation model...
load /root/.cache/rtmlib/hub/checkpoints/yolox_m_8xb8-300e_humanart-c2c7a14a.onnx with onnxruntime backend


[0;93m2025-08-28 05:01:13.943517556 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-08-28 05:01:13.943545899 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m
[0;93m2025-08-28 05:01:14.037578218 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-08-28 05:01:14.037601458 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


load /root/.cache/rtmlib/hub/checkpoints/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.onnx with onnxruntime backend
Found 1 image(s) to process:
  - demo.jpg

Processing: /kaggle/input/demo/other/default/1/demo.jpg
Found person at [661.52, 556.54, 1940.59, 2656.29] with score 0.975
- nose: (1334.33, 793.13) with score 0.978
- left_eye: (1386.08, 756.17) with score 0.988
- right_eye: (1304.75, 756.17) with score 1.048
- left_ear: (1452.62, 807.92) with score 0.944
- right_ear: (1260.39, 800.53) with score 0.966
- left_shoulder: (1556.13, 1000.15) with score 0.900
- right_shoulder: (1134.7, 992.76) with score 0.933
- left_elbow: (1829.69, 807.92) with score 1.027
- right_elbow: (853.75, 830.1) with score 0.995
- left_wrist: (1940.59, 556.54) with score 1.055
- right_wrist: (661.52, 571.33) with score 1.060
- left_hip: (1460.02, 1672.96) with score 0.871
- right_hip: (1179.06, 1672.96) with score 0.887
- left_knee: (1622.67, 2138.74) with score 0.979
- right_knee: (1031.1

### **For Videos**

In [15]:
import os
import json
import numpy as np
from PIL import Image
import cv2
from rtmlib import Body, draw_skeleton

# Configuration: where videos are read from, where results go, and which
# inference runtime rtmlib should use.
INPUT_DIR = '/kaggle/input/rtmo/other/default/1'  # Directory containing input videos
OUTPUT_DIR = "/kaggle/working/video_output"  # Directory to save results
device = 'cuda'  # 'cpu', 'cuda', or 'mps'
backend = 'onnxruntime'  # 'opencv', 'onnxruntime', 'openvino'

# Video processing settings
SAVE_EVERY_N_FRAMES = 10  # Save detailed results every N frames (to avoid too many files)
FRAME_SKIP = 1  # Process every N frames (1 = process all frames, 2 = every other frame)
MAX_FRAMES = None  # Maximum frames to process (None = process entire video)

# Create the output tree up front: the results root plus the videos/, frames/
# and data/ subfolders written by process_video() and save_frame_results().
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "videos"), exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "frames"), exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "data"), exist_ok=True)

# Build the rtmlib Body pipeline in 'balanced' mode.
# NOTE(review): the run log printed under this cell shows this configuration
# loading a YOLOX-m detector checkpoint and an RTMPose-m pose checkpoint, not an
# RTMO model.
print("Loading pose estimation model...")
body_estimator = Body(
    mode='balanced',  # one of 'performance', 'lightweight', 'balanced'
    backend=backend,
    device=device
)

# COCO keypoint names (17 keypoints); list position is used as the keypoint id
# when labelling points in the text reports.
keypoint_labels = [
    "nose", "left_eye", "right_eye", "left_ear", "right_ear",
    "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
    "left_wrist", "right_wrist", "left_hip", "right_hip",
    "left_knee", "right_knee", "left_ankle", "right_ankle"
]

# Reporting thresholds: the per-frame text report omits keypoints scoring below
# point_threshold and persons whose box score is below box_threshold.
point_threshold = 0.3
box_threshold = 0.3

def process_frame(frame):
    """Run pose estimation on one decoded video frame.

    The frame is passed to ``body_estimator``. For every detected person a
    bounding box is derived from the keypoints whose score exceeds 0.1, and the
    box score is the mean of those keypoint scores.

    Args:
        frame: Image array handed straight to ``body_estimator``.

    Returns:
        A dict with ``frame_shape``, ``predicted_boxes``
        (``[xmin, ymin, xmax, ymax, score]`` per person), ``predicted_points``
        (``[x, y, score]`` per keypoint) and the unfiltered ``raw_keypoints`` /
        ``raw_scores``.
    """
    keypoints, scores = body_estimator(frame)

    predicted_boxes = []
    predicted_points = []

    for idx, kpts in enumerate(keypoints):
        confs = scores[idx]

        # Keypoints scoring above 0.1 count as visible for the box extent.
        is_visible = confs > 0.1
        visible_xy = kpts[is_visible]
        if len(visible_xy) == 0:
            # A person with no visible keypoint gets neither a box nor points.
            continue

        xs = visible_xy[:, 0]
        ys = visible_xy[:, 1]
        mean_conf = float(np.mean(confs[is_visible]))
        predicted_boxes.append([
            float(np.min(xs)),
            float(np.min(ys)),
            float(np.max(xs)),
            float(np.max(ys)),
            mean_conf,
        ])

        # All keypoints are kept here (not only the visible ones), as plain floats.
        predicted_points.append(
            [[float(x), float(y), float(confs[j])] for j, (x, y) in enumerate(kpts)]
        )

    raw_keypoints = keypoints.tolist() if isinstance(keypoints, np.ndarray) else keypoints
    raw_scores = scores.tolist() if isinstance(scores, np.ndarray) else scores

    return {
        'frame_shape': frame.shape,
        'predicted_boxes': predicted_boxes,
        'predicted_points': predicted_points,
        'raw_keypoints': raw_keypoints,
        'raw_scores': raw_scores
    }

def save_frame_results(frame_results, filename_base, frame_number):
    """Write the JSON dump and text report for a single video frame.

    Args:
        frame_results: Dict produced by ``process_frame`` (plus any keys the
            caller added).
        filename_base: File-name stem of the source video.
        frame_number: Frame index; zero-padded to six digits in the file names.

    Persons scoring below ``box_threshold`` and keypoints scoring below
    ``point_threshold`` are left out of the text report; the JSON file always
    contains the full ``frame_results`` dict.
    """
    data_dir = os.path.join(OUTPUT_DIR, "data")
    stem = f"{filename_base}_frame_{frame_number:06d}"

    # Full machine-readable dump.
    json_path = os.path.join(data_dir, f"{stem}_pose_data.json")
    with open(json_path, 'w') as json_file:
        json.dump(frame_results, json_file, indent=2)

    # Human-readable report.
    txt_path = os.path.join(data_dir, f"{stem}_pose_results.txt")
    with open(txt_path, 'w') as report:
        report.write(f"Pose Estimation Results for Frame {frame_number}\n")
        report.write("=" * 60 + "\n\n")

        for person_idx, box in enumerate(frame_results['predicted_boxes']):
            if len(box) != 5:
                continue
            xmin, ymin, xmax, ymax, box_score = box
            if box_score < box_threshold:
                continue

            x1, y1, x2, y2 = (round(v, 2) for v in (xmin, ymin, xmax, ymax))
            report.write(f"Found person at [{x1}, {y1}, {x2}, {y2}] with score {box_score:.3f}\n")

            per_person_points = frame_results['predicted_points']
            if person_idx >= len(per_person_points):
                continue

            for point_id, point in enumerate(per_person_points[person_idx]):
                if len(point) < 3:
                    continue
                x, y, point_score = point[:3]
                if point_score < point_threshold:
                    continue

                if point_id < len(keypoint_labels):
                    label = keypoint_labels[point_id]
                else:
                    label = f"point_{point_id}"

                report.write(f"  - {label}: ({round(x, 2)}, {round(y, 2)}) with score {point_score:.3f}\n")
            report.write("\n")

def process_video(video_path):
    """Run pose estimation over a video and write the annotated video plus summaries.

    Every ``FRAME_SKIP``-th frame is passed to ``process_frame``; its skeleton
    overlay is appended to ``<OUTPUT_DIR>/videos/<name>_pose_estimation.mp4``.
    Processed frames whose number is a multiple of ``SAVE_EVERY_N_FRAMES``
    additionally get per-frame JSON/text files and a JPEG. Afterwards a JSON
    file holding all frame results and a text summary are written to
    ``<OUTPUT_DIR>/data``.

    Args:
        video_path: Path of the video file to read.

    Returns:
        ``None`` if the video cannot be opened; otherwise a dict with the keys
        ``output_video``, ``summary_json``, ``summary_txt`` and
        ``processed_frames``.
    """
    print(f"Processing video: {video_path}")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        return None

    # Keep the frame rate as a float: truncating e.g. 29.97 to 29 skews both
    # the timestamps and the playback speed of the written video.
    fps = float(cap.get(cv2.CAP_PROP_FPS))
    if not fps > 0:
        # A missing/zero/NaN frame rate would otherwise cause divisions by
        # zero below; fall back to a nominal rate so processing can continue.
        fallback_fps = 30.0
        print(f"Warning: video reports no valid FPS; assuming {fallback_fps:g} FPS")
        fps = fallback_fps
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    print(f"Video info: {width}x{height}, {fps:g} FPS, {total_frames} frames")

    # Guard against FRAME_SKIP <= 0, which would break the modulo below.
    frame_step = max(1, FRAME_SKIP)

    filename_base = os.path.splitext(os.path.basename(video_path))[0]
    output_video_path = os.path.join(OUTPUT_DIR, "videos", f"{filename_base}_pose_estimation.mp4")

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    # Only every frame_step-th frame is written, so scale the output rate to
    # keep the playback speed of the annotated video equal to the source.
    out = cv2.VideoWriter(output_video_path, fourcc, fps / frame_step, (width, height))
    if not out.isOpened():
        print(f"Warning: could not open video writer for {output_video_path}; "
              "the annotated video will not be saved")

    all_frame_results = []
    frame_number = 0
    processed_frames = 0
    # Progress is reported roughly every 5 seconds of source video.
    progress_interval = max(1, int(round(fps * 5)))

    print("Processing frames...")

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Skip frames if needed
            if frame_number % frame_step != 0:
                frame_number += 1
                continue

            try:
                frame_results = process_frame(frame)

                if frame_results is not None:
                    frame_results['frame_number'] = frame_number
                    frame_results['timestamp'] = frame_number / fps
                    all_frame_results.append(frame_results)

                    keypoints_array = np.array(frame_results['raw_keypoints'])
                    scores_array = np.array(frame_results['raw_scores'])

                    # Draw on a copy so the decoded frame itself stays untouched.
                    frame_with_pose = draw_skeleton(frame.copy(), keypoints_array, scores_array, kpt_thr=0.3)

                    out.write(frame_with_pose)

                    # Save detailed results for every Nth frame
                    if frame_number % SAVE_EVERY_N_FRAMES == 0:
                        save_frame_results(frame_results, filename_base, frame_number)

                        frame_image_path = os.path.join(
                            OUTPUT_DIR, "frames", f"{filename_base}_frame_{frame_number:06d}.jpg"
                        )
                        cv2.imwrite(frame_image_path, frame_with_pose)

                    processed_frames += 1

                    if frame_number % progress_interval == 0:
                        if total_frames > 0:
                            progress = (frame_number / total_frames) * 100
                            print(f"Progress: {progress:.1f}% - Frame {frame_number}/{total_frames}")
                        else:
                            # Frame count unknown: report the position only.
                            print(f"Progress: Frame {frame_number}")

                    if MAX_FRAMES is not None and processed_frames >= MAX_FRAMES:
                        print(f"Reached maximum frames limit: {MAX_FRAMES}")
                        break

            except Exception as e:
                # Best effort: a failing frame is reported and skipped.
                print(f"Error processing frame {frame_number}: {str(e)}")

            frame_number += 1
    finally:
        # Always free the decoder and finalize the output file, even if an
        # unexpected error (or a keyboard interrupt) escapes the loop.
        cap.release()
        out.release()

    duration_seconds = total_frames / fps

    video_summary = {
        'video_path': video_path,
        'video_info': {
            'width': width,
            'height': height,
            'fps': fps,
            'total_frames': total_frames,
            'duration_seconds': duration_seconds
        },
        'processing_info': {
            'frames_processed': processed_frames,
            'frame_skip': FRAME_SKIP,
            'save_every_n_frames': SAVE_EVERY_N_FRAMES
        },
        'frame_results': all_frame_results
    }

    video_json_path = os.path.join(OUTPUT_DIR, "data", f"{filename_base}_complete_analysis.json")
    with open(video_json_path, 'w') as f:
        json.dump(video_summary, f, indent=2)

    # Aggregate statistics over the processed frames.
    total_persons = sum(len(result['predicted_boxes']) for result in all_frame_results)
    avg_persons_per_frame = total_persons / len(all_frame_results) if all_frame_results else 0
    frames_with_detections = sum(
        1 for result in all_frame_results if len(result['predicted_boxes']) > 0
    )

    summary_txt_path = os.path.join(OUTPUT_DIR, "data", f"{filename_base}_video_summary.txt")
    with open(summary_txt_path, 'w') as f:
        f.write("Video Pose Estimation Summary\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Video: {os.path.basename(video_path)}\n")
        f.write(f"Resolution: {width}x{height}\n")
        f.write(f"FPS: {fps:g}\n")
        f.write(f"Duration: {duration_seconds:.2f} seconds\n")
        f.write(f"Total Frames: {total_frames}\n")
        f.write(f"Processed Frames: {processed_frames}\n")
        f.write(f"Frame Skip: {FRAME_SKIP}\n\n")

        f.write("Statistics:\n")
        f.write(f"  - Total person detections: {total_persons}\n")
        f.write(f"  - Average persons per frame: {avg_persons_per_frame:.2f}\n")
        f.write(f"  - Frames with detections: {frames_with_detections}\n")

    return {
        'output_video': output_video_path,
        'summary_json': video_json_path,
        'summary_txt': summary_txt_path,
        'processed_frames': processed_frames
    }

def main():
    """Process every supported video found directly inside ``INPUT_DIR``.

    If ``INPUT_DIR`` does not exist the function tries to create it, tells the
    user to add videos, and returns. Otherwise each regular file with a
    supported extension is run through ``process_video`` in sorted name order.
    A failure on one video is reported and does not stop the remaining videos.
    """
    if not os.path.exists(INPUT_DIR):
        print(f"Creating input directory: {INPUT_DIR}")
        try:
            os.makedirs(INPUT_DIR)
        except OSError as e:
            # Creation can fail (for example on a read-only mount); report it
            # instead of aborting with a traceback.
            print(f"Could not create '{INPUT_DIR}': {e}")
            return
        print(f"Please place your videos in the '{INPUT_DIR}' directory and run again.")
        return

    # A tuple keeps the printed order stable and can be passed to str.endswith.
    supported_extensions = ('.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv', '.webm', '.m4v')

    # os.listdir returns names in arbitrary order; sort for reproducible runs,
    # and ignore directories that merely have a video-like name.
    video_files = sorted(
        name for name in os.listdir(INPUT_DIR)
        if name.lower().endswith(supported_extensions)
        and os.path.isfile(os.path.join(INPUT_DIR, name))
    )

    if not video_files:
        print(f"No supported video files found in '{INPUT_DIR}' directory.")
        print(f"Supported formats: {', '.join(supported_extensions)}")
        return

    print(f"Found {len(video_files)} video(s) to process:")
    for vid_file in video_files:
        print(f"  - {vid_file}")
    print()

    print("Processing settings:")
    print(f"  - Device: {device}")
    print(f"  - Backend: {backend}")
    print(f"  - Frame skip: {FRAME_SKIP} (process every {FRAME_SKIP} frame(s))")
    print(f"  - Save detailed results every: {SAVE_EVERY_N_FRAMES} frames")
    # Compare against None (not truthiness) so the report matches how
    # process_video interprets MAX_FRAMES.
    print(f"  - Max frames: {MAX_FRAMES if MAX_FRAMES is not None else 'unlimited'}")
    print()

    for video_file in video_files:
        video_path = os.path.join(INPUT_DIR, video_file)

        try:
            print(f"{'='*60}")
            print(f"Processing: {video_file}")
            print(f"{'='*60}")

            results = process_video(video_path)

            if results is not None:
                print("\n✅ Video processing complete!")
                print("Results saved:")
                print(f"  - Output video: {results['output_video']}")
                print(f"  - Complete analysis: {results['summary_json']}")
                print(f"  - Text summary: {results['summary_txt']}")
                print(f"  - Processed {results['processed_frames']} frames")
                print(f"  - Frame images saved to: {OUTPUT_DIR}/frames/")
                print()
            else:
                print(f"❌ Failed to process {video_file}")

        except Exception as e:
            # Best effort: one bad video must not abort the whole batch.
            print(f"❌ Error processing {video_file}: {str(e)}")
            continue

    print("\n🎉 All videos processed!")
    print(f"Check the '{OUTPUT_DIR}' directory for results:")
    print(f"  - '{OUTPUT_DIR}/videos/' contains pose estimation videos")
    print(f"  - '{OUTPUT_DIR}/frames/' contains sample frame images with poses")
    print(f"  - '{OUTPUT_DIR}/data/' contains JSON data and text summaries")

def print_frame_results(frame_results, frame_number):
    """Print a compact console summary of one frame's detections.

    Args:
        frame_results: Dict with ``predicted_boxes`` and ``predicted_points``.
        frame_number: Frame index shown in the heading.

    Persons scoring below ``box_threshold`` are skipped. For each remaining
    person the number of keypoints scoring at least ``point_threshold`` is
    printed, followed by at most the first three of them.
    """
    print(f"\nFrame {frame_number} results:")

    for person_idx, box in enumerate(frame_results['predicted_boxes']):
        if len(box) != 5:
            continue
        xmin, ymin, xmax, ymax, box_score = box
        if box_score < box_threshold:
            continue

        x1, y1, x2, y2 = (round(v, 2) for v in (xmin, ymin, xmax, ymax))
        print(f"  Found person at [{x1}, {y1}, {x2}, {y2}] with score {box_score:.3f}")

        per_person_points = frame_results['predicted_points']
        if person_idx >= len(per_person_points):
            continue

        visible_keypoints = []
        for point_id, point in enumerate(per_person_points[person_idx]):
            if len(point) < 3:
                continue
            x, y, point_score = point[:3]
            if point_score >= point_threshold:
                if point_id < len(keypoint_labels):
                    label = keypoint_labels[point_id]
                else:
                    label = f"point_{point_id}"
                visible_keypoints.append(
                    f"{label}: ({round(x, 2)}, {round(y, 2)}) score={point_score:.3f}"
                )

        # Keep the console readable: show the count and only the first three.
        print(f"    Key points: {len(visible_keypoints)} visible")
        for entry in visible_keypoints[:3]:
            print(f"      - {entry}")
        hidden = len(visible_keypoints) - 3
        if hidden > 0:
            print(f"      ... and {hidden} more")

# Entry point: run the video pipeline when this cell (or the file as a script)
# is executed directly.
if __name__ == "__main__":
    main()

Loading pose estimation model...
load /root/.cache/rtmlib/hub/checkpoints/yolox_m_8xb8-300e_humanart-c2c7a14a.onnx with onnxruntime backend
load /root/.cache/rtmlib/hub/checkpoints/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.onnx with onnxruntime backend
Found 1 video(s) to process:
  - test.mp4

Processing settings:
  - Device: cuda
  - Backend: onnxruntime
  - Frame skip: 1 (process every 1 frame(s))
  - Save detailed results every: 10 frames
  - Max frames: unlimited

Processing: test.mp4
Processing video: /kaggle/input/rtmo/other/default/1/test.mp4


[0;93m2025-08-28 05:07:42.582590849 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-08-28 05:07:42.582620602 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m
[0;93m2025-08-28 05:07:42.663185946 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-08-28 05:07:42.663211917 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Video info: 1920x1080, 25 FPS, 315 frames
Processing frames...
Progress: 0.0% - Frame 0/315
Progress: 39.7% - Frame 125/315
Progress: 79.4% - Frame 250/315

✅ Video processing complete!
Results saved:
  - Output video: /kaggle/working/video_output/videos/test_pose_estimation.mp4
  - Complete analysis: /kaggle/working/video_output/data/test_complete_analysis.json
  - Text summary: /kaggle/working/video_output/data/test_video_summary.txt
  - Processed 315 frames
  - Frame images saved to: /kaggle/working/video_output/frames/


🎉 All videos processed!
Check the '/kaggle/working/video_output' directory for results:
  - '/kaggle/working/video_output/videos/' contains pose estimation videos
  - '/kaggle/working/video_output/frames/' contains sample frame images with poses
  - '/kaggle/working/video_output/data/' contains JSON data and text summaries


### **With .npz file**

In [17]:
import os
import json
import numpy as np
from PIL import Image
import cv2
from rtmlib import Body, draw_skeleton
import shutil
from datetime import datetime

# Configuration: where videos are read from, the root folder for all results,
# and which inference runtime rtmlib should use.
INPUT_DIR = '/kaggle/input/rtmo/other/default/1'  # Directory containing input videos
BASE_OUTPUT_DIR = "/kaggle/working/pose_estimation_results"  # Base directory to save all results
device = 'cuda'  # 'cpu', 'cuda', or 'mps'
backend = 'onnxruntime'  # 'opencv', 'onnxruntime', 'openvino'

# Video processing settings
FRAME_SKIP = 1  # Process every N frames (1 = process all frames, 2 = every other frame)
MAX_FRAMES = None  # Maximum frames to process (None = process entire video)

# Create organized directory structure
def create_project_structure():
    """Ensure the output folder tree exists and return the sub-folder paths.

    Creates BASE_OUTPUT_DIR and, beneath it, the "input_videos",
    "output_videos" and "pose_data_npz" folders. Folders that already
    exist are left untouched.

    Returns:
        dict: maps each sub-folder name to its full path.
    """
    subfolder_names = ("input_videos", "output_videos", "pose_data_npz")
    structure = {
        name: os.path.join(BASE_OUTPUT_DIR, name) for name in subfolder_names
    }

    # Base directory first, then every sub-folder beneath it.
    for folder in (BASE_OUTPUT_DIR, *structure.values()):
        os.makedirs(folder, exist_ok=True)

    return structure

# Create directory structure
output_paths = create_project_structure()

# Load the rtmlib Body pipeline in 'balanced' mode.
# NOTE: the run log captured below this cell shows this configuration loading a
# YOLOX-m detector and an RTMPose-m pose model, not an RTMO model.
print("Loading pose estimation model...")
body_estimator = Body(
    mode='balanced',  # one of 'performance', 'lightweight', 'balanced'
    backend=backend,
    device=device
)

# COCO keypoint labels (17 keypoints); stored alongside the pose arrays in the NPZ file
keypoint_labels = [
    "nose", "left_eye", "right_eye", "left_ear", "right_ear",
    "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
    "left_wrist", "right_wrist", "left_hip", "right_hip",
    "left_knee", "right_knee", "left_ankle", "right_ankle"
]

# Thresholds. NOTE(review): within this cell these two values are only recorded in the
# NPZ metadata; process_frame() and draw_skeleton() are called with literal thresholds.
point_threshold = 0.3
box_threshold = 0.3

def process_frame(frame):
    """Run pose estimation on one frame and package the detections.

    Args:
        frame: image array handed straight to ``body_estimator``.

    Returns:
        dict with:
            'frame_shape'      - ``frame.shape``
            'predicted_boxes'  - one ``[xmin, ymin, xmax, ymax, score]`` list
                                 per kept person, derived from the keypoints
            'predicted_points' - per kept person, a list of ``[x, y, score]``
            'raw_keypoints' / 'raw_scores' - the estimator output, unmodified
    """
    keypoints, scores = body_estimator(frame)

    predicted_boxes = []
    predicted_points = []

    # One entry per detected person (presumably 17 COCO keypoints each).
    for person_idx, person_keypoints in enumerate(keypoints):
        person_scores = scores[person_idx]

        # Only keypoints scoring above 0.1 contribute to the box.
        visible_mask = person_scores > 0.1
        visible_points = person_keypoints[visible_mask]
        if len(visible_points) == 0:
            # No usable keypoints: this person is dropped from both lists.
            continue

        xs = visible_points[:, 0]
        ys = visible_points[:, 1]
        predicted_boxes.append([
            float(np.min(xs)),
            float(np.min(ys)),
            float(np.max(xs)),
            float(np.max(ys)),
            # Box score = mean confidence of the visible keypoints.
            float(np.mean(person_scores[visible_mask])),
        ])

        # All keypoints (visible or not) are exported as [x, y, score].
        predicted_points.append([
            [float(x), float(y), float(person_scores[j])]
            for j, (x, y) in enumerate(person_keypoints)
        ])

    return {
        'frame_shape': frame.shape,
        'predicted_boxes': predicted_boxes,
        'predicted_points': predicted_points,
        'raw_keypoints': keypoints,
        'raw_scores': scores,
    }

def save_pose_data_npz(all_frame_results, filename_base, video_info):
    """Pack per-frame pose results into padded arrays and write a compressed NPZ.

    Frames can contain different numbers of people, so every per-person array
    is padded to the largest person count seen (at least 1). Unused slots hold
    -1.0 and are flagged ``False`` in ``person_valid``.

    Args:
        all_frame_results: list of per-frame dicts. Each must provide
            'frame_number', 'timestamp', 'predicted_boxes' (per person
            ``[xmin, ymin, xmax, ymax, score]``) and 'predicted_points'
            (per person, a list of ``[x, y, score]``).
        filename_base: stem used for the output file name
            (``<filename_base>_pose_data.npz``).
        video_info: dict of video metadata; stored JSON-encoded.

    Returns:
        Path of the written NPZ file, or None when there is nothing to save.

    Notes:
        ``video_info`` and ``processing_settings`` are stored as JSON strings
        so the file loads with plain ``np.load(path)``. Saving the raw dicts
        would pickle them as object arrays, which ``np.load`` refuses to read
        unless ``allow_pickle=True``. Read them back with
        ``json.loads(data['video_info'].item())``.
    """

    if not all_frame_results:
        print("⚠️  No frame results to save")
        return None

    # Array dimensions
    num_frames = len(all_frame_results)
    num_keypoints = len(keypoint_labels)
    max_persons = max(
        1,  # Ensure at least 1 person dimension
        max(len(frame['predicted_boxes']) for frame in all_frame_results),
    )

    # Frame metadata
    frame_numbers = np.array([frame['frame_number'] for frame in all_frame_results])
    timestamps = np.array([frame['timestamp'] for frame in all_frame_results])

    # Initialize pose data arrays (-1.0 marks "no data")
    bboxes = np.full((num_frames, max_persons, 5), -1.0, dtype=np.float32)
    keypoints_coords = np.full((num_frames, max_persons, num_keypoints, 2), -1.0, dtype=np.float32)
    keypoints_scores = np.full((num_frames, max_persons, num_keypoints), -1.0, dtype=np.float32)
    person_valid = np.zeros((num_frames, max_persons), dtype=bool)

    # Fill arrays with data
    for frame_idx, frame_result in enumerate(all_frame_results):
        boxes = frame_result['predicted_boxes']
        people_points = frame_result['predicted_points']

        for person_idx, box in enumerate(boxes):
            # Bounding boxes
            if len(box) == 5:
                bboxes[frame_idx, person_idx] = box
                person_valid[frame_idx, person_idx] = True

            # Keypoints (extra keypoints beyond the label list are ignored)
            if person_idx < len(people_points):
                for kpt_idx, point in enumerate(people_points[person_idx][:num_keypoints]):
                    if len(point) >= 3:
                        x, y, score = point[:3]
                        keypoints_coords[frame_idx, person_idx, kpt_idx] = [x, y]
                        keypoints_scores[frame_idx, person_idx, kpt_idx] = score

    processing_settings = {
        'frame_skip': FRAME_SKIP,
        'device': device,
        'backend': backend,
        'processing_timestamp': datetime.now().isoformat()
    }

    # Save NPZ file
    npz_path = os.path.join(output_paths['pose_data_npz'], f"{filename_base}_pose_data.npz")

    np.savez_compressed(
        npz_path,
        # Time information
        frame_numbers=frame_numbers,
        timestamps=timestamps,

        # Pose data (separate coordinates and scores for easier analysis)
        bboxes=bboxes,
        keypoints_coords=keypoints_coords,
        keypoints_scores=keypoints_scores,
        person_valid=person_valid,

        # Metadata (stored as JSON strings; default=str keeps odd values serializable)
        video_info=json.dumps(video_info, default=str),
        keypoint_labels=keypoint_labels,
        thresholds=np.array([point_threshold, box_threshold]),

        # Processing info (JSON string)
        processing_settings=json.dumps(processing_settings, default=str)
    )

    print(f"💾 NPZ file saved: {npz_path}")
    print(f"📏 Data shape: {num_frames} frames, up to {max_persons} persons, {num_keypoints} keypoints each")

    return npz_path

def copy_input_videos(video_files):
    """Mirror the given input videos into the organized output tree.

    Each file name is resolved against INPUT_DIR and copied (with metadata,
    via ``shutil.copy2``) into ``output_paths['input_videos']``. A failed copy
    is reported and skipped; it does not abort the remaining copies.

    Args:
        video_files: iterable of file names located in INPUT_DIR.

    Returns:
        list: the file names that were copied successfully.
    """
    succeeded = []

    for name in video_files:
        source = os.path.join(INPUT_DIR, name)
        target = os.path.join(output_paths['input_videos'], name)

        try:
            shutil.copy2(source, target)
        except Exception as e:
            print(f"❌ Error copying {name}: {str(e)}")
        else:
            succeeded.append(name)
            print(f"📁 Copied input video: {name}")

    return succeeded

def process_video(video_path):
    """Run pose estimation over a video, write an annotated copy and an NPZ.

    Every FRAME_SKIP-th frame is passed to ``process_frame``; the skeleton is
    drawn on a copy of the frame and appended to the output video. Per-frame
    results are collected and saved through ``save_pose_data_npz``.

    Args:
        video_path: path of the video to read.

    Returns:
        dict with 'output_video' (path), 'npz_file' (path, or None when no
        frame produced results) and 'processed_frames' (int); or None if the
        video could not be opened.

    Notes:
        * The frame rate is kept as a float, so rates such as 29.97 give
          correct timestamps and duration instead of being truncated to 29.
        * A missing or invalid frame rate falls back to 30 FPS, with a warning,
          rather than raising ZeroDivisionError.
        * The output video is written at ``fps / FRAME_SKIP`` so playback
          speed matches the source when frames are skipped.
        * The capture and writer are released even if an unexpected error
          escapes the frame loop.
    """
    print(f"🎬 Processing video: {os.path.basename(video_path)}")

    # Open video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"❌ Error: Could not open video {video_path}")
        cap.release()
        return None

    filename_base = os.path.splitext(os.path.basename(video_path))[0]
    output_video_path = os.path.join(output_paths['output_videos'], f"{filename_base}_pose_estimation.mp4")

    out = None
    all_frame_results = []
    processed_frames = 0

    try:
        # Get video properties
        fps = float(cap.get(cv2.CAP_PROP_FPS))
        if not np.isfinite(fps) or fps <= 0:
            # Some containers report no frame rate; avoid dividing by zero below.
            print("⚠️  Video reports no valid FPS; assuming 30 FPS")
            fps = 30.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # The frame count is container metadata and may be missing (<= 0).
        total_frames = max(0, int(cap.get(cv2.CAP_PROP_FRAME_COUNT)))
        duration_seconds = total_frames / fps

        video_info = {
            'width': width,
            'height': height,
            'fps': fps,
            'total_frames': total_frames,
            'duration_seconds': duration_seconds
        }

        print(f"📹 Video info: {width}x{height}, {fps:g} FPS, {total_frames} frames, {duration_seconds:.2f}s")

        # Guard against a zero/negative FRAME_SKIP (modulo by zero).
        frame_step = max(1, int(FRAME_SKIP))

        # Prepare output video writer; only every frame_step-th frame is written,
        # so the output frame rate is scaled down to keep real-time playback speed.
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps / frame_step, (width, height))
        if not out.isOpened():
            print(f"⚠️  Could not open video writer for {output_video_path}; pose data will still be saved")

        # Number of source frames between progress messages (about 5 seconds).
        progress_every = max(1, int(round(fps * 5)))

        frame_number = 0

        print("🔄 Processing frames...")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Skip frames if needed
            if frame_number % frame_step != 0:
                frame_number += 1
                continue

            # Process frame; a failure on one frame must not abort the video
            try:
                frame_results = process_frame(frame)

                if frame_results is not None:
                    # Add frame metadata
                    frame_results['frame_number'] = frame_number
                    frame_results['timestamp'] = frame_number / fps
                    all_frame_results.append(frame_results)

                    # Create visualization
                    keypoints_array = np.array(frame_results['raw_keypoints'])
                    scores_array = np.array(frame_results['raw_scores'])

                    # Draw skeleton on a copy so the source frame stays untouched
                    frame_with_pose = draw_skeleton(frame.copy(), keypoints_array, scores_array, kpt_thr=0.3)

                    # Write frame to output video
                    out.write(frame_with_pose)

                    processed_frames += 1

                    # Print progress
                    if frame_number % progress_every == 0:
                        if total_frames > 0:
                            progress = (frame_number / total_frames) * 100
                            print(f"⏳ Progress: {progress:.1f}% - Frame {frame_number}/{total_frames}")
                        else:
                            print(f"⏳ Progress: Frame {frame_number}")

                    # Check if we've reached max frames limit
                    if MAX_FRAMES is not None and processed_frames >= MAX_FRAMES:
                        print(f"🛑 Reached maximum frames limit: {MAX_FRAMES}")
                        break

            except Exception as e:
                error_msg = f"❌ Error processing frame {frame_number}: {str(e)}"
                print(error_msg)

            frame_number += 1
    finally:
        # Close resources, even when something above raised
        cap.release()
        if out is not None:
            out.release()

    # Save pose data in NPZ format
    print("💾 Saving pose data to NPZ format...")
    npz_path = save_pose_data_npz(all_frame_results, filename_base, video_info)

    return {
        'output_video': output_video_path,
        'npz_file': npz_path,
        'processed_frames': processed_frames
    }

def main():
    """Process every supported video found in INPUT_DIR.

    Steps: list the supported video files, print the active settings, copy the
    inputs into the organized output tree, run ``process_video`` on each file,
    then print a summary. A failure on one video is reported and does not stop
    the remaining videos.
    """

    # Check if input directory exists
    if not os.path.exists(INPUT_DIR):
        print(f"📁 Creating input directory: {INPUT_DIR}")
        try:
            os.makedirs(INPUT_DIR)
        except OSError as e:
            # E.g. a read-only input mount; report instead of crashing.
            print(f"❌ Could not create '{INPUT_DIR}': {str(e)}")
            return
        print(f"Please place your videos in the '{INPUT_DIR}' directory and run again.")
        return

    # Supported video extensions
    supported_extensions = {'.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv', '.webm', '.m4v'}
    extension_suffixes = tuple(supported_extensions)

    # Get list of video files (sorted: os.listdir order is arbitrary)
    video_files = sorted(
        file for file in os.listdir(INPUT_DIR)
        if file.lower().endswith(extension_suffixes)
    )

    if not video_files:
        print(f"❌ No supported video files found in '{INPUT_DIR}' directory.")
        print(f"Supported formats: {', '.join(sorted(supported_extensions))}")
        return

    print(f"🎯 Found {len(video_files)} video(s) to process:")
    for vid_file in video_files:
        print(f"  📹 {vid_file}")
    print()

    print(f"⚙️  Processing settings:")
    print(f"  🖥️  Device: {device}")
    print(f"  🔧 Backend: {backend}")
    print(f"  ⏭️  Frame skip: {FRAME_SKIP} (process every {FRAME_SKIP} frame(s))")
    # `is not None` matches the limit check in process_video (0 is a real limit).
    print(f"  🔢 Max frames: {MAX_FRAMES if MAX_FRAMES is not None else 'unlimited'}")
    print()

    # Copy input videos to organized structure (copy errors are reported there)
    print("📋 Organizing input videos...")
    copy_input_videos(video_files)

    # Process each video, remembering which ones failed for the final summary
    failed_videos = []

    for video_file in video_files:
        video_path = os.path.join(INPUT_DIR, video_file)

        try:
            print(f"\n{'='*70}")
            print(f"🚀 Processing: {video_file}")
            print(f"{'='*70}")

            # Process the video
            results = process_video(video_path)

            if results is not None:
                # npz_file is None when no frame produced results; basename(None)
                # would raise TypeError and be misreported as a processing error.
                npz_file = results['npz_file']
                npz_name = os.path.basename(npz_file) if npz_file else "not saved (no frames processed)"

                print(f"\n✅ Video processing complete!")
                print(f"📂 Results saved:")
                print(f"  🎥 Output video: {os.path.basename(results['output_video'])}")
                print(f"  💾 NPZ pose data: {npz_name}")
                print(f"  ✨ Processed {results['processed_frames']} frames")
                print()
            else:
                failed_videos.append(video_file)
                print(f"❌ Failed to process {video_file}")

        except Exception as e:
            failed_videos.append(video_file)
            print(f"💥 Error processing {video_file}: {str(e)}")
            continue

    # Only claim success when every video actually went through
    if not failed_videos:
        print(f"\n🎉 All videos processed successfully!")
    else:
        succeeded = len(video_files) - len(failed_videos)
        print(f"\n⚠️  Processed {succeeded}/{len(video_files)} video(s); failed: {', '.join(failed_videos)}")
    print(f"📁 Check the '{BASE_OUTPUT_DIR}' directory for results")
    print(f"\n📋 Quick Access:")
    print(f"  🎬 Output videos: {output_paths['output_videos']}")
    print(f"  💾 NPZ pose data: {output_paths['pose_data_npz']}")

if __name__ == "__main__":
    main()

Loading pose estimation model...
load /root/.cache/rtmlib/hub/checkpoints/yolox_m_8xb8-300e_humanart-c2c7a14a.onnx with onnxruntime backend
load /root/.cache/rtmlib/hub/checkpoints/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.onnx with onnxruntime backend


[0;93m2025-08-28 05:21:46.912625616 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-08-28 05:21:46.912661929 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m
[0;93m2025-08-28 05:21:46.991684704 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-08-28 05:21:46.991710777 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


🎯 Found 1 video(s) to process:
  📹 test.mp4

⚙️  Processing settings:
  🖥️  Device: cuda
  🔧 Backend: onnxruntime
  ⏭️  Frame skip: 1 (process every 1 frame(s))
  🔢 Max frames: unlimited

📋 Organizing input videos...
📁 Copied input video: test.mp4

🚀 Processing: test.mp4
🎬 Processing video: test.mp4
📹 Video info: 1920x1080, 25 FPS, 315 frames, 12.60s
🔄 Processing frames...
⏳ Progress: 0.0% - Frame 0/315
⏳ Progress: 39.7% - Frame 125/315
⏳ Progress: 79.4% - Frame 250/315
💾 Saving pose data to NPZ format...
💾 NPZ file saved: /kaggle/working/pose_estimation_results/pose_data_npz/test_pose_data.npz
📏 Data shape: 315 frames, up to 2 persons, 17 keypoints each

✅ Video processing complete!
📂 Results saved:
  🎥 Output video: test_pose_estimation.mp4
  💾 NPZ pose data: test_pose_data.npz
  ✨ Processed 315 frames


🎉 All videos processed successfully!
📁 Check the '/kaggle/working/pose_estimation_results' directory for results

📋 Quick Access:
  🎬 Output videos: /kaggle/working/pose_estimation_r