In [1]:
# ============= RTMPose OFFICIAL SDK - From OpenMMLab =============
# Banner shown at the top of the cell output.
print("üöÄ RTMPose OFFICIAL SDK - Pre-compiled Models")
print("="*60)
print("Using official MMDeploy SDK models from OpenMMLab")
print("="*60)

# ============= CONFIGURATION =============
FRAME_SKIP = 2              # Process every Nth frame (1=all frames, 2=every 2nd)
CONF_THRESHOLD = 0.6        # Detection confidence (0.5-0.8)
VIDEO_DIR = "/content/drive/MyDrive/Sillah_test"
OUTPUT_DIR = "/content/drive/MyDrive/RTMPose_Results"
MAX_VIDEOS = 3            # Limit number of videos to process (None = all videos)
                            # Set to small number (3-5) for testing!

# Fail fast: the frame loop computes `frame_idx % FRAME_SKIP`, so a value of 0
# would only surface later as a ZeroDivisionError reported once per video.
if not isinstance(FRAME_SKIP, int) or FRAME_SKIP < 1:
    raise ValueError(f"FRAME_SKIP must be an integer >= 1, got {FRAME_SKIP!r}")

print(f"\n‚ö° Frame skip: {FRAME_SKIP}")
print(f"üéØ Confidence: {CONF_THRESHOLD}")
# Use the same "limit is active" test as the video-selection step
# (`MAX_VIDEOS and MAX_VIDEOS > 0`) so the banner never claims a limit that
# the selection then ignores (e.g. for a negative value).
if MAX_VIDEOS and MAX_VIDEOS > 0:
    print(f"üé¨ Video limit: {MAX_VIDEOS} (testing mode)")
else:
    print(f"üé¨ Video limit: None (process all)")

# ============= STEP 1: Install Dependencies =============
print("\nüì¶ Installing dependencies...")
import subprocess
import sys

# Install with the interpreter that runs this notebook so the packages land in
# the active environment; check_call raises CalledProcessError if pip fails.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                      "opencv-python", "onnxruntime-gpu", "numpy", "tqdm"])
print("‚úÖ Done!")

# ============= STEP 2: Import Libraries =============
import os
import time
import cv2
import numpy as np
import onnxruntime as ort
import urllib.request
import zipfile
from tqdm import tqdm

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Check videos
if not os.path.isdir(VIDEO_DIR):
    raise FileNotFoundError(f"Video directory not found: {VIDEO_DIR}")

# os.listdir() returns entries in arbitrary order; sort so that "the first
# MAX_VIDEOS videos" is the same set on every run. The extension check is
# case-insensitive so files such as "CLIP.MP4" are not silently ignored.
all_video_files = sorted(
    f for f in os.listdir(VIDEO_DIR) if f.lower().endswith('.mp4')
)
print(f"‚úÖ Found {len(all_video_files)} total videos")

# Apply limit if set
if MAX_VIDEOS and MAX_VIDEOS > 0:
    video_files = all_video_files[:MAX_VIDEOS]
    print(f"   üé¨ Processing first {len(video_files)} videos (testing mode)")
    print(f"   üí° Set MAX_VIDEOS=None to process all {len(all_video_files)} videos")
else:
    video_files = all_video_files
    print(f"   üé¨ Processing all {len(video_files)} videos")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============= STEP 3: Download Official SDK Package =============
print("\nüì• Downloading OFFICIAL MMDeploy SDK models...")
print("Source: https://download.openmmlab.com/mmpose/v1/projects/rtmpose/")

# Create directories
os.makedirs("/content/rtmpose-sdk", exist_ok=True)
os.chdir("/content/rtmpose-sdk")

# Download official SDK package (contains both detector and pose models)
SDK_URL = "https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-cpu.zip"
SDK_ZIP = "/content/rtmpose-sdk/rtmpose-cpu.zip"

print(f"\n   Downloading official SDK package...")
print(f"   URL: {SDK_URL}")

try:
    if zipfile.is_zipfile(SDK_ZIP):
        # Re-running the cell should not fetch the archive again.
        # is_zipfile() is False for a missing or truncated file, so a broken
        # earlier download is still replaced.
        print("   Archive already present - skipping download")
    else:
        # Download to a temporary name and rename afterwards so an interrupted
        # transfer never leaves a partial file at SDK_ZIP.
        partial_zip = SDK_ZIP + ".part"
        urllib.request.urlretrieve(SDK_URL, partial_zip)
        os.replace(partial_zip, SDK_ZIP)
    file_size = os.path.getsize(SDK_ZIP) / (1024 * 1024)
    print(f"   ‚úÖ Downloaded: {file_size:.2f} MB")

    # Extract
    print(f"\n   üìÇ Extracting models...")
    with zipfile.ZipFile(SDK_ZIP, 'r') as zip_ref:
        zip_ref.extractall("/content/rtmpose-sdk")
    print(f"   ‚úÖ Extracted!")

    # List extracted files: one line per directory, indented by depth below
    # the SDK root, with only the .onnx files shown.
    print(f"\n   üìã SDK Contents:")
    for root, dirs, files in os.walk("/content/rtmpose-sdk"):
        level = root.replace("/content/rtmpose-sdk", '').count(os.sep)
        indent = ' ' * 2 * level
        print(f'{indent}{os.path.basename(root)}/')
        subindent = ' ' * 2 * (level + 1)
        for file in files:
            if file.endswith('.onnx'):
                print(f'{subindent}‚úÖ {file}')

except Exception as e:
    # Covers both download and extraction failures; print the manual
    # instructions, then re-raise so the cell stops here.
    print(f"   ‚ùå Download failed: {e}")
    print("\n   üìã ALTERNATIVE: Manual SDK Download")
    print("   1. Visit: https://download.openmmlab.com/mmpose/v1/projects/rtmpose/")
    print("   2. Download: rtmpose-cpu.zip")
    print("   3. Upload and extract to /content/rtmpose-sdk/")
    raise

# ============= STEP 4: Find ONNX Models in SDK =============
print("\nüîç Locating ONNX models in SDK package...")

def find_onnx_models(base_path):
    """Locate the detector and pose ONNX files below *base_path*.

    Every ``.onnx`` file found by walking the tree is classified by the name
    of the directory that directly contains it: a name containing ``det``
    (case-insensitive) marks the detector, otherwise a name containing
    ``pose`` marks the pose model. When several files match the same role the
    one visited last wins. Each match is reported on stdout.

    Returns:
        A ``(det_model, pose_model)`` tuple of paths; an entry is ``None``
        when no file matched that role.
    """
    det_model = pose_model = None

    for root, _subdirs, filenames in os.walk(base_path):
        for file in filenames:
            if not file.endswith('.onnx'):
                continue

            full_path = os.path.join(root, file)
            # The containing folder, not the file name, identifies the model.
            folder_name = os.path.basename(os.path.dirname(full_path))
            lowered = folder_name.lower()

            if 'det' in lowered:
                det_model = full_path
                print(f"   ‚úÖ Detector: {folder_name}/{file}")
                continue
            if 'pose' in lowered:
                pose_model = full_path
                print(f"   ‚úÖ Pose: {folder_name}/{file}")

    return det_model, pose_model

# First pass: classify the extracted models by the name of their folder.
DET_MODEL_PATH, POSE_MODEL_PATH = find_onnx_models("/content/rtmpose-sdk")

if not (DET_MODEL_PATH and POSE_MODEL_PATH):
    print("\n   ‚ö†Ô∏è Could not find models by folder name")
    print("   üîç Searching and identifying by input shape...")

    # Only fill the roles the folder-name pass left empty, so a model that was
    # already identified is not replaced by the weaker shape heuristic.
    need_det = not DET_MODEL_PATH
    need_pose = not POSE_MODEL_PATH

    # Find ONNX files and check their input shapes
    for root, dirs, files in os.walk("/content/rtmpose-sdk"):
        for file in files:
            if not file.endswith('.onnx'):
                continue

            full_path = os.path.join(root, file)
            folder = os.path.basename(os.path.dirname(full_path))

            try:
                # Load model to check input shape
                session = ort.InferenceSession(full_path, providers=['CPUExecutionProvider'])
                input_shape = session.get_inputs()[0].shape
            except Exception as e:
                # Report the unreadable file instead of hiding the error, then
                # keep scanning the remaining candidates.
                print(f"   Skipping {folder}/{file}: {str(e)[:100]}")
                continue

            # Detector typically has 640x640 input, pose has 192x256
            looks_like_detector = 'rtmdet' in folder.lower() or (
                len(input_shape) == 4 and input_shape[2] == input_shape[3]
            )
            if looks_like_detector:
                if need_det:
                    DET_MODEL_PATH = full_path
                    print(f"   ‚úÖ Detector: {folder}/{file} (input: {input_shape})")
            elif need_pose:
                POSE_MODEL_PATH = full_path
                print(f"   ‚úÖ Pose: {folder}/{file} (input: {input_shape})")

    if not (DET_MODEL_PATH and POSE_MODEL_PATH):
        raise RuntimeError("Could not identify detector and pose models")

print(f"\n‚úÖ Models ready!")
print(f"   Detector: {os.path.basename(DET_MODEL_PATH)}")
print(f"   Pose: {os.path.basename(POSE_MODEL_PATH)}")

# ============= STEP 5: Setup ONNX Runtime with GPU =============
print("\nüîß Setting up ONNX Runtime with GPU acceleration...")

# Configure CUDA provider (skip TensorRT to avoid fallback issues)
print("   ‚ö° Configuring CUDA...")
cuda_options = {
    'device_id': 0,
    'arena_extend_strategy': 'kNextPowerOfTwo',
    'gpu_mem_limit': 4 * 1024 * 1024 * 1024,   # 4GB
    'cudnn_conv_algo_search': 'EXHAUSTIVE',
    'do_copy_in_default_stream': True,
}

# Try CUDA first (TensorRT often causes fallback to CPU on Colab)
cuda_providers = [
    ('CUDAExecutionProvider', cuda_options),
    'CPUExecutionProvider'
]
cpu_providers = ['CPUExecutionProvider']

# Session options for maximum performance; shared by every session created
# below so the GPU path and the CPU fallback are configured identically.
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session_options.intra_op_num_threads = 4
session_options.inter_op_num_threads = 4
session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

print("   ‚úÖ GPU acceleration configured!")

# ============= STEP 6: Load Models =============
print("\nü§ñ Loading models...")

# Load directly with CUDA (no TensorRT to avoid fallback issues)
det_session = None
pose_session = None
provider_used = "CPU"

try:
    print("   ‚ö° Loading with CUDA...")
    det_session = ort.InferenceSession(DET_MODEL_PATH, providers=cuda_providers, sess_options=session_options)
    pose_session = ort.InferenceSession(POSE_MODEL_PATH, providers=cuda_providers, sess_options=session_options)
    # Ask the session which provider it is using rather than assuming CUDA.
    provider_used = det_session.get_providers()[0]
    print(f"   ‚úÖ Models loaded successfully!")
except Exception as e:
    print(f"   ‚ö†Ô∏è GPU loading failed: {str(e)[:100]}")
    print("   üîÑ Falling back to CPU...")
    # Pass the same session options as the GPU path; previously the fallback
    # sessions were created with default options.
    det_session = ort.InferenceSession(DET_MODEL_PATH, providers=cpu_providers, sess_options=session_options)
    pose_session = ort.InferenceSession(POSE_MODEL_PATH, providers=cpu_providers, sess_options=session_options)
    provider_used = "CPUExecutionProvider"

# Display results
print(f"\n   üìä Final Provider:")
if 'Tensorrt' in provider_used:
    print(f"      üî• TensorRT (Maximum Speed - 40-60 FPS)")
    expected_time = "15-25 minutes"
elif 'CUDA' in provider_used:
    print(f"      ‚ö° CUDA (Fast - 20-35 FPS)")
    expected_time = "25-45 minutes"
else:
    print(f"      ‚ö†Ô∏è CPU (Slow - 5-10 FPS)")
    expected_time = "60-120 minutes"

# Get input info
det_input_name = det_session.get_inputs()[0].name
det_input_shape = det_session.get_inputs()[0].shape
pose_input_name = pose_session.get_inputs()[0].name
pose_input_shape = pose_session.get_inputs()[0].shape

print(f"\n   üìä Model Info:")
print(f"      Detector: {det_input_name} {det_input_shape}")
print(f"      Pose: {pose_input_name} {pose_input_shape}")
print(f"\n   ‚è±Ô∏è Expected total time: {expected_time}")

# ============= STEP 7: Preprocessing Functions =============
# Get detector input size from model (det_input_shape was read above).
# A dimension that is not a plain int is treated as dynamic and replaced
# with the 640 default.
if len(det_input_shape) == 4:
    det_h = det_input_shape[2] if isinstance(det_input_shape[2], int) else 640
    det_w = det_input_shape[3] if isinstance(det_input_shape[3], int) else 640
else:
    det_h, det_w = 640, 640

print(f"\nüìê Detector input size: {det_w}x{det_h}")

def preprocess_detector(frame, to_rgb=False):
    """Turn one OpenCV frame into the detector's input tensor.

    The frame is stretched to ``(det_w, det_h)`` (aspect ratio is not
    preserved), scaled to [0, 1], normalized per channel, and returned in
    NCHW layout with a leading batch dimension.

    Channel order: RTMDet's published MMDetection config normalizes with
    mean ``[103.53, 116.28, 123.675]`` / std ``[57.375, 57.12, 58.395]`` and
    ``bgr_to_rgb=False``, i.e. the network consumes BGR. Those are the
    ImageNet statistics in BGR order, so the frame is kept in OpenCV's native
    BGR order and the statistics are reversed to match. Earlier versions
    converted to RGB first, which fed the model swapped red/blue channels.

    Args:
        frame: BGR image as returned by ``cv2.VideoCapture.read``.
        to_rgb: Set to True to restore the previous behaviour (convert to RGB
            and normalize with RGB-ordered statistics), for detectors that
            were exported expecting RGB input.

    Returns:
        ``float32`` array of shape ``(1, 3, det_h, det_w)``.
    """
    img = cv2.resize(frame, (det_w, det_h))

    # ImageNet statistics in RGB order, expressed on the 0-1 scale.
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    if to_rgb:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    else:
        # Keep BGR pixels; reverse the statistics so each channel still gets
        # its own mean/std.
        mean, std = mean[::-1], std[::-1]

    img = img.astype(np.float32) / 255.0
    img = (img - mean) / std
    img = np.transpose(img, (2, 0, 1))  # HWC -> CHW
    img = np.expand_dims(img, axis=0)   # add batch dimension
    return img

def postprocess_detections(outputs, conf_threshold, orig_shape, det_size=None):
    """Convert raw detector output into boxes in original-frame pixels.

    Args:
        outputs: The detector's output list (or a single array). The first
            entry must hold one row per detection as
            ``[x1, y1, x2, y2, score, ...]`` in detector-input pixels,
            optionally with a leading batch dimension (only batch 0 is read).
        conf_threshold: Rows whose score is not strictly greater are dropped.
        orig_shape: Shape of the original frame; only ``(height, width)`` from
            the first two entries is used.
        det_size: Optional ``(width, height)`` of the detector input the boxes
            refer to. Defaults to the module-level ``det_w`` / ``det_h``.

    Returns:
        A list of ``[x1, y1, x2, y2, score]`` with integer coordinates clipped
        to the frame and a Python-float score. Boxes that collapse to zero
        width or height after clipping are omitted.
    """
    boxes = []

    if len(outputs) == 0:
        return boxes

    dets = outputs[0] if isinstance(outputs, (list, tuple)) else outputs

    if len(dets.shape) == 3:
        dets = dets[0]

    # Anything that is not a (num_dets, >=5) table cannot be parsed as boxes;
    # return no detections instead of failing on len() of a scalar row.
    if len(dets.shape) != 2 or dets.shape[1] < 5:
        return boxes

    model_w, model_h = det_size if det_size is not None else (det_w, det_h)

    h, w = orig_shape[:2]
    # The frame was stretched to the model size, so x and y scale separately.
    scale_x = w / model_w
    scale_y = h / model_h

    for det in dets:
        score = det[4]
        if not score > conf_threshold:
            continue

        x1, y1, x2, y2 = det[:4]
        x1 = int(x1 * scale_x)
        y1 = int(y1 * scale_y)
        x2 = int(x2 * scale_x)
        y2 = int(y2 * scale_y)

        # Clip to image boundaries
        x1 = max(0, min(x1, w - 1))
        y1 = max(0, min(y1, h - 1))
        x2 = max(0, min(x2, w - 1))
        y2 = max(0, min(y2, h - 1))

        # Only add if box is valid
        if x2 > x1 and y2 > y1:
            boxes.append([x1, y1, x2, y2, float(score)])

    return boxes

def process_video(video_path, output_path):
    """Run person detection over one video and write an annotated copy.

    Every frame of the input is written to the output. Detection runs on
    every ``FRAME_SKIP``-th frame only; on those frames a green box and a
    "Person <score>" label are drawn for each detection above
    ``CONF_THRESHOLD``. Frames in between are copied unchanged.

    Args:
        video_path: Path of the video to read.
        output_path: Path of the ``mp4v``-encoded video to write.

    Returns:
        Number of frames on which detection completed successfully, or 0 when
        the input could not be opened.

    Raises:
        RuntimeError: If the output video cannot be opened for writing.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        cap.release()
        return 0

    # Keep the fractional frame rate (e.g. 29.97): truncating it to an int
    # changes the playback speed of the output. Some containers report 0 or
    # NaN; fall back to 30 so the writer still gets a usable rate.
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        fps = 30.0
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    if not out.isOpened():
        # Without this check every write() is silently discarded and the
        # caller would report success for a video that was never produced.
        cap.release()
        out.release()
        raise RuntimeError(f"Could not open output video for writing: {output_path}")

    frame_step = max(1, int(FRAME_SKIP))  # guard the modulo below against 0
    processed = 0
    frame_idx = 0
    failed = 0

    try:
        # The frame count may be unknown (<= 0); tqdm then shows a plain counter.
        with tqdm(total=total_frames if total_frames > 0 else None,
                  desc="   Processing", unit="f") as pbar:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                frame_idx += 1

                if frame_idx % frame_step == 0:
                    try:
                        # Detect persons
                        det_input = preprocess_detector(frame)
                        det_outputs = det_session.run(None, {det_input_name: det_input})

                        # Get boxes
                        boxes = postprocess_detections(det_outputs, CONF_THRESHOLD, frame.shape)

                        # Draw boxes
                        for x1, y1, x2, y2, conf in boxes:
                            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                            cv2.putText(frame, f"Person {conf:.2f}", (x1, y1-10),
                                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

                        processed += 1
                    except Exception as e:
                        # Best effort per frame: keep going, but say why.
                        # `except Exception` (not a bare except) lets
                        # KeyboardInterrupt stop the run.
                        failed += 1
                        if failed == 1:
                            tqdm.write(f"   Frame {frame_idx}: detection failed: {e}")

                out.write(frame)
                pbar.update(1)
    finally:
        # Release both handles even when interrupted so the output file is
        # finalized and the input is closed.
        cap.release()
        out.release()

    if failed:
        print(f"   {failed} frame(s) were written without detection due to errors")
    return processed

# ============= STEP 8: Process All Videos =============
# Runs process_video() on each selected file, writing "processed_<name>" into
# OUTPUT_DIR, and accumulates totals for the summary in STEP 9.
print("\nüé¨ Processing videos...")
print("="*60)

# Show which videos will be processed
# (only when a limit actually reduced the selection; at most 10 names listed)
if MAX_VIDEOS and len(video_files) < len(all_video_files):
    print(f"\nüìã Videos to process (first {len(video_files)}):")
    for i, vf in enumerate(video_files[:min(10, len(video_files))], 1):
        print(f"   {i}. {vf}")
    if len(video_files) > 10:
        print(f"   ... and {len(video_files) - 10} more")
    print()

total_processed = 0   # sum of process_video() return values (frames with detection)
total_videos = 0      # videos for which process_video() returned without raising
start_time = time.time()

for i, video_file in enumerate(video_files):
    print(f"\nüé¨ {i+1}/{len(video_files)}: {video_file}")

    input_path = os.path.join(VIDEO_DIR, video_file)
    output_path = os.path.join(OUTPUT_DIR, f"processed_{video_file}")

    # One failing video must not abort the batch: report it and move on.
    # NOTE: process_video() returns 0 (without raising) when the input cannot
    # be opened, so such a video is still counted in total_videos here.
    try:
        processed = process_video(input_path, output_path)
        total_processed += processed
        total_videos += 1
        print(f"   ‚úÖ Done! {processed} frames")
    except Exception as e:
        print(f"   ‚ùå Error: {e}")

end_time = time.time()
total_time = end_time - start_time   # wall-clock seconds for the whole batch

# ============= STEP 9: Summary =============
print("\n" + "="*60)
print("üéâ PROCESSING COMPLETE!")
print("="*60)

if MAX_VIDEOS:
    print(f"‚ÑπÔ∏è  TEST MODE: Processed {total_videos}/{len(video_files)} videos")
    print(f"   (Total available: {len(all_video_files)} videos)")
    print(f"   üí° To process all videos: Set MAX_VIDEOS=None")
else:
    print(f"‚úÖ Videos processed: {total_videos}/{len(video_files)}")

print(f"üé¨ Total frames: {total_processed:,}")
print(f"‚è±Ô∏è  Total time: {total_time:.2f}s ({total_time/60:.2f} minutes)")

if total_processed > 0:
    # Throughput over frames that went through detection, divided by total
    # wall-clock time (which also includes reading/writing skipped frames).
    fps = total_processed / total_time
    print(f"üöÄ Average speed: {fps:.2f} FPS")

    # Performance rating
    # The provider names in the labels are guesses derived from speed alone.
    if fps >= 40:
        rating = "üî• EXCELLENT (TensorRT)"
    elif fps >= 25:
        rating = "‚ö° VERY GOOD (CUDA)"
    elif fps >= 15:
        rating = "‚úÖ GOOD"
    else:
        rating = "‚ö†Ô∏è SLOW (CPU?)"
    print(f"üìä Performance: {rating}")

    # Per-video stats
    if total_videos > 0:
        avg_time = total_time / total_videos
        print(f"‚è±Ô∏è  Avg time/video: {avg_time:.2f}s")

print(f"\nüìÅ Results saved to:")
print(f"   {OUTPUT_DIR}")
print(f"\nüí° Tips:")
print(f"   - Check your Google Drive for processed videos")
print(f"   - Videos are named: processed_XXXX.mp4")
# The storage figure is a flat 20 MB-per-video guess, not a measured size.
print(f"   - Total storage used: ~{total_videos * 20}MB (estimated)")
print("="*60)
print("\n‚úÖ All done! üéâ")



üöÄ RTMPose OFFICIAL SDK - Pre-compiled Models
Using official MMDeploy SDK models from OpenMMLab

‚ö° Frame skip: 2
üéØ Confidence: 0.6
üé¨ Video limit: 3 (testing mode)

üì¶ Installing dependencies...
‚úÖ Done!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Found 105 total videos
   üé¨ Processing first 3 videos (testing mode)
   üí° Set MAX_VIDEOS=None to process all 105 videos

üì• Downloading OFFICIAL MMDeploy SDK models...
Source: https://download.openmmlab.com/mmpose/v1/projects/rtmpose/

   Downloading official SDK package...
   URL: https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-cpu.zip
   ‚úÖ Downloaded: 52.04 MB

   üìÇ Extracting models...
   ‚úÖ Extracted!

   üìã SDK Contents:
rtmpose-sdk/
  rtmpose-ort/
    rtmpose-m/
      ‚úÖ end2end.onnx
    rtmdet-nano/
      ‚úÖ end2end.onnx

üîç Locating ONNX models in SDK package...
   ‚úÖ Pose: rtmpose-m/end2end.onnx


   Processing:   2%|‚ñè         | 444/19677 [00:05<04:06, 77.88f/s]


KeyboardInterrupt: 

In [None]:
import subprocess

# Ask the NVIDIA driver tool for the GPU model name. On a CPU-only runtime the
# tool is missing (FileNotFoundError) or exits non-zero; report that plainly
# instead of crashing the cell or printing an empty name.
result = None
gpu_name = ""
try:
    result = subprocess.run(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
                           capture_output=True, text=True)
    if result.returncode == 0:
        gpu_name = result.stdout.strip()
except FileNotFoundError:
    pass
print(f"GPU: {gpu_name or 'not detected (nvidia-smi unavailable)'}")

In [None]:
# ============= RTMPose OFFICIAL SDK - From OpenMMLab =============
# Banner shown at the top of the cell output.
print("üöÄ RTMPose OFFICIAL SDK - Pre-compiled Models")
print("="*60)
print("Using official MMDeploy SDK models from OpenMMLab")
print("="*60)

# ============= CONFIGURATION =============
FRAME_SKIP = 2              # Process every Nth frame (1=all frames, 2=every 2nd)
CONF_THRESHOLD = 0.6        # Detection confidence (0.5-0.8)
VIDEO_DIR = "/content/drive/MyDrive/Sillah_test"
OUTPUT_DIR = "/content/drive/MyDrive/RTMPose_Results"
MAX_VIDEOS = 3              # Limit number of videos to process (None = all videos)
                            # Set to small number (3-5) for testing!

# Fail fast: the frame loop computes `frame_idx % FRAME_SKIP`, so a value of 0
# would only surface later as a ZeroDivisionError reported once per video.
if not isinstance(FRAME_SKIP, int) or FRAME_SKIP < 1:
    raise ValueError(f"FRAME_SKIP must be an integer >= 1, got {FRAME_SKIP!r}")

# ============= VISUALIZATION OPTIONS =============
SHOW_BBOX = True            # Show bounding boxes (green rectangle)
SHOW_SKELETON = True        # Show skeleton keypoints (colored lines & dots)
SHOW_LABELS = True          # Show "Person X.XX" confidence labels

# Choose visualization mode:
# SHOW_BBOX=True,  SHOW_SKELETON=True  = Both box + skeleton (recommended)
# SHOW_BBOX=True,  SHOW_SKELETON=False = Only bounding boxes (faster)
# SHOW_BBOX=False, SHOW_SKELETON=True  = Only skeleton (cleaner)

print(f"\n‚ö° Frame skip: {FRAME_SKIP}")
print(f"üéØ Confidence: {CONF_THRESHOLD}")
# Use the same "limit is active" test as the video-selection step
# (`MAX_VIDEOS and MAX_VIDEOS > 0`) so the banner never claims a limit that
# the selection then ignores (e.g. for a negative value).
if MAX_VIDEOS and MAX_VIDEOS > 0:
    print(f"üé¨ Video limit: {MAX_VIDEOS} (testing mode)")
else:
    print(f"üé¨ Video limit: None (process all)")

print(f"\nüé® Visualization:")
print(f"   Bounding boxes: {'‚úÖ' if SHOW_BBOX else '‚ùå'}")
print(f"   Skeleton: {'‚úÖ' if SHOW_SKELETON else '‚ùå'}")
print(f"   Labels: {'‚úÖ' if SHOW_LABELS else '‚ùå'}")

# ============= STEP 1: Install Dependencies =============
print("\nüì¶ Installing dependencies...")
import subprocess
import sys

# Install with the interpreter that runs this notebook so the packages land in
# the active environment; check_call raises CalledProcessError if pip fails.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                      "opencv-python", "onnxruntime-gpu", "numpy", "tqdm"])
print("‚úÖ Done!")

# ============= STEP 2: Import Libraries =============
import os
import time
import cv2
import numpy as np
import onnxruntime as ort
import urllib.request
import zipfile
from tqdm import tqdm

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Check videos
if not os.path.isdir(VIDEO_DIR):
    raise FileNotFoundError(f"Video directory not found: {VIDEO_DIR}")

# os.listdir() returns entries in arbitrary order; sort so that "the first
# MAX_VIDEOS videos" is the same set on every run. The extension check is
# case-insensitive so files such as "CLIP.MP4" are not silently ignored.
all_video_files = sorted(
    f for f in os.listdir(VIDEO_DIR) if f.lower().endswith('.mp4')
)
print(f"‚úÖ Found {len(all_video_files)} total videos")

# Apply limit if set
if MAX_VIDEOS and MAX_VIDEOS > 0:
    video_files = all_video_files[:MAX_VIDEOS]
    print(f"   üé¨ Processing first {len(video_files)} videos (testing mode)")
    print(f"   üí° Set MAX_VIDEOS=None to process all {len(all_video_files)} videos")
else:
    video_files = all_video_files
    print(f"   üé¨ Processing all {len(video_files)} videos")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============= STEP 3: Download Official SDK Package =============
print("\nüì• Downloading OFFICIAL MMDeploy SDK models...")
print("Source: https://download.openmmlab.com/mmpose/v1/projects/rtmpose/")

# Create directories
os.makedirs("/content/rtmpose-sdk", exist_ok=True)
os.chdir("/content/rtmpose-sdk")

# Download official SDK package (contains both detector and pose models)
SDK_URL = "https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-cpu.zip"
SDK_ZIP = "/content/rtmpose-sdk/rtmpose-cpu.zip"

print(f"\n   Downloading official SDK package...")
print(f"   URL: {SDK_URL}")

try:
    if zipfile.is_zipfile(SDK_ZIP):
        # Re-running the cell should not fetch the archive again.
        # is_zipfile() is False for a missing or truncated file, so a broken
        # earlier download is still replaced.
        print("   Archive already present - skipping download")
    else:
        # Download to a temporary name and rename afterwards so an interrupted
        # transfer never leaves a partial file at SDK_ZIP.
        partial_zip = SDK_ZIP + ".part"
        urllib.request.urlretrieve(SDK_URL, partial_zip)
        os.replace(partial_zip, SDK_ZIP)
    file_size = os.path.getsize(SDK_ZIP) / (1024 * 1024)
    print(f"   ‚úÖ Downloaded: {file_size:.2f} MB")

    # Extract
    print(f"\n   üìÇ Extracting models...")
    with zipfile.ZipFile(SDK_ZIP, 'r') as zip_ref:
        zip_ref.extractall("/content/rtmpose-sdk")
    print(f"   ‚úÖ Extracted!")

    # List extracted files: one line per directory, indented by depth below
    # the SDK root, with only the .onnx files shown.
    print(f"\n   üìã SDK Contents:")
    for root, dirs, files in os.walk("/content/rtmpose-sdk"):
        level = root.replace("/content/rtmpose-sdk", '').count(os.sep)
        indent = ' ' * 2 * level
        print(f'{indent}{os.path.basename(root)}/')
        subindent = ' ' * 2 * (level + 1)
        for file in files:
            if file.endswith('.onnx'):
                print(f'{subindent}‚úÖ {file}')

except Exception as e:
    # Covers both download and extraction failures; print the manual
    # instructions, then re-raise so the cell stops here.
    print(f"   ‚ùå Download failed: {e}")
    print("\n   üìã ALTERNATIVE: Manual SDK Download")
    print("   1. Visit: https://download.openmmlab.com/mmpose/v1/projects/rtmpose/")
    print("   2. Download: rtmpose-cpu.zip")
    print("   3. Upload and extract to /content/rtmpose-sdk/")
    raise

# ============= STEP 4: Find ONNX Models in SDK =============
print("\nüîç Locating ONNX models in SDK package...")

def find_onnx_models(base_path):
    """Locate the detector and pose ONNX files below *base_path*.

    Every ``.onnx`` file found by walking the tree is classified by the name
    of the directory that directly contains it: a name containing ``det``
    (case-insensitive) marks the detector, otherwise a name containing
    ``pose`` marks the pose model. When several files match the same role the
    one visited last wins. Each match is reported on stdout.

    Returns:
        A ``(det_model, pose_model)`` tuple of paths; an entry is ``None``
        when no file matched that role.
    """
    det_model = pose_model = None

    for root, _subdirs, filenames in os.walk(base_path):
        for file in filenames:
            if not file.endswith('.onnx'):
                continue

            full_path = os.path.join(root, file)
            # The containing folder, not the file name, identifies the model.
            folder_name = os.path.basename(os.path.dirname(full_path))
            lowered = folder_name.lower()

            if 'det' in lowered:
                det_model = full_path
                print(f"   ‚úÖ Detector: {folder_name}/{file}")
                continue
            if 'pose' in lowered:
                pose_model = full_path
                print(f"   ‚úÖ Pose: {folder_name}/{file}")

    return det_model, pose_model

# First pass: classify the extracted models by the name of their folder.
DET_MODEL_PATH, POSE_MODEL_PATH = find_onnx_models("/content/rtmpose-sdk")

if not (DET_MODEL_PATH and POSE_MODEL_PATH):
    print("\n   ‚ö†Ô∏è Could not find models by folder name")
    print("   üîç Searching and identifying by input shape...")

    # Only fill the roles the folder-name pass left empty, so a model that was
    # already identified is not replaced by the weaker shape heuristic.
    need_det = not DET_MODEL_PATH
    need_pose = not POSE_MODEL_PATH

    # Find ONNX files and check their input shapes
    for root, dirs, files in os.walk("/content/rtmpose-sdk"):
        for file in files:
            if not file.endswith('.onnx'):
                continue

            full_path = os.path.join(root, file)
            folder = os.path.basename(os.path.dirname(full_path))

            try:
                # Load model to check input shape
                session = ort.InferenceSession(full_path, providers=['CPUExecutionProvider'])
                input_shape = session.get_inputs()[0].shape
            except Exception as e:
                # Report the unreadable file instead of hiding the error, then
                # keep scanning the remaining candidates.
                print(f"   Skipping {folder}/{file}: {str(e)[:100]}")
                continue

            # Detector typically has 640x640 input, pose has 192x256
            looks_like_detector = 'rtmdet' in folder.lower() or (
                len(input_shape) == 4 and input_shape[2] == input_shape[3]
            )
            if looks_like_detector:
                if need_det:
                    DET_MODEL_PATH = full_path
                    print(f"   ‚úÖ Detector: {folder}/{file} (input: {input_shape})")
            elif need_pose:
                POSE_MODEL_PATH = full_path
                print(f"   ‚úÖ Pose: {folder}/{file} (input: {input_shape})")

    if not (DET_MODEL_PATH and POSE_MODEL_PATH):
        raise RuntimeError("Could not identify detector and pose models")

print(f"\n‚úÖ Models ready!")
print(f"   Detector: {os.path.basename(DET_MODEL_PATH)}")
print(f"   Pose: {os.path.basename(POSE_MODEL_PATH)}")

# ============= STEP 5: Setup ONNX Runtime with GPU =============
print("\nüîß Setting up ONNX Runtime with GPU acceleration...")

# Configure CUDA provider (skip TensorRT to avoid fallback issues)
print("   ‚ö° Configuring CUDA...")
cuda_options = {
    'device_id': 0,
    'arena_extend_strategy': 'kNextPowerOfTwo',
    'gpu_mem_limit': 4 * 1024 * 1024 * 1024,   # 4GB
    'cudnn_conv_algo_search': 'EXHAUSTIVE',
    'do_copy_in_default_stream': True,
}

# Try CUDA first (TensorRT often causes fallback to CPU on Colab)
cuda_providers = [
    ('CUDAExecutionProvider', cuda_options),
    'CPUExecutionProvider'
]
cpu_providers = ['CPUExecutionProvider']

# Session options for maximum performance; shared by every session created
# below so the GPU path and the CPU fallback are configured identically.
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session_options.intra_op_num_threads = 4
session_options.inter_op_num_threads = 4
session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

print("   ‚úÖ GPU acceleration configured!")

# ============= STEP 6: Load Models =============
print("\nü§ñ Loading models...")

# Load directly with CUDA (no TensorRT to avoid fallback issues)
det_session = None
pose_session = None
provider_used = "CPU"

try:
    print("   ‚ö° Loading with CUDA...")
    det_session = ort.InferenceSession(DET_MODEL_PATH, providers=cuda_providers, sess_options=session_options)
    pose_session = ort.InferenceSession(POSE_MODEL_PATH, providers=cuda_providers, sess_options=session_options)
    # Ask the session which provider it is using rather than assuming CUDA.
    provider_used = det_session.get_providers()[0]
    print(f"   ‚úÖ Models loaded successfully!")
except Exception as e:
    print(f"   ‚ö†Ô∏è GPU loading failed: {str(e)[:100]}")
    print("   üîÑ Falling back to CPU...")
    # Pass the same session options as the GPU path; previously the fallback
    # sessions were created with default options.
    det_session = ort.InferenceSession(DET_MODEL_PATH, providers=cpu_providers, sess_options=session_options)
    pose_session = ort.InferenceSession(POSE_MODEL_PATH, providers=cpu_providers, sess_options=session_options)
    provider_used = "CPUExecutionProvider"

# Display results
print(f"\n   üìä Final Provider:")
if 'Tensorrt' in provider_used:
    print(f"      üî• TensorRT (Maximum Speed - 40-60 FPS)")
    expected_time = "15-25 minutes"
elif 'CUDA' in provider_used:
    print(f"      ‚ö° CUDA (Fast - 20-35 FPS)")
    expected_time = "25-45 minutes"
else:
    print(f"      ‚ö†Ô∏è CPU (Slow - 5-10 FPS)")
    expected_time = "60-120 minutes"

# Get input info
det_input_name = det_session.get_inputs()[0].name
det_input_shape = det_session.get_inputs()[0].shape
pose_input_name = pose_session.get_inputs()[0].name
pose_input_shape = pose_session.get_inputs()[0].shape

print(f"\n   üìä Model Info:")
print(f"      Detector: {det_input_name} {det_input_shape}")
print(f"      Pose: {pose_input_name} {pose_input_shape}")
print(f"\n   ‚è±Ô∏è Expected total time: {expected_time}")

# ============= STEP 7: Preprocessing Functions =============
# Get detector input size from model (det_input_shape was read above).
# A dimension that is not a plain int is treated as dynamic and replaced
# with the 640 default.
if len(det_input_shape) == 4:
    det_h = det_input_shape[2] if isinstance(det_input_shape[2], int) else 640
    det_w = det_input_shape[3] if isinstance(det_input_shape[3], int) else 640
else:
    det_h, det_w = 640, 640

print(f"\nüìê Detector input size: {det_w}x{det_h}")

def preprocess_detector(frame, to_rgb=False):
    """Turn one OpenCV frame into the detector's input tensor.

    The frame is stretched to ``(det_w, det_h)`` (aspect ratio is not
    preserved), scaled to [0, 1], normalized per channel, and returned in
    NCHW layout with a leading batch dimension.

    Channel order: RTMDet's published MMDetection config normalizes with
    mean ``[103.53, 116.28, 123.675]`` / std ``[57.375, 57.12, 58.395]`` and
    ``bgr_to_rgb=False``, i.e. the network consumes BGR. Those are the
    ImageNet statistics in BGR order, so the frame is kept in OpenCV's native
    BGR order and the statistics are reversed to match. Earlier versions
    converted to RGB first, which fed the model swapped red/blue channels.

    Args:
        frame: BGR image as returned by ``cv2.VideoCapture.read``.
        to_rgb: Set to True to restore the previous behaviour (convert to RGB
            and normalize with RGB-ordered statistics), for detectors that
            were exported expecting RGB input.

    Returns:
        ``float32`` array of shape ``(1, 3, det_h, det_w)``.
    """
    img = cv2.resize(frame, (det_w, det_h))

    # ImageNet statistics in RGB order, expressed on the 0-1 scale.
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    if to_rgb:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    else:
        # Keep BGR pixels; reverse the statistics so each channel still gets
        # its own mean/std.
        mean, std = mean[::-1], std[::-1]

    img = img.astype(np.float32) / 255.0
    img = (img - mean) / std
    img = np.transpose(img, (2, 0, 1))  # HWC -> CHW
    img = np.expand_dims(img, axis=0)   # add batch dimension
    return img

def postprocess_detections(outputs, conf_threshold, orig_shape, det_size=None):
    """Convert raw detector output into boxes in original-frame pixels.

    Args:
        outputs: The detector's output list (or a single array). The first
            entry must hold one row per detection as
            ``[x1, y1, x2, y2, score, ...]`` in detector-input pixels,
            optionally with a leading batch dimension (only batch 0 is read).
        conf_threshold: Rows whose score is not strictly greater are dropped.
        orig_shape: Shape of the original frame; only ``(height, width)`` from
            the first two entries is used.
        det_size: Optional ``(width, height)`` of the detector input the boxes
            refer to. Defaults to the module-level ``det_w`` / ``det_h``.

    Returns:
        A list of ``[x1, y1, x2, y2, score]`` with integer coordinates clipped
        to the frame and a Python-float score. Boxes that collapse to zero
        width or height after clipping are omitted.
    """
    boxes = []

    if len(outputs) == 0:
        return boxes

    dets = outputs[0] if isinstance(outputs, (list, tuple)) else outputs

    if len(dets.shape) == 3:
        dets = dets[0]

    # Anything that is not a (num_dets, >=5) table cannot be parsed as boxes;
    # return no detections instead of failing on len() of a scalar row.
    if len(dets.shape) != 2 or dets.shape[1] < 5:
        return boxes

    model_w, model_h = det_size if det_size is not None else (det_w, det_h)

    h, w = orig_shape[:2]
    # The frame was stretched to the model size, so x and y scale separately.
    scale_x = w / model_w
    scale_y = h / model_h

    for det in dets:
        score = det[4]
        if not score > conf_threshold:
            continue

        x1, y1, x2, y2 = det[:4]
        x1 = int(x1 * scale_x)
        y1 = int(y1 * scale_y)
        x2 = int(x2 * scale_x)
        y2 = int(y2 * scale_y)

        # Clip to image boundaries
        x1 = max(0, min(x1, w - 1))
        y1 = max(0, min(y1, h - 1))
        x2 = max(0, min(x2, w - 1))
        y2 = max(0, min(y2, h - 1))

        # Only add if box is valid
        if x2 > x1 and y2 > y1:
            boxes.append([x1, y1, x2, y2, float(score)])

    return boxes

def preprocess_pose(img):
    """Build the pose-model input tensor from a BGR image crop.

    The crop is stretched to 192 pixels wide by 256 high (aspect ratio is not
    preserved), converted to RGB, scaled to [0, 1], and normalized per channel
    with mean (0.485, 0.456, 0.406) and std (0.229, 0.224, 0.225).

    Returns:
        ``float32`` array of shape ``(1, 3, 256, 192)`` (batch, channel,
        height, width).
    """
    # cv2.resize takes (width, height).
    rgb = cv2.cvtColor(cv2.resize(img, (192, 256)), cv2.COLOR_BGR2RGB)
    scaled = rgb.astype(np.float32) / 255.0
    normalized = (scaled - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
    # HWC -> CHW, then add the batch axis; the arithmetic above promotes to
    # float64, so cast back to float32 for the model.
    chw = np.transpose(normalized, (2, 0, 1))
    return np.expand_dims(chw, axis=0).astype(np.float32)

def postprocess_pose(outputs, orig_size):
    """Decode pose-model output into keypoints in crop pixel coordinates.

    Three output layouts are recognised (a leading batch dimension is allowed
    and only batch 0 is read):

    * Coordinates: a single ``(K, 2)`` or ``(K, 3)`` array of
      ``x, y[, confidence]`` in model-input pixels (192 wide, 256 high).
    * SimCC: two arrays ``(K, Wx)`` and ``(K, Hy)`` holding per-keypoint
      classification scores over horizontal and vertical bins. The keypoint
      is the arg-max bin on each axis and its confidence is the smaller of
      the two peak scores. Assumes the first output is the x axis and the
      second the y axis, as in RTMPose exports - verify against the model's
      output names if a different export is used.
    * Flat: a 1-D array of ``17 * 2`` or ``17 * 3`` values, read as 17 rows of
      coordinates.

    Earlier versions read the first three values of each SimCC row as
    ``x, y, confidence`` and ignored the second output, which produced
    meaningless keypoints for SimCC models.

    Args:
        outputs: List/tuple of arrays returned by the pose session (or a
            single array).
        orig_size: ``(width, height)`` of the crop that was fed to the model;
            keypoints are scaled to this size.

    Returns:
        A list of ``[x, y, confidence]`` Python floats, one per keypoint;
        confidence is 1.0 when the model provides none. Empty when the output
        is empty or its layout is not recognised.
    """
    keypoints = []

    if len(outputs) == 0:
        return keypoints

    arrays = list(outputs) if isinstance(outputs, (list, tuple)) else [outputs]

    def _drop_batch(arr):
        arr = np.asarray(arr)
        return arr[0] if arr.ndim == 3 else arr

    first = _drop_batch(arrays[0])
    w, h = orig_size

    # Flat layout: regroup into 17 coordinate rows.
    if first.ndim == 1 and first.size in (17 * 2, 17 * 3):
        first = first.reshape(17, -1)

    # Coordinate layout: values are already pixels in the 192x256 model input.
    if first.ndim == 2 and first.shape[1] in (2, 3):
        for kp in first:
            x = float(kp[0]) * w / 192  # Scale from model input size
            y = float(kp[1]) * h / 256
            conf = float(kp[2]) if len(kp) >= 3 else 1.0
            keypoints.append([x, y, conf])
        return keypoints

    # SimCC layout: one score vector per axis and keypoint.
    if first.ndim == 2 and len(arrays) >= 2 and arrays[1] is not None:
        simcc_x = first
        simcc_y = _drop_batch(arrays[1])
        if simcc_y.ndim == 2 and simcc_y.shape[0] == simcc_x.shape[0]:
            x_bins = simcc_x.argmax(axis=1)
            y_bins = simcc_y.argmax(axis=1)
            scores = np.minimum(simcc_x.max(axis=1), simcc_y.max(axis=1))
            # The bins span the full model input, so bin / num_bins is the
            # relative position; multiply by the crop size to get pixels.
            num_x_bins = simcc_x.shape[1]
            num_y_bins = simcc_y.shape[1]
            for x_bin, y_bin, score in zip(x_bins, y_bins, scores):
                keypoints.append([
                    int(x_bin) * w / num_x_bins,
                    int(y_bin) * h / num_y_bins,
                    float(score),
                ])
            return keypoints

    # Unrecognised layout: report no keypoints rather than guessing.
    return keypoints

def draw_skeleton(frame, keypoints, offset_x=0, offset_y=0):
    """Draw keypoint dots and limb lines onto *frame* in place.

    Args:
        frame: BGR image to draw on (modified in place).
        keypoints: Sequence of ``[x, y, confidence]`` entries indexed in COCO
            17-keypoint order; entries with fewer than three values are
            ignored.
        offset_x: Added to every x coordinate before drawing.
        offset_y: Added to every y coordinate before drawing.

    A keypoint is drawn as a red dot when its confidence exceeds 0.3. A limb
    is drawn only when both of its endpoints exist and exceed that threshold.
    """
    # BGR colours per body part.
    head = (255, 255, 0)    # cyan
    arms = (0, 255, 255)    # yellow
    torso = (255, 0, 255)   # magenta
    legs = (0, 255, 0)      # green

    # COCO limb connections with their colour, in drawing order.
    limbs = [
        ((0, 1), head), ((0, 2), head), ((1, 3), head), ((2, 4), head),
        ((5, 6), torso),
        ((5, 7), arms), ((6, 8), arms), ((7, 9), arms), ((8, 10), arms),
        ((5, 11), torso), ((6, 12), torso), ((11, 12), torso),
        ((11, 13), legs), ((12, 14), legs), ((13, 15), legs), ((14, 16), legs),
    ]

    def is_confident(kp):
        # Needs x, y and a confidence above the drawing threshold.
        return len(kp) >= 3 and kp[2] > 0.3

    def to_pixel(kp):
        return int(kp[0] + offset_x), int(kp[1] + offset_y)

    # Dots first, so the limb lines are painted on top of them.
    for kp in keypoints:
        if is_confident(kp):
            cv2.circle(frame, to_pixel(kp), 4, (0, 0, 255), -1)  # Red dots

    count = len(keypoints)
    for (start_idx, end_idx), color in limbs:
        if start_idx >= count or end_idx >= count:
            continue
        start, end = keypoints[start_idx], keypoints[end_idx]
        if is_confident(start) and is_confident(end):
            cv2.line(frame, to_pixel(start), to_pixel(end), color, 2)

# Labels attached to keypoints in the JSON output, indexed by keypoint position.
_KEYPOINT_NAMES = (
    'nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear',
    'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow',
    'left_wrist', 'right_wrist', 'left_hip', 'right_hip',
    'left_knee', 'right_knee', 'left_ankle', 'right_ankle',
)

# Frame rate used when the container reports none (0, negative or NaN).
_FALLBACK_FPS = 30.0


def _keypoint_records(keypoints, offset_x, offset_y):
    """Turn crop-relative keypoints into JSON-ready dicts in frame coordinates.

    Entries with fewer than three values are skipped. Positions beyond the
    known name table get a generic ``keypoint_<i>`` label instead of failing.
    """
    records = []
    for i, kp in enumerate(keypoints):
        if len(kp) < 3:
            continue
        name = _KEYPOINT_NAMES[i] if i < len(_KEYPOINT_NAMES) else f"keypoint_{i}"
        records.append({
            # Plain Python floats throughout: NumPy scalars are not always
            # accepted by json.dump.
            'x': float(kp[0]) + float(offset_x),
            'y': float(kp[1]) + float(offset_y),
            'confidence': float(kp[2]),
            'name': name,
        })
    return records


def process_video(video_path, output_path, json_path):
    """Process single video and save results.

    Reads ``video_path`` frame by frame, runs person detection (and pose
    estimation when ``SHOW_SKELETON`` is set) on every frame whose 1-based
    index is a multiple of ``FRAME_SKIP``, writes the annotated video to
    ``output_path`` and the per-frame detections to ``json_path``. Frames
    that are skipped are copied to the output video unannotated.

    Relies on module-level state defined elsewhere in the file: the ONNX
    sessions and input names, the pre/post-processing helpers,
    ``CONF_THRESHOLD`` and the ``SHOW_*`` visualization flags.

    Args:
        video_path: Path of the input video.
        output_path: Path of the annotated ``mp4v`` video to write.
        json_path: Path of the JSON file to write.

    Returns:
        Number of frames on which detection ran without raising. ``0`` is
        also returned, without writing any file, when the video cannot be
        opened.
    """
    import json

    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        cap.release()
        return 0

    # Keep the fractional rate (e.g. 29.97): truncating it to an int skews
    # both the output video's speed and the JSON timestamps.
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not (fps > 0):  # also true for NaN
        print(f"   Warning: no usable FPS reported, assuming {_FALLBACK_FPS}")
        fps = _FALLBACK_FPS
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    if not out.isOpened():
        print(f"   Warning: could not open video writer for {output_path}")

    # JSON data structure
    json_data = {
        'video_name': os.path.basename(video_path),
        # Whole-number rates stay ints so existing consumers see no change.
        'fps': int(fps) if float(fps).is_integer() else fps,
        'width': width,
        'height': height,
        'total_frames': total_frames,
        'frames': []
    }

    processed = 0
    frame_idx = 0
    failure_count = 0
    first_failure = None

    def note_failure(stage, err):
        # Failures stay non-fatal (best effort) but are no longer invisible.
        nonlocal failure_count, first_failure
        failure_count += 1
        if first_failure is None:
            first_failure = f"{stage} at frame {frame_idx}: {err!r}"

    try:
        # Some containers report no frame count; let tqdm run without a total.
        progress_total = total_frames if total_frames > 0 else None
        with tqdm(total=progress_total, desc="   Processing", unit="f") as pbar:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                frame_idx += 1

                if frame_idx % FRAME_SKIP != 0:
                    out.write(frame)
                    pbar.update(1)
                    continue

                # Frame data for JSON (frame_number is 1-based)
                frame_data = {
                    'frame_number': frame_idx,
                    'timestamp': frame_idx / fps,
                    'detections': []
                }

                try:
                    # Detect persons
                    det_input = preprocess_detector(frame)
                    det_outputs = det_session.run(None, {det_input_name: det_input})

                    # Get boxes
                    boxes = postprocess_detections(det_outputs, CONF_THRESHOLD, frame.shape)

                    # Pose crops must come from untouched pixels: `frame`
                    # accumulates boxes, labels and other people's skeletons.
                    clean_frame = frame.copy() if SHOW_SKELETON else None

                    # Process each detected person
                    for person_id, box in enumerate(boxes):
                        x1, y1, x2, y2, conf = box

                        # Draw bounding box if enabled
                        if SHOW_BBOX:
                            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                        # Draw label if enabled
                        if SHOW_LABELS:
                            cv2.putText(frame, f"Person {conf:.2f}", (x1, y1-10),
                                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

                        # Run pose estimation if skeleton is enabled
                        keypoints = []
                        if SHOW_SKELETON:
                            try:
                                # Crop person region. An empty crop simply
                                # yields no keypoints; the detection itself
                                # is still recorded below.
                                person_img = clean_frame[y1:y2, x1:x2]
                                if person_img.size > 0:
                                    pose_input = preprocess_pose(person_img)
                                    pose_outputs = pose_session.run(None, {pose_input_name: pose_input})
                                    keypoints = postprocess_pose(pose_outputs, (x2-x1, y2-y1))
                                    if keypoints is None:
                                        keypoints = []

                                    # len() works for lists and arrays alike,
                                    # unlike a bare truthiness test.
                                    if len(keypoints) > 0:
                                        draw_skeleton(frame, keypoints, x1, y1)
                            except Exception as pose_error:
                                keypoints = []
                                note_failure("pose estimation", pose_error)

                        # Save to JSON (always, regardless of visualization settings)
                        try:
                            keypoint_records = _keypoint_records(keypoints, x1, y1)
                        except Exception as record_error:
                            # Still save the bbox if the keypoints are malformed.
                            keypoint_records = []
                            note_failure("keypoint export", record_error)

                        frame_data['detections'].append({
                            'person_id': person_id,
                            'bbox': {
                                'x1': int(x1), 'y1': int(y1),
                                'x2': int(x2), 'y2': int(y2)
                            },
                            'confidence': float(conf),
                            'keypoints': keypoint_records
                        })

                    processed += 1
                except Exception as e:
                    # If detection fails, continue without annotations
                    note_failure("detection", e)

                # Add frame data to JSON (even if no detections)
                json_data['frames'].append(frame_data)

                out.write(frame)
                pbar.update(1)
    finally:
        # Release the capture and writer even if the loop raises.
        cap.release()
        out.release()

    if failure_count:
        print(f"   Warning: {failure_count} step(s) failed and were skipped "
              f"(first: {first_failure})")

    # Save JSON data
    with open(json_path, 'w') as f:
        json.dump(json_data, f, indent=2)

    return processed

# ============= STEP 8: Process All Videos =============
# Runs process_video() on every selected file and accumulates the totals
# (total_processed, total_videos, total_time) used by the summary step.
print("\nüé¨ Processing videos...")
print("="*60)

# Make sure the destination exists before any video is processed; without it
# the failure only surfaces once a whole video has been decoded. This is
# idempotent, so it is harmless if the folder was already created.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Show which videos will be processed (at most the first ten names)
if MAX_VIDEOS and len(video_files) < len(all_video_files):
    print(f"\nüìã Videos to process (first {len(video_files)}):")
    for i, vf in enumerate(video_files[:10], 1):
        print(f"   {i}. {vf}")
    if len(video_files) > 10:
        print(f"   ... and {len(video_files) - 10} more")
    print()

total_processed = 0
total_videos = 0
start_time = time.time()

for i, video_file in enumerate(video_files):
    print(f"\nüé¨ {i+1}/{len(video_files)}: {video_file}")

    input_path = os.path.join(VIDEO_DIR, video_file)
    video_name = os.path.splitext(video_file)[0]
    output_path = os.path.join(OUTPUT_DIR, f"processed_{video_file}")
    json_path = os.path.join(OUTPUT_DIR, f"{video_name}_pose_data.json")

    try:
        processed = process_video(input_path, output_path, json_path)
    except Exception as e:
        # One bad file must not stop the batch.
        print(f"   ‚ùå Error: {e}")
        continue

    if processed == 0:
        # process_video returns 0 when the file cannot be opened (nothing is
        # written in that case) or when no frame was analysed successfully.
        # Do not report such a video as done or count it as processed.
        print("   Warning: no frames were analysed "
              "(video could not be opened, or detection failed on every frame)")
        continue

    total_processed += processed
    total_videos += 1
    print(f"   ‚úÖ Done! {processed} frames")
    print(f"   üìÑ JSON saved: {video_name}_pose_data.json")

end_time = time.time()
total_time = end_time - start_time

# ============= STEP 9: Summary =============
# Prints the totals gathered by the processing loop, a throughput rating and
# a short description of the files that were written.
separator = "="*60
videos_ratio = f"{total_videos}/{len(video_files)}"

print(f"\n{separator}")
print("üéâ PROCESSING COMPLETE!")
print(separator)

if not MAX_VIDEOS:
    print(f"‚úÖ Videos processed: {videos_ratio}")
else:
    # A video limit is set: remind the user that this was a partial run.
    print(f"‚ÑπÔ∏è  TEST MODE: Processed {videos_ratio} videos")
    print(f"   (Total available: {len(all_video_files)} videos)")
    print("   üí° To process all videos: Set MAX_VIDEOS=None")

print(f"üé¨ Total frames: {total_processed:,}")
print(f"‚è±Ô∏è  Total time: {total_time:.2f}s ({total_time/60:.2f} minutes)")

if total_processed > 0:
    fps = total_processed / total_time
    print(f"üöÄ Average speed: {fps:.2f} FPS")

    # Performance rating: the first threshold reached wins, otherwise "slow".
    rating = "‚ö†Ô∏è SLOW (CPU?)"
    for min_fps, label in (
        (40, "üî• EXCELLENT (TensorRT)"),
        (25, "‚ö° VERY GOOD (CUDA)"),
        (15, "‚úÖ GOOD"),
    ):
        if fps >= min_fps:
            rating = label
            break
    print(f"üìä Performance: {rating}")

    # Per-video stats
    if total_videos > 0:
        avg_time = total_time / total_videos
        print(f"‚è±Ô∏è  Avg time/video: {avg_time:.2f}s")

# Closing notes: output location, file layout and usage tips.
closing_lines = (
    "\nüìÅ Results saved to:",
    f"   {OUTPUT_DIR}",
    "\nüíæ Output Files:",
    "   ‚Ä¢ processed_XXXX.mp4 - Videos with skeleton overlay",
    "   ‚Ä¢ XXXX_pose_data.json - Pose keypoint data",
    "\nüìä JSON Format:",
    "   Each JSON contains:",
    "   - Video metadata (fps, width, height)",
    "   - Per-frame detections with timestamps",
    "   - Bounding boxes for each person",
    "   - 17 keypoints with (x, y, confidence, name)",
    "\nüí° Tips:",
    f"   - Check your Google Drive: {OUTPUT_DIR}",
    "   - Use JSON files for analysis/tracking",
    f"   - Total storage: ~{total_videos * 25}MB (estimated)",
    separator,
    "\n‚úÖ All done! üéâ",
)
for closing_line in closing_lines:
    print(closing_line)

