In [None]:
# @title 1. Install Dependencies & Setup
!pip install -q mediapipe==0.10.31 yt-dlp opencv-python-headless

import os
from google.colab import drive

# Create models and output directories
os.makedirs('models', exist_ok=True)
os.makedirs('output_data', exist_ok=True)

# Mount Google Drive to access files
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Download MediaPipe Pose Landmarker model if not exists
if not os.path.exists('models/pose_landmarker.task'):
    print("Downloading pose_landmarker.task...")
    !wget -q -O models/pose_landmarker.task https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_heavy/float16/1/pose_landmarker_heavy.task
    print("Download complete.")
else:
    print("Model already exists.")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.0/182.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hMounting Google Drive...
Mounted at /content/drive
Downloading pose_landmarker.task...
Download complete.


In [None]:
# @title 2. Imports and Helper Functions
import cv2
import mediapipe as mp
import numpy as np
import json
import subprocess
import copy
from google.colab import files
from IPython.display import HTML, display
from base64 import b64encode
import yt_dlp
from tqdm.notebook import tqdm

# Skeleton edges: each (a, b) pair names two positions in a frame's landmark
# list that should be joined by a line when the skeleton is drawn.
POSE_CONNECTIONS = frozenset({
    # Face
    (0, 1), (1, 2), (2, 3), (3, 7),
    (0, 4), (4, 5), (5, 6), (6, 8),
    # Mouth
    (9, 10),
    # Left arm
    (11, 12), (11, 13), (13, 15),
    (15, 17), (15, 19), (15, 21), (17, 19),
    # Right arm
    (12, 14), (14, 16),
    (16, 18), (16, 20), (16, 22), (18, 20),
    # Torso
    (11, 23), (12, 24), (23, 24),
    # Left leg
    (23, 25), (25, 27), (27, 29), (27, 31), (29, 31),
    # Right leg
    (24, 26), (26, 28), (28, 30), (28, 32), (30, 32),
})

def display_video(video_path):
    """Build an inline HTML5 player for an MP4 file.

    The whole file is read into memory and embedded in the page as a base64
    ``data:`` URL, so the player works without serving the file separately.

    Args:
        video_path: Path of the MP4 file to embed.

    Returns:
        An ``HTML`` display object containing a ``<video>`` element.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Context manager closes the handle even if reading fails.
    with open(video_path, 'rb') as video_file:
        encoded = b64encode(video_file.read()).decode()
    data_url = "data:video/mp4;base64," + encoded
    return HTML(f"""
    <video width=640 controls>
          <source src="{data_url}" type="video/mp4">
    </video>
    """)

In [None]:
# @title 3. Core Processing Logic

def process_video(video_path, start_frame=0, end_frame=None, use_gpu=False):
    """Run the MediaPipe pose landmarker over a frame range of a video.

    Args:
        video_path: Path of the video to analyse.
        start_frame: Index of the first frame to process (clamped to >= 0).
        end_frame: Index one past the last frame to process. ``None`` or a
            value <= 0 means "until the end of the video"; larger values are
            clamped to the frame count reported by OpenCV.
        use_gpu: Request the GPU delegate instead of the CPU delegate.

    Returns:
        A list with one dict per processed frame, in frame order. Each dict has
        ``'frame'`` (frame index), ``'timestamp_ms'`` (derived from the frame
        index and the video fps) and ``'landmarks'`` (a list of dicts with
        ``x``, ``y``, ``z`` and ``visibility`` for the first detected pose, or
        an empty list when no pose was detected). The list is empty when the
        video cannot be opened or the range contains no frames.
    """
    BaseOptions = mp.tasks.BaseOptions
    PoseLandmarker = mp.tasks.vision.PoseLandmarker
    PoseLandmarkerOptions = mp.tasks.vision.PoseLandmarkerOptions
    VisionRunningMode = mp.tasks.vision.RunningMode

    options = PoseLandmarkerOptions(
        base_options=BaseOptions(
            model_asset_path='models/pose_landmarker.task',
            delegate=BaseOptions.Delegate.GPU if use_gpu else BaseOptions.Delegate.CPU
        ),
        running_mode=VisionRunningMode.VIDEO
    )

    pose_data = []
    with PoseLandmarker.create_from_options(options) as landmarker:
        cap = cv2.VideoCapture(video_path)
        pbar = None
        try:
            if not cap.isOpened():
                # Same result as before (no frames), but say why.
                print(f"Could not open video: {video_path}")
                return pose_data

            fps = cap.get(cv2.CAP_PROP_FPS)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            # "not fps > 0" also catches NaN, which "fps <= 0" lets through.
            if not fps > 0:
                fps = 30.0

            if end_frame is None or end_frame <= 0:
                end_frame = total_frames

            start_frame = max(0, int(start_frame))
            end_frame = min(total_frames, int(end_frame))

            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
            frame_count = start_frame

            # Never hand tqdm a negative total when start_frame >= end_frame.
            num_frames = max(0, end_frame - start_frame)
            pbar = tqdm(total=num_frames, desc="Inferencing Pose", leave=False)

            # Check the bound before reading so no frame past the range is decoded.
            while cap.isOpened() and frame_count < end_frame:
                ret, frame = cap.read()
                if not ret:
                    break

                timestamp_ms = int((frame_count / fps) * 1000)
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)

                detection_result = landmarker.detect_for_video(mp_image, timestamp_ms)

                # Only the first detected pose is kept.
                frame_landmarks = []
                if detection_result.pose_landmarks:
                    frame_landmarks = [
                        {
                            'x': landmark.x, 'y': landmark.y, 'z': landmark.z,
                            'visibility': landmark.visibility,
                        }
                        for landmark in detection_result.pose_landmarks[0]
                    ]

                pose_data.append({'frame': frame_count, 'timestamp_ms': timestamp_ms, 'landmarks': frame_landmarks})
                frame_count += 1
                pbar.update(1)
        finally:
            # Release the progress bar and the capture even if inference raises.
            if pbar is not None:
                pbar.close()
            cap.release()
    return pose_data

def render_skeleton_video(skeleton_data, output_video_path, width=1280, height=720, fps=30):
    """Draw per-frame pose landmarks on a black canvas and save them as a video.

    Frames are first written with OpenCV's ``mp4v`` codec to a temporary file,
    which is then re-encoded to H.264 with ffmpeg. If ffmpeg is unavailable or
    fails, the ``mp4v`` file is kept as the output instead of being discarded.

    Args:
        skeleton_data: List of frame dicts with a ``'frame'`` index and a
            ``'landmarks'`` list of dicts holding ``x``/``y`` (multiplied by
            ``width``/``height`` to get pixel positions) and optionally
            ``visibility``. The list is not modified.
        output_video_path: Destination path of the final video.
        width: Canvas width in pixels.
        height: Canvas height in pixels.
        fps: Frame rate of the written video.

    Returns:
        ``output_video_path``.

    Raises:
        RuntimeError: If OpenCV cannot open the temporary video for writing.
    """
    COLOR_POINT = (0, 255, 0)
    COLOR_LINE = (0, 255, 255)

    # Sort a copy: list.sort() would reorder the caller's list as a side effect.
    ordered_frames = sorted(skeleton_data, key=lambda item: item['frame'])

    # Derive the temp name from the extension-less root. A plain
    # str.replace('.mp4', ...) returns the output path unchanged when it has no
    # '.mp4' (ffmpeg would then read and write the same file) and also rewrites
    # any '.mp4' inside directory names.
    path_root, _ = os.path.splitext(output_video_path)
    temp_output_path = path_root + '_temp.mp4'

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(temp_output_path, fourcc, fps, (width, height))
    try:
        if not out.isOpened():
            raise RuntimeError(f"Could not open video writer for {temp_output_path}")

        for frame_data in tqdm(ordered_frames, desc="Rendering Skeleton", leave=False):
            img = np.zeros((height, width, 3), dtype=np.uint8)
            landmarks = frame_data.get('landmarks', [])
            # Keep only landmarks that are reasonably visible; a missing
            # 'visibility' key counts as fully visible.
            points = {idx: (int(lm['x'] * width), int(lm['y'] * height))
                      for idx, lm in enumerate(landmarks) if lm.get('visibility', 1.0) >= 0.5}

            for point in points.values():
                cv2.circle(img, point, 4, COLOR_POINT, -1)
            for start_idx, end_idx in POSE_CONNECTIONS:
                if start_idx in points and end_idx in points:
                    cv2.line(img, points[start_idx], points[end_idx], COLOR_LINE, 2)

            out.write(img)
    finally:
        # Finalise the container even if drawing fails part-way through.
        out.release()

    try:
        completed = subprocess.run(
            ['ffmpeg', '-y', '-i', temp_output_path, '-vcodec', 'libx264', '-f', 'mp4', output_video_path],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        transcoded = completed.returncode == 0 and os.path.exists(output_video_path)
    except OSError:
        # ffmpeg binary missing or not executable.
        transcoded = False

    if transcoded:
        if os.path.exists(temp_output_path):
            os.remove(temp_output_path)
    elif os.path.exists(temp_output_path):
        # Do not throw away the only rendered result when re-encoding fails.
        print("ffmpeg re-encode failed; keeping the mp4v-encoded video instead.")
        os.replace(temp_output_path, output_video_path)
    return output_video_path

In [None]:
# @title 4. Choose Input Source
input_source = "Google Drive" # @param ["Upload Video (Local)", "YouTube URL", "Google Drive"]
youtube_url = "" # @param {type:"string"}
drive_file_path = "/content/drive/MyDrive/mm-fit/extracted/w02/w02_rgb.mp4" # @param {type:"string"}
# Enter frame ranges separated by comma, e.g., 0-100, 200-300
frame_ranges_str = "16302-17074" # @param {type:"string"}
# Enter labels for each range separated by comma, e.g., squat, lunge
labels_str = "dumbbell_shoulder_press_1" # @param {type:"string"}
use_gpu = True # @param {type:"boolean"}
render_video = True # @param {type:"boolean"}
preview_mode = "Video (Fast)" # @param ["Video (Fast)", "Thumbnail Grid", "None"]

source_path = ""
input_filename = "video.mp4"


def parse_frame_ranges(ranges_str):
    """Parse a string such as ``"0-100, 200-300"`` into ``[(0, 100), (200, 300)]``.

    Args:
        ranges_str: Comma-separated ``start-end`` pairs of integer frame indices.

    Returns:
        A list of ``(start, end)`` integer tuples, in input order.

    Raises:
        ValueError: If an item is not of the form ``int-int``, or if its end is
            positive but not greater than its start. An end of 0 is let through
            because ``process_video`` treats ``end_frame <= 0`` as "until the
            end of the video".
    """
    parsed = []
    for item in ranges_str.split(','):
        start, end = map(int, item.strip().split('-'))
        if 0 < end <= start:
            raise ValueError(f"range '{item.strip()}' must have start < end")
        parsed.append((start, end))
    return parsed


def parse_labels(labels_text, count):
    """Return exactly ``count`` labels parsed from a comma-separated string.

    Surplus labels are dropped; missing or blank labels become ``range_<i>``,
    where ``i`` is the zero-based position of the range.

    Args:
        labels_text: Comma-separated label names.
        count: Number of labels required (one per frame range).

    Returns:
        A list of ``count`` non-empty label strings.
    """
    names = [name.strip() for name in labels_text.split(',')][:count]
    names.extend('' for _ in range(count - len(names)))
    # A blank label would otherwise produce output filenames with an empty part.
    return [name or f"range_{i}" for i, name in enumerate(names)]


# Parse frame ranges; any malformed item falls back to the default range.
try:
    frame_ranges = parse_frame_ranges(frame_ranges_str)
except ValueError as e:
    print(f"Error parsing frame ranges: {e}. Using default 0-300.")
    frame_ranges = [(0, 300)]

# Parse labels (padded/truncated to one label per range)
labels = parse_labels(labels_str, len(frame_ranges))

if input_source == "Upload Video (Local)":
    from google.colab import files
    uploaded = files.upload()
    if uploaded:
        source_path = list(uploaded.keys())[0]
        input_filename = source_path
        print(f"Uploaded {source_path}")
    else:
        print("No file uploaded.")

elif input_source == "YouTube URL":
    if youtube_url:
        print("Downloading YouTube video...")
        ydl_opts = {
            'format': 'best[ext=mp4]/best',
            'outtmpl': 'input_video.mp4',
            'noplaylist': True,
            # The output name is fixed, so without this yt-dlp would keep an
            # input_video.mp4 left over from a previously downloaded URL.
            'overwrites': True,
        }
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([youtube_url])
            source_path = 'input_video.mp4'
            input_filename = 'youtube_video.mp4'
        except Exception as e:
            print(f"Error downloading YouTube video: {e}")
    else:
        print("Please provide a YouTube URL")

elif input_source == "Google Drive":
    if drive_file_path:
        if os.path.exists(drive_file_path):
            source_path = drive_file_path
            input_filename = os.path.basename(drive_file_path)
            print(f"Using Drive file: {source_path}")
        else:
            print(f"File not found in Drive: {drive_file_path}. Ensure the path starts with /content/drive/MyDrive/...")
    else:
        print("Please provide a Google Drive file path.")

if source_path:
    print(f"Ready to process: {source_path}")
    print(f"Target frame ranges: {frame_ranges}")
    print(f"Labels: {labels}")

Using Drive file: /content/drive/MyDrive/mm-fit/extracted/w02/w02_rgb.mp4
Ready to process: /content/drive/MyDrive/mm-fit/extracted/w02/w02_rgb.mp4
Target frame ranges: [(16302, 17074)]
Labels: ['dumbbell_shoulder_press_1']


In [None]:
# @title 4.1 Preview Selected Segments
import os
import cv2
import subprocess
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Preview each selected frame range, either as a small re-encoded clip or as a
# row of five evenly spaced thumbnails, depending on preview_mode.
if 'source_path' in locals() and source_path and os.path.exists(source_path):
    if preview_mode == "None":
        print("Preview skipped.")
    else:
        cap = cv2.VideoCapture(source_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        cap.release()
        if fps <= 0: fps = 30.0

        print(f"Video FPS: {fps}")
        for i, (start_f, end_f) in enumerate(frame_ranges):
            label = labels[i] if i < len(labels) else f"range_{i}"
            print(f"\n--- Previewing Range {i+1}: '{label}' (Frames {start_f} to {end_f}) ---")

            # Clamp the range the same way process_video does (negative start
            # -> 0, end <= 0 -> end of video, end capped at the frame count) so
            # the preview shows the frames that will actually be processed.
            # Clamping to the length is skipped when OpenCV reports no count.
            clip_start = max(0, start_f)
            clip_end = end_f
            if total_frames > 0:
                clip_end = total_frames if end_f <= 0 else min(end_f, total_frames)
            if clip_end <= clip_start:
                print(f"Range {i+1} contains no frames of this video ({total_frames} frames); nothing to preview.")
                continue

            if preview_mode == "Video (Fast)":
                start_time = clip_start / fps
                duration = (clip_end - clip_start) / fps
                preview_path = f"preview_range_{i}.mp4"

                # Optimized ffmpeg: fast seek, scale down, ultrafast preset, low quality (crf 32)
                completed = subprocess.run([
                    'ffmpeg', '-y', '-ss', str(start_time), '-t', str(duration),
                    '-i', source_path,
                    '-vf', 'scale=480:-2',
                    '-vcodec', 'libx264', '-preset', 'ultrafast', '-crf', '32',
                    '-acodec', 'aac', '-pix_fmt', 'yuv420p',
                    '-f', 'mp4', preview_path
                ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

                # A failed run can leave a truncated or empty file behind, so the
                # file existing is not enough: require a clean exit and content.
                preview_ok = (completed.returncode == 0
                              and os.path.exists(preview_path)
                              and os.path.getsize(preview_path) > 0)
                try:
                    if preview_ok:
                        display(display_video(preview_path))
                    else:
                        print(f"Failed to create video preview for range {i+1}")
                finally:
                    # The clip is embedded in the output; the file is not needed.
                    if os.path.exists(preview_path):
                        os.remove(preview_path)

            elif preview_mode == "Thumbnail Grid":
                cap = cv2.VideoCapture(source_path)
                try:
                    indices = np.linspace(clip_start, clip_end - 1, 5, dtype=int)
                    fig, axes = plt.subplots(1, 5, figsize=(20, 4))
                    for idx, frame_idx in enumerate(indices):
                        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                        ret, frame = cap.read()
                        if ret:
                            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                            axes[idx].imshow(frame)
                            axes[idx].set_title(f"Frame {frame_idx}")
                        axes[idx].axis('off')
                    plt.tight_layout()
                    plt.show()
                    # Free the figure so repeated previews do not accumulate.
                    plt.close(fig)
                finally:
                    cap.release()
else:
    print("No video source found. Please run the 'Choose Input Source' cell first.")

In [None]:
# @title 5. Run Processing & View Results

# Run pose extraction for every selected range, save and download the JSON,
# and optionally render, show and download a skeleton video per range.
if source_path and os.path.exists(source_path):
    input_name = os.path.splitext(input_filename)[0]

    # Probe the source so the skeleton video matches its frame rate and aspect
    # ratio. Landmark x/y are scaled by the render width/height, so a fixed
    # 1280x720 canvas would stretch the skeleton for any non-16:9 source, and a
    # fixed 30 fps would change playback speed.
    probe = cv2.VideoCapture(source_path)
    src_fps = probe.get(cv2.CAP_PROP_FPS)
    src_width = int(probe.get(cv2.CAP_PROP_FRAME_WIDTH))
    src_height = int(probe.get(cv2.CAP_PROP_FRAME_HEIGHT))
    probe.release()

    render_height = 720
    render_fps = src_fps if src_fps > 0 else 30  # same fallback as process_video
    if src_width > 0 and src_height > 0:
        render_width = int(round(render_height * src_width / src_height))
        # Keep the width even; H.264 encoding with 4:2:0 chroma generally
        # rejects odd frame dimensions.
        render_width += render_width % 2
    else:
        render_width = 1280

    range_pbar = tqdm(zip(frame_ranges, labels), total=len(frame_ranges), desc="Overall Progress")
    for i, ((start_f, end_f), label) in enumerate(range_pbar):
        range_pbar.set_description(f"Processing '{label}'")

        # use_gpu comes from the input form; True requests the GPU delegate
        skeleton_data = process_video(
            source_path,
            start_frame=start_f,
            end_frame=end_f,
            use_gpu=use_gpu
        )

        if not skeleton_data:
            # Nothing was decoded for this range (for example, it lies past the
            # end of the video); say so instead of skipping it silently.
            print(f"\n--- Range {i+1} ('{label}'): no frames processed, nothing saved ---")
            continue

        # Use label in filename, keeping only filesystem-friendly characters
        safe_label = "".join([c for c in label if c.isalnum() or c in (' ', '.', '_', '-')]).strip().replace(' ', '_')

        # Save Raw JSON
        raw_json_path = f"output_data/{input_name}_{safe_label}_skeleton.json"
        with open(raw_json_path, "w") as f:
            json.dump(skeleton_data, f, indent=4)

        files.download(raw_json_path)

        print(f"\n--- Range {i+1} ('{label}') Results ---")
        print(f"JSON saved: {raw_json_path}")

        if render_video:
            # Render Video
            video_path = f"output_data/{input_name}_{safe_label}_rendered.mp4"
            render_skeleton_video(skeleton_data, video_path,
                                  width=render_width, height=render_height, fps=render_fps)

            if os.path.exists(video_path):
                print(f"Video saved: {video_path}")
                display(display_video(video_path))
                files.download(video_path)
            else:
                print(f"Video rendering failed; no file at {video_path}")
        else:
            print("Video rendering skipped.")
else:
    print("No video source found. Please run the previous cell first.")


--- Processing Range 1: 'dumbbell_shoulder_press_1' (Frames 16302 to 17074) ---

Processing complete.
JSON saved: output_data/w02_rgb_dumbbell_shoulder_press_1_skeleton.json
Video saved: output_data/w02_rgb_dumbbell_shoulder_press_1_rendered.mp4
Displaying range 1 ('dumbbell_shoulder_press_1'):


Downloading results for 'dumbbell_shoulder_press_1'...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>