In [1]:
%%capture
!pip install -U yt_dlp youtube-search-python

In [2]:
%%capture
!pip install moviepy

In [3]:
%%capture
!pip install "httpx<0.27" --force-reinstall

In [9]:
%%capture
!python3 -m pip install opencv-python==4.9.0.80 mediapipe==0.10.5 torch==2.2.0

In [5]:
from youtubesearchpython import VideosSearch
import yt_dlp
import os

def parse_duration(duration_str):
    parts = duration_str.split(':')
    if len(parts) == 2:  # mm:ss
        minutes, seconds = map(int, parts)
        return minutes * 60 + seconds
    elif len(parts) == 3:  # hh:mm:ss
        hours, minutes, seconds = map(int, parts)
        return hours * 3600 + minutes * 60 + seconds
    return 0  # if unknown or invalid

def download_videos(query, label, num_videos=15, save_dir='videos'):
    path = os.path.join(save_dir, label)
    os.makedirs(path, exist_ok=True)

    collected = 0
    search = VideosSearch(query, limit=30)  # Fetch more to filter

    for result in search.result()['result']:
        if 'duration' not in result:
            continue  # Skip livestreams or missing info

        duration_sec = parse_duration(result['duration'])
        if duration_sec >= 300:
            continue  # Skip videos 5 min or longer

        url = result['link']
        output_filename = os.path.join(path, f"{label}_{collected + 1}.mp4")

        ydl_opts = {
            'format': 'best[ext=mp4]/best',
            'outtmpl': output_filename,
            'quiet': True,
            'noplaylist': True,
        }

        try:
            print(f"Downloading [{label}] video {collected + 1}: {result['title']}")
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            collected += 1
        except Exception as e:
            print(f"Failed to download {url}: {e}")

        if collected >= num_videos:
            break

# Download 15 short ballet and 15 short hip-hop videos
download_videos("ballet dance performance", "ballet", num_videos=15)
download_videos("hip hop dance performance", "hiphop", num_videos=15)

Downloading [ballet] video 1: Jeeho Lee WOWS the audience with the La Esmeralda Finale!
Downloading [ballet] video 2: Dance of the Sugar Plum Fairy from The Nutcracker (The Royal Ballet)
Downloading [ballet] video 3: Swan Lake – Dance of the cygnets (The Royal Ballet)
Downloading [ballet] video 4: Ella is FLYING 😍🩰✨ #ballerina #ballet #shorts #ad
Downloading [ballet] video 5: OKC Ballet collaborates with Dance for Parkinson's group for unique performance
Downloading [ballet] video 6: BALLET in 30 sec - GISELLE - Maria Khoreva in #shorts
Downloading [ballet] video 7: WOAH!! THAT BALANCE 😱 @tessa_rivadulla #ballet #balletclass
Downloading [ballet] video 8: Ballet is beautiful 🥹❤️🩰 #ballet #shorts #shortfilm
Downloading [ballet] video 9: Watch Our Favorite Tiny Dancer Perform the Nutcracker with the New York City Ballet
Downloading [ballet] video 10: Don Quixote – Act III Kitri Variation (Akane Takada, The Royal Ballet)
Downloading [ballet] video 11: Swan Lake – End of Act II (The Royal B

In [1]:
!pip install numpy==1.26.4 --force-reinstall

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydf 0.11.0 requires protobuf<6.0.0,>=5.29.1, but

In [3]:
from moviepy.editor import VideoFileClip
import os

def split_video(video_path, clip_length=30, output_dir="clips"):
    try:
        video = VideoFileClip(video_path)
        duration = int(video.duration)
        base_name = os.path.splitext(os.path.basename(video_path))[0]

        os.makedirs(output_dir, exist_ok=True)

        clip_count = 0
        for start in range(0, duration, clip_length):
            end = min(start + clip_length, duration)
            subclip = video.subclip(start, end)
            output_path = os.path.join(output_dir, f"{base_name}_part{clip_count + 1}.mp4")
            subclip.write_videofile(output_path, codec="libx264", audio_codec="aac", logger=None)
            clip_count += 1

        print(f"✅ Done: {video_path} → {clip_count} clips.")
    except Exception as e:
        print(f"❌ Failed to process {video_path}: {e}")

# Process all videos in both folders
for label in ['ballet', 'hiphop']:
    input_dir = f"videos/{label}"
    output_dir = f"clips/{label}"
    for filename in os.listdir(input_dir):
        if filename.endswith(".mp4"):
            full_path = os.path.join(input_dir, filename)
            split_video(full_path, clip_length=30, output_dir=output_dir)

✅ Done: videos/ballet/ballet_12.mp4 → 1 clips.
✅ Done: videos/ballet/ballet_2.mp4 → 6 clips.
✅ Done: videos/ballet/ballet_1.mp4 → 1 clips.
✅ Done: videos/ballet/ballet_8.mp4 → 1 clips.
✅ Done: videos/ballet/ballet_3.mp4 → 4 clips.
✅ Done: videos/ballet/ballet_13.mp4 → 4 clips.
✅ Done: videos/ballet/ballet_11.mp4 → 2 clips.
✅ Done: videos/ballet/ballet_10.mp4 → 3 clips.
✅ Done: videos/ballet/ballet_5.mp4 → 4 clips.
✅ Done: videos/ballet/ballet_6.mp4 → 2 clips.
✅ Done: videos/ballet/ballet_9.mp4 → 4 clips.
✅ Done: videos/ballet/ballet_7.mp4 → 1 clips.
✅ Done: videos/ballet/ballet_4.mp4 → 1 clips.
✅ Done: videos/hiphop/hiphop_9.mp4 → 7 clips.
✅ Done: videos/hiphop/hiphop_10.mp4 → 9 clips.
✅ Done: videos/hiphop/hiphop_13.mp4 → 8 clips.
✅ Done: videos/hiphop/hiphop_6.mp4 → 7 clips.
✅ Done: videos/hiphop/hiphop_3.mp4 → 7 clips.
✅ Done: videos/hiphop/hiphop_12.mp4 → 5 clips.
✅ Done: videos/hiphop/hiphop_11.mp4 → 1 clips.
✅ Done: videos/hiphop/hiphop_8.mp4 → 9 clips.
✅ Done: videos/hiphop/hiph

In [1]:
import cv2
import numpy as np
import mediapipe as mp
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# ----------------------
# Config
# ----------------------
TARGET_SIZE = (224, 224)
SEQUENCE_LENGTH = 32
STRIDE = 16
USE_SKELETON = True
NUM_CLASSES = 5

In [3]:
# ----------------------
# Pose Estimation Setup
# ----------------------
mp_pose = mp.solutions.pose
pose_model = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5)

def extract_pose(frame):
    results = pose_model.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    if not results.pose_landmarks:
        return np.zeros((33, 3))  # x, y, z
    return np.array([[l.x, l.y, l.z] for l in results.pose_landmarks.landmark])

In [4]:
def process_video(video_path, use_pose=True, frame_skip=3):
    cap = cv2.VideoCapture(video_path)
    frames, keypoints = [], []
    frame_idx = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_idx % frame_skip == 0:
            frame = cv2.resize(frame, TARGET_SIZE)

            if use_pose:
                keypoints.append(extract_pose(frame).flatten())

            frame = frame.astype(np.float32) / 255.0
            frames.append(frame)

        frame_idx += 1

    cap.release()
    return np.array(frames), np.array(keypoints) if use_pose else None

In [5]:
# ----------------------
# Create Fixed-Length Clips
# ----------------------
def create_clips(frames, keypoints=None, sequence_length=32, stride=16):
    clips, pose_clips = [], []
    for i in range(0, len(frames) - sequence_length + 1, stride):
        if keypoints is not None:
            pose_clip = keypoints[i:i + sequence_length]
            if len(pose_clip) == sequence_length:
                pose_clips.append(pose_clip)

    return np.array(pose_clips)

In [6]:
class PoseTransformer(nn.Module):
    def __init__(self, input_size=99, seq_len=32, num_classes=NUM_CLASSES, d_model=128, nhead=4, num_layers=2):
        super(PoseTransformer, self).__init__()
        self.input_fc = nn.Linear(input_size, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))  # learnable [CLS] token
        self.fc_out = nn.Linear(d_model, num_classes)

    def forward(self, x):
        B, T, _ = x.shape
        x = self.input_fc(x)  # [B, T, D]
        cls_tokens = self.cls_token.repeat(B, 1, 1)  # [B, 1, D]
        x = torch.cat([cls_tokens, x], dim=1)  # [B, T+1, D]
        x = x.permute(1, 0, 2)  # Transformer expects [T, B, D]
        out = self.transformer(x)[0]  # take the [CLS] token's output
        return self.fc_out(out)

In [7]:
# ----------------------
# Train Model
# ----------------------
from sklearn.model_selection import train_test_split

def train_model(video_paths, labels):
    all_clips = []
    all_labels = []

    for i, video in enumerate(video_paths):
        _, keypoints = process_video(video, use_pose=True, frame_skip=3)
        pose_clips = create_clips(_, keypoints, sequence_length=SEQUENCE_LENGTH, stride=STRIDE)
        all_clips.extend(pose_clips)
        all_labels.extend([labels[i]] * len(pose_clips))

    X = np.stack(all_clips)  # shape: [N_clips, 32, 99]
    y = np.array(all_labels)

    # Split into train/val sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert to tensors
    X_train, y_train = torch.tensor(X_train).float(), torch.tensor(y_train).long()
    X_val, y_val = torch.tensor(X_val).float(), torch.tensor(y_val).long()

    # Create loaders
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=16, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=16)

    # Initialize Transformer model
    model = PoseTransformer(input_size=99, seq_len=SEQUENCE_LENGTH, num_classes=NUM_CLASSES,
                            d_model=128, nhead=4, num_layers=2)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    for epoch in range(20):
        model.train()
        running_loss = 0.0
        for xb, yb in train_loader:
            pred = model(xb)
            loss = criterion(pred, yb)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                pred = model(xb)
                loss = criterion(pred, yb)
                val_loss += loss.item()
                correct += (pred.argmax(1) == yb).sum().item()
                total += yb.size(0)

        scheduler.step()

        print(f"Epoch {epoch+1} | Train Loss: {running_loss/len(train_loader):.4f} | "
              f"Val Loss: {val_loss/len(val_loader):.4f} | Val Acc: {100*correct/total:.2f}%")

    return model

In [8]:

# ----------------------
# Predict on New Video
# ----------------------
def predict(video_path, model):
    _, keypoints = process_video(video_path, use_pose=True)
    pose_clips = create_clips(_, keypoints, sequence_length=SEQUENCE_LENGTH, stride=STRIDE)

    model.eval()
    with torch.no_grad():
        inputs = torch.tensor(pose_clips).float()
        outputs = model(inputs)
        avg_probs = torch.softmax(outputs, dim=1).mean(dim=0)
        pred_class = torch.argmax(avg_probs).item()

    return pred_class

In [9]:
import random

def get_labeled_video_paths(clip_dir='clips'):
    train_videos, train_labels = [], []
    test_videos, test_labels = [], []

    for label_name, label_value in [('ballet', 0), ('hiphop', 1)]:
        folder_path = os.path.join(clip_dir, label_name)
        video_files = sorted([f for f in os.listdir(folder_path) if f.endswith('.mp4')])

        if len(video_files) < 2:
            raise ValueError(f"Not enough videos in {folder_path} to split into train/test.")

        random.shuffle(video_files)  # Shuffle to randomize test selection

        test_file = video_files.pop()  # Leave one for testing
        test_videos.append(os.path.join(folder_path, test_file))
        test_labels.append(label_value)

        for file in video_files:
            train_videos.append(os.path.join(folder_path, file))
            train_labels.append(label_value)

    return train_videos, train_labels, test_videos, test_labels


In [11]:
import os
train_videos, train_labels, test_videos, test_labels = get_labeled_video_paths()
model = train_model(train_videos, train_labels)



Epoch 1 | Train Loss: 0.5726 | Val Loss: 0.6206 | Val Acc: 59.17%
Epoch 2 | Train Loss: 0.5268 | Val Loss: 0.4758 | Val Acc: 76.94%
Epoch 3 | Train Loss: 0.5015 | Val Loss: 0.5128 | Val Acc: 76.94%
Epoch 4 | Train Loss: 0.4742 | Val Loss: 0.4654 | Val Acc: 81.11%
Epoch 5 | Train Loss: 0.4759 | Val Loss: 0.4602 | Val Acc: 84.17%
Epoch 6 | Train Loss: 0.4203 | Val Loss: 0.3852 | Val Acc: 84.44%
Epoch 7 | Train Loss: 0.4253 | Val Loss: 0.3813 | Val Acc: 84.17%
Epoch 8 | Train Loss: 0.4017 | Val Loss: 0.3623 | Val Acc: 85.83%
Epoch 9 | Train Loss: 0.3955 | Val Loss: 0.3753 | Val Acc: 86.94%
Epoch 10 | Train Loss: 0.3897 | Val Loss: 0.3678 | Val Acc: 85.83%
Epoch 11 | Train Loss: 0.3852 | Val Loss: 0.3580 | Val Acc: 86.39%
Epoch 12 | Train Loss: 0.3835 | Val Loss: 0.3505 | Val Acc: 86.67%
Epoch 13 | Train Loss: 0.3703 | Val Loss: 0.3237 | Val Acc: 87.50%
Epoch 14 | Train Loss: 0.3698 | Val Loss: 0.3426 | Val Acc: 87.22%
Epoch 15 | Train Loss: 0.3701 | Val Loss: 0.3226 | Val Acc: 88.06%
Epoc

In [12]:
print("\n🧪 Test Results:")
for i, video_path in enumerate(test_videos):
    pred = predict(video_path, model)
    print(f"Video: {os.path.basename(video_path)} | Actual: {test_labels[i]} | Predicted: {pred}")


🧪 Test Results:
Video: ballet_13_part1.mp4 | Actual: 0 | Predicted: 0
Video: hiphop_8_part5.mp4 | Actual: 1 | Predicted: 1
