In [1]:
# Install core ML libraries
!pip install --quiet transformers evaluate scikit-learn av

# Install PyTorchVideo straight from GitHub (avoids PyPI build issues)
!pip install --quiet git+https://github.com/facebookresearch/pytorchvideo.git


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.3/35.3 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.9.0.13 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system =

In [3]:
import os, tarfile
from pathlib import Path
from huggingface_hub import hf_hub_download

# Download UCF101 subset (pre-packaged for easy loading)
repo_id = "sayakpaul/ucf101-subset"
file_name = "UCF101_subset.tar.gz"
tar_path = hf_hub_download(repo_id=repo_id, filename=file_name, repo_type="dataset")
data_root = Path("data/ucf101")
with tarfile.open(tar_path) as tar:
    tar.extractall(path=data_root)

# The extracted folder should contain 'train' and 'test' subdirectories
train_dir = data_root / "UCF101_subset" / "train"
test_dir  = data_root / "UCF101_subset" / "test"

# Define which classes we consider as "cyberbullying"
target_classes = {
    "BaseballPitch",# hitting a punching bag (proxy for boxing)
    "BenchPress",     # hitting a speed bag
    "BasketballDunk",              # person punching something
    "Basketball"       # wrestling
}

# Verify these classes exist in the train directory
classes = sorted([d.name for d in train_dir.iterdir() if d.is_dir()])
print("Classes in dataset (sample):", classes[:10])
for cls in target_classes:
    if cls not in classes:
        raise ValueError(f"Class {cls} not found in dataset classes")

# Map class names to indices (assuming UCF101 loader uses sorted order)
class_to_idx = {c: i for i, c in enumerate(classes)}
target_indices = {class_to_idx[c] for c in target_classes}

print("Binary label mapping: {} classes labeled as bullying".format(len(target_indices)))


UCF101_subset.tar.gz:   0%|          | 0.00/171M [00:00<?, ?B/s]

Classes in dataset (sample): ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress']
Binary label mapping: 4 classes labeled as bullying


In [4]:
import torch
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification

# Load pretrained VideoMAE model and image processor
model_ckpt = "MCG-NJU/videomae-base"
image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)
label2id = {"not_bullying": 0, "cyberbullying": 1}
id2label = {v: k for k, v in label2id.items()}
model = VideoMAEForVideoClassification.from_pretrained(
    model_ckpt,
    label2id=label2id, id2label=id2label,
    ignore_mismatched_sizes=True
)
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


2025-05-18 17:18:54.542083: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747588734.777787      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747588734.855229      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/377M [00:00<?, ?B/s]

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VideoMAEForVideoClassification(
  (videomae): VideoMAEModel(
    (embeddings): VideoMAEEmbeddings(
      (patch_embeddings): VideoMAEPatchEmbeddings(
        (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
      )
    )
    (encoder): VideoMAEEncoder(
      (layer): ModuleList(
        (0-11): 12 x VideoMAELayer(
          (attention): VideoMAEAttention(
            (attention): VideoMAESelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
            )
            (output): VideoMAESelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): VideoMAEIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
    

In [5]:
from pytorchvideo.transforms import (
    ApplyTransformToKey, Normalize, RandomShortSideScale,
    UniformTemporalSubsample
)
from torchvision.transforms import Compose, Lambda, RandomCrop, RandomHorizontalFlip, Resize

mean = image_processor.image_mean
std = image_processor.image_std
if "shortest_edge" in image_processor.size:
    height = width = image_processor.size["shortest_edge"]
else:
    height = image_processor.size["height"]
    width  = image_processor.size["width"]
resize_to = (height, width)

num_frames = model.config.num_frames  # typically 16
sample_rate = 4
fps = 30
clip_duration = num_frames * sample_rate / fps

# Training transforms: sampling + augmentation
train_transform = Compose([
    ApplyTransformToKey(
        key="video",
        transform=Compose([
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            Normalize(mean, std),
            RandomShortSideScale(min_size=256, max_size=320),
            RandomCrop(resize_to),
            RandomHorizontalFlip(p=0.5),
        ])
    )
])

# Validation transforms: uniform sampling, no augmentation
val_transform = Compose([
    ApplyTransformToKey(
        key="video",
        transform=Compose([
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            Normalize(mean, std),
            Resize(resize_to),
        ])
    )
])


In [11]:
import torch
from torch.utils.data import DataLoader, IterableDataset
from pytorchvideo.data.ucf101 import Ucf101
from pytorchvideo.data.clip_sampling import RandomClipSampler, UniformClipSampler
from pytorchvideo.transforms import ApplyTransformToKey, Normalize, UniformTemporalSubsample
from torchvision.transforms import Compose, Lambda, RandomHorizontalFlip, CenterCrop, RandomResizedCrop


# 2️⃣ Video-only transforms
video_train_transform = Compose([
    UniformTemporalSubsample(num_frames),
    Lambda(lambda x: x / 255.0),
    RandomResizedCrop((224, 224)),
    RandomHorizontalFlip(p=0.5),
    Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
])
video_val_transform = Compose([
    UniformTemporalSubsample(num_frames),
    Lambda(lambda x: x / 255.0),
    CenterCrop((224, 224)),
    Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
])
train_transform = ApplyTransformToKey(key="video", transform=video_train_transform)
val_transform   = ApplyTransformToKey(key="video", transform=video_val_transform)

# 3️⃣ Paths
train_dir = "/kaggle/working/data/ucf101/UCF101_subset/train"
test_dir  = "/kaggle/working/data/ucf101/UCF101_subset/test"

# 4️⃣ Base datasets
train_base = Ucf101(data_path=train_dir,
                    clip_sampler=RandomClipSampler(clip_duration),
                    decode_audio=False,
                    transform=train_transform)
val_base   = Ucf101(data_path=test_dir,
                    clip_sampler=UniformClipSampler(clip_duration),
                    decode_audio=False,
                    transform=val_transform)

# 5️⃣ Iterable wrapper: convert int label → torch.tensor(0/1)
class BinaryUCF101(IterableDataset):
    def __init__(self, base_ds, target_idxs):
        super().__init__()
        self.base = base_ds
        self.target_idxs = target_idxs

    def __iter__(self):
        for sample in self.base:
            orig_label = sample["label"]          # this is already an int
            sample["label"] = torch.tensor(
                1 if orig_label in self.target_idxs else 0
            )
            yield sample

# 6️⃣ Instantiate binary datasets & dataloaders
train_ds = BinaryUCF101(train_base, target_indices)
val_ds   = BinaryUCF101(val_base,   target_indices)

train_loader = DataLoader(train_ds, batch_size=4, num_workers=2)
val_loader   = DataLoader(val_ds,   batch_size=4, num_workers=2)


In [12]:
batch = next(iter(train_loader))
print("Video batch shape:", batch["video"].shape)   # expected (B, T, C, H, W)
print("Labels:", batch["label"], type(batch["label"][0]))


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7954f56811c0>
Exception ignored in: Traceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__del__ at 0x7954f56811c0>
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
Traceback (most recent call last):
      File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
self._shutdown_workers()    
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
self._shutdown_workers()
    if w.is_alive():  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

      if w.is_alive():   
     ^ ^ ^^^^ ^ ^^^^^^^^^^^
^  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
^    ^assert self._parent_pid == os.getpid(), 'can only test a child process'^
^ ^ 
  File "/usr/lib/pyth

Video batch shape: torch.Size([4, 3, 16, 224, 224])
Labels: tensor([0, 0, 1, 1]) <class 'torch.Tensor'>


In [13]:
import torch.nn as nn
from torch.optim import AdamW
from sklearn.metrics import f1_score, accuracy_score

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    train_preds, train_labels = [], []
    for batch in train_loader:
        videos = batch["video"].to(device)          # shape: (B, C, T, H, W)
        labels = batch["label"].to(device)          # shape: (B,)
        optimizer.zero_grad()
        outputs = model(pixel_values=videos, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Collect predictions
        logits = outputs.logits.detach().cpu()
        preds = torch.argmax(logits, dim=1).numpy()
        train_preds.extend(preds.tolist())
        train_labels.extend(labels.cpu().numpy().tolist())
    # Compute training metrics
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1  = f1_score(train_labels, train_preds)
    
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            videos = batch["video"].to(device)
            labels = batch["label"].to(device)
            outputs = model(pixel_values=videos)
            logits = outputs.logits.cpu()
            preds = torch.argmax(logits, dim=1).numpy()
            val_preds.extend(preds.tolist())
            val_labels.extend(labels.cpu().numpy().tolist())
    val_acc = accuracy_score(val_labels, val_preds)
    val_f1  = f1_score(val_labels, val_preds)

    print(f"Epoch {epoch+1}/{num_epochs}: "
          f"Train Acc={train_acc:.3f}, F1={train_f1:.3f}; "
          f"Val Acc={val_acc:.3f}, F1={val_f1:.3f}")


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7954f56811c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
     Exception ignored in:   <function _MultiProcessingDataLoaderIter.__del__ at 0x7954f56811c0>^
^Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^^    ^self._shutdown_workers()^
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
^    ^if w.is_alive():^
^ ^ 
   File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
      assert self._parent_pid == os.getpid(), 'can only test a child process' 
  ^ ^^^  ^ ^^  ^^ ^ ^^ 
 ^  File "/us

ValueError: Make sure that the channel dimension of the pixel values match with the one set in the configuration.

In [14]:
import torch.nn as nn
from torch.optim import AdamW
from sklearn.metrics import f1_score, accuracy_score

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    train_preds, train_labels = [], []
    for batch in train_loader:
        # permute from (B, C, T, H, W) → (B, T, C, H, W)
        videos = batch["video"].permute(0, 2, 1, 3, 4).to(device)
        labels = batch["label"].to(device)
        optimizer.zero_grad()
        outputs = model(pixel_values=videos, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Collect predictions
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        train_preds.extend(preds.tolist())
        train_labels.extend(labels.cpu().numpy().tolist())

    train_acc = accuracy_score(train_labels, train_preds)
    train_f1  = f1_score(train_labels, train_preds)

    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            videos = batch["video"].permute(0, 2, 1, 3, 4).to(device)
            labels = batch["label"].to(device)
            outputs = model(pixel_values=videos)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            val_preds.extend(preds.tolist())
            val_labels.extend(labels.cpu().numpy().tolist())

    val_acc = accuracy_score(val_labels, val_preds)
    val_f1  = f1_score(val_labels, val_preds)

    print(
        f"Epoch {epoch+1}/{num_epochs}: "
        f"Train Acc={train_acc:.3f}, F1={train_f1:.3f}; "
        f"Val Acc={val_acc:.3f}, F1={val_f1:.3f}"
    )


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7954f56811c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
Exception ignored in:     <function _MultiProcessingDataLoaderIter.__del__ at 0x7954f56811c0>self._shutdown_workers()

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        if w.is_alive():self._shutdown_workers()
 
   File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
       if w.is_alive(): 
  ^ ^ ^ ^  ^ ^^^^^^^^^^^^^
^^  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
^    ^assert self._parent_pid == os.getpid(), 'can only test a child process'^
^ 
   File "/usr/lib/pyth

Epoch 1/3: Train Acc=0.637, F1=0.493; Val Acc=0.742, F1=0.630
Epoch 2/3: Train Acc=0.793, F1=0.721; Val Acc=0.684, F1=0.675
Epoch 3/3: Train Acc=0.917, F1=0.897; Val Acc=0.813, F1=0.603


In [15]:
from sklearn.metrics import accuracy_score, f1_score

# 1. Create a test DataLoader exactly like train/val
test_base = Ucf101(
    data_path="/kaggle/working/data/ucf101/UCF101_subset/test",
    clip_sampler=UniformClipSampler(clip_duration),
    decode_audio=False,
    transform=val_transform,        # same as validation
)
test_ds = BinaryUCF101(test_base, target_indices)
test_loader = DataLoader(test_ds, batch_size=4, num_workers=2)

# 2. Run inference on test set
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        videos = batch["video"].permute(0, 2, 1, 3, 4).to(device)
        labels = batch["label"].to(device)
        logits = model(pixel_values=videos).logits
        preds = logits.argmax(dim=-1).cpu().numpy()
        test_preds.extend(preds.tolist())
        test_labels.extend(labels.cpu().numpy().tolist())

# 3. Compute metrics
test_acc = accuracy_score(test_labels, test_preds)
test_f1  = f1_score(test_labels, test_preds)
print(f"Test  Acc={test_acc:.3f}  |  Test F1={test_f1:.3f}")


Test  Acc=0.813  |  Test F1=0.603
