In [1]:
import pandas as pd

In [3]:
def load_class_index(path):
    classes_df = pd.read_csv(path, header=None, names=["index", "label"], sep=" ")
    return {row["label"]: row["index"] - 1 for _, row in classes_df.iterrows()}


def load_split_file(file_path, base_dir="UCF-101"):
    labels_df = pd.read_csv(
        file_path,
        header=None,
        names=["file", "label_index"],
        sep=" ",
    )
    labels_df["label"] = labels_df["file"].apply(lambda x: x.split("/")[0])
    labels_df["file"] = labels_df["file"].apply(lambda x: f"{base_dir}/{x}")
    return [(file, label) for [file, label] in (labels_df[["file", "label"]]).to_numpy()]

class_map = load_class_index("ucfTrainTestlist/classInd.txt")
train_list = load_split_file("ucfTrainTestlist/trainlist01.txt")
train_list[:5] 

[('UCF-101/ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi', 'ApplyEyeMakeup'),
 ('UCF-101/ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c02.avi', 'ApplyEyeMakeup'),
 ('UCF-101/ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c03.avi', 'ApplyEyeMakeup'),
 ('UCF-101/ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c04.avi', 'ApplyEyeMakeup'),
 ('UCF-101/ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c05.avi', 'ApplyEyeMakeup')]

In [None]:
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    CenterCrop
)

video_transform = Compose(
    [
        ApplyTransformToKey(
            "video",
            transform=Compose(
                [
                    UniformTemporalSubsample(20), # Uniformly sample 20 frames from the video
                    Lambda(lambda x: x / 255.0),  # Normalize pixel values to [0, 1]
                    Normalize([0.45, 0.45, 0.45], [0.225, 0.225, 0.225]), # Normalize using ImageNet stats
                    RandomShortSideScale(min_size=256, max_size=320), # Randomly scale the shorter side of the video
                    CenterCrop(224), # Center crop the video to 224x224
                ]
            ),
        ),
    ]
)

In [None]:
from pytorchvideo.data import labeled_video_dataset, make_clip_sampler

train_dataset = labeled_video_dataset(
    "UCF-101",
    clip_sampler=make_clip_sampler("random", 2),
    transform=video_transform,
    decode_audio=False,
)

In [None]:
from torch.utils.data import DataLoader

loader = DataLoader(train_dataset, batch_size=4, num_workers=4)

In [None]:
batch = next(iter(train_dataset))
batch.get('video').shape
batch.get('label')

tensor([[[[211., 211., 211.,  ..., 240., 242., 240.],
          [211., 211., 211.,  ..., 240., 242., 242.],
          [211., 211., 211.,  ..., 242., 243., 243.],
          ...,
          [188., 188., 188.,  ..., 123., 122., 122.],
          [187., 187., 187.,  ..., 123., 122., 122.],
          [187., 187., 187.,  ..., 123., 122., 122.]],

         [[209., 209., 209.,  ..., 242., 244., 243.],
          [209., 209., 209.,  ..., 242., 244., 244.],
          [209., 209., 209.,  ..., 243., 245., 245.],
          ...,
          [186., 186., 186.,  ..., 123., 122., 122.],
          [185., 185., 185.,  ..., 123., 122., 122.],
          [185., 185., 185.,  ..., 123., 122., 122.]],

         [[209., 209., 209.,  ..., 242., 244., 243.],
          [209., 209., 209.,  ..., 242., 244., 244.],
          [209., 209., 209.,  ..., 243., 245., 245.],
          ...,
          [186., 186., 186.,  ..., 123., 122., 122.],
          [185., 185., 185.,  ..., 123., 120., 120.],
          [185., 185., 185.,  ...

In [None]:
import torch

model_name = "slowfast_r50"
model = torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)
