# Test Ego4dFHOClipDataset

In [None]:
import random

import imageio.v3 as iio
import numpy as np
from IPython.display import Image

# Helper functions for sampling dataset items and previewing clips as GIFs.


def draw_random_items(dataset, num):
    """Return ``num`` distinct items drawn at random from ``dataset``.

    Indices are sampled without replacement via ``random.sample`` and each
    one is looked up with ``dataset[idx]``, so ``dataset`` only needs to
    support ``len()`` and integer indexing.

    Args:
        dataset: Indexable collection supporting ``len()``.
        num: Number of items to draw.

    Returns:
        A list of ``num`` items, in the order their indices were sampled.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``num`` is
            negative or larger than ``len(dataset)``.
    """
    # Sample straight from the lazy ``range``: ``random.sample`` accepts any
    # sequence, so materializing every index in a list first only wastes
    # memory (and time) on large datasets.
    return [dataset[idx] for idx in random.sample(range(len(dataset)), num)]


def display_gif(video_tensor, gif_file_name):
    """Write a video tensor to disk as a looping GIF and return it for display.

    Args:
        video_tensor: Video laid out as
            (num_channels, num_frames, height, width).
        gif_file_name: Path the GIF file is written to.

    Returns:
        An ``IPython.display.Image`` created from ``gif_file_name``.
    """
    # Move channels last so the layout becomes
    # (num_frames, height, width, num_channels).
    channels_last = video_tensor.permute(1, 2, 3, 0)
    frames = channels_last.numpy().astype(np.uint8)
    # loop=0 makes the GIF repeat forever.
    iio.imwrite(gif_file_name, frames, extension=".gif", loop=0)
    return Image(gif_file_name)

In [None]:
from pytorchvideo.transforms import ApplyTransformToKey, UniformTemporalSubsample
from torchvision.transforms import Compose, RandomHorizontalFlip, RandomRotation
from transformers import Blip2Processor

# Locations of the Ego4D v2 FHO annotation file and the clip directory,
# relative to this notebook.
annotation_path = "../../ego4d/v2/annotations/fho_main.json"
clip_path = "../../ego4d/v2/clips/"

# BLIP-2 processor passed to the dataset constructor.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Augmentation pipeline applied only to the "pixel_values" entry of an item:
# subsample 8 frames, then randomly flip horizontally and rotate within
# +/-45 degrees.
transform = Compose(
    [
        ApplyTransformToKey(
            key="pixel_values",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_samples=8),
                    RandomHorizontalFlip(),
                    RandomRotation(degrees=(-45, 45)),
                ]
            ),
        )
    ]
)

Randomly draw 3 items from a dataset configured for decoder-only LMs.

In [None]:
import sys

sys.path.append("../../")

from IPython.display import display

from ego4d_dataset import Ego4dFHOClipDataset

# Build the dataset with the decoder-only language model flag switched on.
dataset = Ego4dFHOClipDataset(
    annotation_path,
    clip_path,
    processor,
    transform=transform,
    use_decoder_only_language_model=True,
)

# Print the token ids and labels of three random items and preview each clip
# as a GIF.
samples = draw_random_items(dataset, 3)
for i, item in enumerate(samples):
    print(f"input_ids: {item['input_ids']}")
    # .get() prints None instead of raising if an item has no "labels" key.
    print(f"labels: {item.get('labels')}")
    gif = display_gif(item["pixel_values"], f"decoder_only_lm_{i}.gif")
    display(gif)

Randomly draw 3 items from a dataset configured for seq2seq LMs.

In [None]:
# Build the dataset with the decoder-only language model flag switched off
# (the seq2seq configuration).
dataset = Ego4dFHOClipDataset(
    annotation_path,
    clip_path,
    processor,
    transform=transform,
    use_decoder_only_language_model=False,
)

# Print the token ids and labels of three random items and preview each clip
# as a GIF.
samples = draw_random_items(dataset, 3)
for i, item in enumerate(samples):
    print(f"input_ids: {item['input_ids']}")
    print(f"labels: {item['labels']}")
    gif = display_gif(item["pixel_values"], f"seq2seq_lm_{i}.gif")
    display(gif)