# Load Ego4D FHO Clips Using PyTorchVideo

First load `fho_main.json`.

In [None]:
import json

# JSON text is UTF-8 by specification; pass the encoding explicitly so that
# loading does not depend on the platform's default locale encoding.
with open("../../ego4d/v2/annotations/fho_main.json", encoding="utf-8") as f:
    fho_main = json.load(f)

Let's pick an arbitrary action.

In [None]:
# Arbitrary pick: video 20 -> its annotated interval 2 -> narrated action 4.
video = fho_main["videos"][20]
interval = video["annotated_intervals"][2]
action = interval["narrated_actions"][4]

# (label, value) pairs, printed one per line in this order.
action_summary = [
    ("video_uid", video["video_uid"]),
    ("start_sec", action["start_sec"]),
    ("end_sec", action["end_sec"]),
    ("clip_uid", interval["clip_uid"]),
    ("clip_start_sec", action["clip_start_sec"]),
    ("clip_end_sec", action["clip_end_sec"]),
    ("narration_text", action["narration_text"]),
]
for label, value in action_summary:
    print(f"{label}: {value}")

Display the action from the full video.

In [None]:
from IPython.display import HTML, display


def display_video(kind, uid, start_sec, end_sec):
    """Embed an HTML5 player showing one segment of a local Ego4D video.

    The player seeks to ``start_sec`` and is paused whenever playback
    reaches ``end_sec``.

    Args:
        kind: Sub-directory of ``../../ego4d/v2/`` that holds the file
            (this notebook uses ``"full_scale"`` and ``"clips"``).
        uid: File stem of the ``.mp4``; also used to build the element id.
        start_sec: Offset in seconds to seek to.
        end_sec: Offset in seconds at which playback is paused.
    """
    # The script body is wrapped in an immediately-invoked function so that
    # `video` is local to this player. As a top-level `var` it becomes a
    # page-wide global: each later call overwrites it, and the `timeupdate`
    # handlers of earlier players then watch (and pause) the newest player
    # instead of their own.
    html = f"""
        <video id="video-{kind}-{uid}" width="480" height="320" controls>
            <source src="../../ego4d/v2/{kind}/{uid}.mp4" type="video/mp4">
            Your browser does not support the video tag.
        </video>
        <script>
            (function() {{
                var video = document.getElementById('video-{kind}-{uid}');
                video.currentTime = {start_sec};
                video.addEventListener('timeupdate', function() {{
                    if (video.currentTime >= {end_sec}) {{
                        video.pause();
                    }}
                }});
            }})();
        </script>
        """

    display(HTML(html))


# Play the action using timestamps relative to the full-length video.
display_video(
    kind="full_scale",
    uid=video["video_uid"],
    start_sec=action["start_sec"],
    end_sec=action["end_sec"],
)

Display the action from the clip.

In [None]:
# Play the same action using timestamps relative to the clip file.
display_video(
    kind="clips",
    uid=interval["clip_uid"],
    start_sec=action["clip_start_sec"],
    end_sec=action["clip_end_sec"],
)

They both mark the same action, so we can just use the clips, which are smaller and easier to work with.

Now let's load the clip, and extract the frames corresponding to the action.

In [None]:
from pytorchvideo.data.video import VideoPathHandler

video_path_handler = VideoPathHandler()

# First, load the video file that backs the clip. It gets its own name
# (`clip_video`) so the `video` annotation dict selected earlier is not
# overwritten and the earlier cells that index into it can still be re-run.
clip_video = video_path_handler.video_from_path(
    f"../../ego4d/v2/clips/{interval['clip_uid']}.mp4"
)

# Now extract the clip corresponding to the action
clip = clip_video.get_clip(action["clip_start_sec"], action["clip_end_sec"])

# frame tensor for the action
# the action is 8 seconds, and the clip is 30fps, so 240 frames are extracted.
# (C, T, H, W)
print(clip["video"].size())

# audio
print(clip["audio"])

Preprocess the video using the `BlipImageProcessor` that `Blip2Processor` exposes as `image_processor`.

In [None]:
from pytorchvideo.transforms import UniformTemporalSubsample
from torchvision.transforms import Compose
from transformers import Blip2Processor

# Processor published with the BLIP-2 OPT-2.7B checkpoint
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Subsample the action down to 8 frames
transforms = Compose([UniformTemporalSubsample(8)])
frames = transforms(clip["video"])

# Show the image processor's configuration
print(processor.image_processor)

# Move time to the front so that every frame is handled as one image of a
# batch: (C, T, H, W) -> (T, C, H, W)
frames_as_batch = frames.permute(1, 0, 2, 3)
image_features = processor.image_processor(frames_as_batch, return_tensors="pt")
processed_frames = image_features["pixel_values"]

# (T, C, H, W)
print(f"processed_frames.size(): {processed_frames.size()}")

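Let's take a look at the result as a GIF. Note that the colors will look all wrong: the image processor has rescaled and normalized the pixel values, and the helper below casts them straight to `uint8` without undoing that normalization.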

In [None]:
# Adapted from https://huggingface.co/docs/transformers/v4.28.1/en/tasks/video_classification#visualize-the-preprocessed-video-for-better-debugging # noqa
import imageio
import numpy as np
from IPython.display import Image


def create_gif(video_tensor, filename="sample.gif"):
    """Write the frames of a video tensor to a GIF file.

    Args:
        video_tensor: Frames laid out as
            (num_frames, num_channels, height, width).
        filename: Path of the GIF to write.

    Returns:
        The ``filename`` that was written.
    """
    # Each frame goes channels-first -> channels-last, then is cast to uint8
    # as-is (no rescaling or un-normalization is applied).
    gif_frames = [
        frame_chw.permute(1, 2, 0).numpy().astype(np.uint8)
        for frame_chw in video_tensor
    ]
    imageio.mimsave(filename, gif_frames, "GIF")
    return filename


def display_gif(video_tensor, gif_name="sample.gif"):
    """Render a video tensor as a GIF and return it for display.

    Args:
        video_tensor: Frames laid out as
            (num_frames, num_channels, height, width).
        gif_name: Path the GIF is written to before being loaded.

    Returns:
        An ``Image`` built from the written GIF file.
    """
    return Image(filename=create_gif(video_tensor, gif_name))


# Last expression of the cell, so the returned image is rendered as output.
display_gif(processed_frames, gif_name="sample.gif")