# Video QA with InstructBLIP

Load the clip for an arbitrary narrated action from Ego4D.

In [None]:
import json

from pytorchvideo.data.video import VideoPathHandler

# JSON is UTF-8 by specification; be explicit so the platform default encoding
# cannot break the load.
with open("../../ego4d/v2/annotations/fho_main.json", encoding="utf-8") as f:
    fho_main = json.load(f)

# Arbitrary pick: 21st video, its 3rd annotated interval, and that interval's
# 5th narrated action.
# The annotation record gets its own name so that `video` below refers only to
# the decoded video object.
video_meta = fho_main["videos"][20]
interval = video_meta["annotated_intervals"][2]
action = interval["narrated_actions"][4]

print(f'video_uid: {video_meta["video_uid"]}')
print(f'start_sec: {action["start_sec"]}')
print(f'end_sec: {action["end_sec"]}')
print(f'clip_uid: {interval["clip_uid"]}')
print(f'clip_start_sec: {action["clip_start_sec"]}')
print(f'clip_end_sec: {action["clip_end_sec"]}')
print(f'narration_text: {action["narration_text"]}')

# The file opened here is the clip (keyed by clip_uid), so the clip_*_sec
# offsets are the ones used to cut out the action.
video_path_handler = VideoPathHandler()
video = video_path_handler.video_from_path(
    f"../../ego4d/v2/clips/{interval['clip_uid']}.mp4"
)
clip = video.get_clip(action["clip_start_sec"], action["clip_end_sec"])

Load `blip2_vicuna_instruct:vicuna7b`.

In [None]:
import torch
from lavis.common.registry import registry
from lavis.models import load_preprocess
from omegaconf import OmegaConf


def load_lavis_model_and_preprocess(
    name: str, model_type: str, is_eval: bool = False, device: str = "cpu", **kwargs
):
    """Build a LAVIS model with config overrides, plus tensor-ready processors.

    Args:
        name: Model name used to look up the model class in the LAVIS registry.
        model_type: Variant whose default config file is loaded.
        is_eval: If true, the model is switched to eval mode.
        device: Device the model is moved to (a string or ``torch.device``).
            On CPU the model is cast to float32 first.
        **kwargs: Overrides merged into the ``model`` section of the default
            config before the model is built (e.g. ``llm_model``).

    Returns:
        A ``(model, vis_processors, txt_processors)`` tuple. Every visual
        processor has had its ``ToTensor`` transform removed so it can be fed
        tensors directly.

    Note:
        torchvision's ``ToTensor`` also rescales pixel values from [0, 255] to
        [0, 1]. With it removed, callers are presumably expected to do that
        rescale themselves before calling a processor -- verify against the
        processor's remaining transforms.
    """
    model_cls = registry.get_model_class(name)
    cfg = OmegaConf.load(model_cls.default_config_path(model_type))
    model_cfg = cfg.model
    model_cfg.update(**kwargs)
    model = model_cls.from_config(model_cfg)
    if is_eval:
        model.eval()
    # Normalising through torch.device covers "cpu", "cpu:0" and device objects.
    if torch.device(device).type == "cpu":
        model = model.float()
    model = model.to(device)

    vis_processors, txt_processors = load_preprocess(cfg.preprocess)

    # HACK: drop the ToTensor() transform b/c VideoPathHandler already gives us
    # tensors. Match on the class name instead of a fixed position so a
    # different pipeline layout cannot make us delete the wrong transform, and
    # skip processors that do not expose a transform list at all.
    for vis_processor in vis_processors.values():
        compose = getattr(vis_processor, "transform", None)
        pipeline = getattr(compose, "transforms", None)
        if pipeline is None:
            continue
        pipeline[:] = [t for t in pipeline if type(t).__name__ != "ToTensor"]

    return model, vis_processors, txt_processors


# Instantiate InstructBLIP (Vicuna-7B variant) in eval mode on the GPU.
# `llm_model` overrides the config entry of the same name; point it at the
# local Vicuna weights. The text processors are not needed here.
model, vis_processors, _ = load_lavis_model_and_preprocess(
    name="blip2_vicuna_instruct",
    model_type="vicuna7b",
    device="cuda",
    is_eval=True,
    llm_model="/path/to/vicuna-7b-v1.1",
)

Process the video and show it as a GIF.

In [None]:
import imageio.v3 as iio
import numpy as np
from IPython.display import HTML, display


def display_gif(video_tensor, gif_file_name):
    """Writes a video tensor to a looping GIF file and displays it inline.

    The video tensor is expected to have the following shape:
    (num_frames, num_channels, height, width), with pixel values on a
    [0, 255] scale. Values outside that range are saturated rather than
    wrapped when converting to uint8.

    Args:
        video_tensor: Frames to render; may live on any device.
        gif_file_name: Path the GIF is written to; also used as the ``src`` of
            the displayed ``<img>`` tag.
    """
    # .numpy() only works on CPU tensors that do not require grad.
    frames_thwc = video_tensor.detach().cpu().permute(0, 2, 3, 1)
    # Clamp first: a bare uint8 cast would wrap out-of-range values.
    frames_uint8 = frames_thwc.clamp(0, 255).numpy().astype(np.uint8)
    iio.imwrite(
        gif_file_name,
        frames_uint8,
        extension=".gif",
        # infinite loop
        loop=0,
    )
    display(HTML(f'<img src="{gif_file_name}" />'))


# Keep every 30th frame of the decoded clip, which is laid out as
# (channel, time, height, width).
frames = clip["video"][:, ::30, ...]
channel, time, _, _ = frames.size()
# (C, T, H, W) -> (T, C, H, W) so the image transforms see time as the batch dim.
frames = frames.permute(1, 0, 2, 3)
# pytorchvideo decodes frames as float32 in [0, 255]. The ToTensor() step that
# would have rescaled to [0, 1] was removed from the processor, so rescale here;
# otherwise the mean/std normalisation runs on the wrong scale.
frames = vis_processors["eval"](frames / 255.0)
_, _, height, width = frames.size()
frames = frames.view(time, channel, height, width)

# The processed frames are mean/std normalised and not directly viewable, so
# min-max rescale a copy to [0, 255] purely for the preview GIF.
lo, hi = frames.min(), frames.max()
preview = (frames - lo) / (hi - lo).clamp(min=1e-8) * 255
display_gif(preview, "vicuna_frames.gif")

# (T, C, H, W) -> (1, C, T, H, W): add a batch dim and move to the model's device.
frames = frames.permute(1, 0, 2, 3).unsqueeze(0).to(model.device)

Perform video QA.

In [None]:
# Ask a free-form question about the sampled frames and keep the first entry
# of what generate() returns.
generated_txt = model.generate(
    dict(image=frames, prompt="What is the camera wearer doing?")
)[0]
print(generated_txt)