# Video QA with Video Blip2

Load the clip for an arbitrary narrated action from Ego4D.

In [None]:
import json

from pytorchvideo.data.video import VideoPathHandler

# Read the annotation file. JSON is UTF-8 text, so decode it explicitly instead
# of relying on the platform's locale-dependent default encoding.
with open("../../ego4d/v2/annotations/fho_main.json", encoding="utf-8") as f:
    fho_main = json.load(f)

# Pick one arbitrary narrated action: video 20 -> annotated interval 2 -> action 4.
# The annotation dict gets its own name so that `video` below only ever refers
# to the decoded video object.
video_annotation = fho_main["videos"][20]
interval = video_annotation["annotated_intervals"][2]
action = interval["narrated_actions"][4]

# Show the identifiers and time spans of the selected action.
print(f'video_uid: {video_annotation["video_uid"]}')
print(f'start_sec: {action["start_sec"]}')
print(f'end_sec: {action["end_sec"]}')
print(f'clip_uid: {interval["clip_uid"]}')
print(f'clip_start_sec: {action["clip_start_sec"]}')
print(f'clip_end_sec: {action["clip_end_sec"]}')
print(f'narration_text: {action["narration_text"]}')

# Open the clip file named after the interval's clip_uid and extract only the
# span between the action's clip-relative start and end times.
video_path_handler = VideoPathHandler()
video = video_path_handler.video_from_path(
    f"../../ego4d/v2/clips/{interval['clip_uid']}.mp4"
)
clip = video.get_clip(action["clip_start_sec"], action["clip_end_sec"])

Load `Salesforce/blip2-opt-2.7b`.

In [None]:
import sys

# Add the directory two levels up to the module search path *before* the
# `video_blip2` import below; presumably that is where the local package
# lives (TODO confirm against the repository layout).
sys.path.append("../../")
import torch
from transformers import Blip2Processor

from video_blip2 import VideoBlip2ForConditionalGeneration

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Both the processor and the model are initialised from the same checkpoint.
pretrained_name = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(pretrained_name)
model = VideoBlip2ForConditionalGeneration.from_pretrained(pretrained_name)
model = model.to(device)

Perform Video QA without frame subsampling.

In [None]:
prompt = "Question: what is the camera wearer doing? Answer:"

# Swap the first two axes of the clip tensor before handing it to the
# processor as `images` (presumably channel-first video -> one image per
# frame; TODO confirm the clip layout).
all_frames = clip["video"].permute(1, 0, 2, 3)
inputs = processor(images=all_frames, text=prompt, return_tensors="pt").to(device)

# Swap the first two axes back and prepend a leading axis of size 1 for the
# model input.
pixel_values = inputs["pixel_values"].permute(1, 0, 2, 3)
inputs["pixel_values"] = pixel_values.unsqueeze(0)

input_sizes = {name: tensor.size() for name, tensor in inputs.items()}
print(f"inputs: {input_sizes}")

# Generate token ids, decode them, and keep the first decoded string.
generated_ids = model.generate(**inputs)
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded[0].strip()
print(f"generated_text: {generated_text}")

Now perform Video QA with uniform temporal frame subsampling.

In [None]:
from pytorchvideo.transforms import UniformTemporalSubsample

# Keep only 8 frames, spaced uniformly along the clip's time axis.
num_frames = 8
subsampler = UniformTemporalSubsample(num_frames)
frames = subsampler(clip["video"])

# Swap the first two axes of the subsampled tensor before passing it to the
# processor as `images`, together with the prompt.
sampled_images = frames.permute(1, 0, 2, 3)
inputs = processor(images=sampled_images, text=prompt, return_tensors="pt").to(
    device
)

# Swap the first two axes back and prepend a leading axis of size 1 for the
# model input.
pixel_values = inputs["pixel_values"].permute(1, 0, 2, 3)
inputs["pixel_values"] = pixel_values.unsqueeze(0)

input_sizes = {name: tensor.size() for name, tensor in inputs.items()}
print(f"inputs: {input_sizes}")

# Generate token ids, decode them, and keep the first decoded string.
generated_ids = model.generate(**inputs)
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded[0].strip()
print(f"generated_text: {generated_text}")