# Evaluate EgoVideoBLIP2 on MILLY step detection data

In [None]:
import glob
import json
import os

import imageio.v3 as iio
import numpy as np
from IPython.display import HTML, display
from pytorchvideo.data.video import VideoPathHandler


def display_gif(video_tensor, gif_file_name):
    """Prepares and displays a GIF from a video tensor.

    The video tensor is expected to have the following shape:
    (num_frames, num_channels, height, width).

    Args:
        video_tensor: torch tensor holding the frames in the layout above.
            It may live on any device and may be part of an autograd graph;
            it is detached and copied to host memory before conversion.
        gif_file_name: path the GIF is written to. The same string is used
            as the ``src`` of the ``<img>`` tag that is displayed.
    """
    # `.numpy()` only works for CPU tensors that do not require grad, so
    # detach and move to the CPU first (both are no-ops for plain CPU tensors).
    # The permute reorders to (num_frames, height, width, num_channels).
    frames = video_tensor.detach().cpu().permute(0, 2, 3, 1).numpy()
    iio.imwrite(
        gif_file_name,
        frames.astype(np.uint8),
        extension=".gif",
        # infinite loop
        loop=0,
    )
    html = f'<img src="{gif_file_name}" />'
    display(HTML(html))


def load_milly_video_steps(video_dir_path):
    """Load the video, step names and raw annotation of one MILLY recording.

    The directory must contain exactly one ``*.json`` annotation file and a
    video file named ``pv.mp4``.

    Args:
        video_dir_path: path of the recording directory.

    Returns:
        A tuple ``(video, step_list, annotation)`` where ``video`` is the
        object returned by ``VideoPathHandler.video_from_path``,
        ``step_list`` holds the non-empty step names ordered by their integer
        option key, and ``annotation`` is the parsed annotation JSON.

    Raises:
        ValueError: if the directory does not contain exactly one JSON file.
        KeyError: if the non-empty option keys are not exactly
            ``0 .. len(options) - 1``.
    """
    json_files = glob.glob(os.path.join(video_dir_path, "*.json"))
    # Explicit check instead of `assert`: assertions are stripped under `-O`,
    # which would turn a missing/ambiguous annotation into an IndexError or a
    # silently arbitrary choice of file.
    if len(json_files) != 1:
        raise ValueError(
            f"expected exactly one JSON annotation file in {video_dir_path!r}, "
            f"found {len(json_files)}"
        )
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(json_files[0], encoding="utf-8") as f:
        annotation = json.load(f)

    video_path_handler = VideoPathHandler()
    video = video_path_handler.video_from_path(os.path.join(video_dir_path, "pv.mp4"))

    # Keep only the options that have a name, keyed by their integer id.
    step_dict = {
        int(k): v for k, v in annotation["attribute"]["1"]["options"].items() if v != ""
    }
    # Index by position so that step_list[i] is the step with option id i.
    step_list = [step_dict[i] for i in range(len(step_dict))]
    return video, step_list, annotation

Load an arbitrary coffee video from the MILLY step detection data.

In [None]:
# Recording used throughout the notebook: one coffee-making video.
milly_video_dir = (
    "../../MILLYCookbook_media_v007/z/dat/CookBook/"
    "MILLYCookbook_media_v007/B_coffee/HL2/10/video-0002"
)
video, step_list, annotation = load_milly_video_steps(milly_video_dir)

# Show the step names exactly as they appear in the annotation file.
print(step_list)

# The annotated names are replaced by hand-written descriptions. It is not
# clear what the best format would be, so these follow the phrasing that is
# closest to training. An automatic alternative that was considered:
# step_list = [re.match(r'(\d+: )?(.+)', step).group(2) for step in step_list]
step_list = [
    "the camera wearer starts",
    "the camera wearer boils water",
    "the camera wearer places dripper",
    "the camera wearer prepares filter and places filter",
    "the camera wearer measures beans; grinds beans; places grounds",
    "the camera wearer checks water",
    "the camera wearer blooms coffee",
    "the camera wearer pours water",
    "the camera wearer drains coffee and discards grounds",
    "the camera wearer ends",
]

# Show the descriptions that will actually be scored.
print(step_list)

Load `ego-video-blip2-opt-2.7b-subsample-8`.

In [None]:
import torch
from transformers import Blip2Processor

from video_blip.model import VideoBlip2ForConditionalGeneration

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Local checkpoint directory shared by the processor and the model.
pretrained = "../../checkpoints/ego-video-blip2/ego-video-blip2-opt-2.7b-subsample-8"
processor = Blip2Processor.from_pretrained(pretrained)
model = VideoBlip2ForConditionalGeneration.from_pretrained(pretrained)
model = model.to(device)

Perform Video QA as a sanity check.

In [None]:
prompt = "Question: what is the camera wearer doing? Answer:"

# Generate a free-form answer for every annotated clip and print it.
for clip_id, annotated_clip in annotation["metadata"].items():
    start, end = annotated_clip["z"]
    # keep every 30th frame (1 FPS if the source is 30 FPS);
    # layout is (channel, time, height, width)
    frames = video.get_clip(start, end)["video"][:, ::30, ...]
    # time-major view (time, channel, height, width), shared by the GIF
    # preview and the processor
    frames_time_major = frames.permute(1, 0, 2, 3)
    display_gif(frames_time_major, f"{clip_id}.gif")

    inputs = processor(images=frames_time_major, text=prompt, return_tensors="pt")
    inputs = inputs.to(device)
    # swap the first two axes of the processed frames back and add a
    # leading batch dimension
    pixel_values = inputs["pixel_values"].permute(1, 0, 2, 3)
    inputs["pixel_values"] = pixel_values.unsqueeze(0)
    input_sizes = {k: v.size() for k, v in inputs.items()}
    print(f"inputs: {input_sizes}")

    with torch.no_grad():
        generated_ids = model.generate(**inputs)
    decoded_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    generated_text = decoded_texts[0].strip()
    print(f"generated_text: {generated_text}")

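Run evaluation: for every annotated clip, score each candidate step description by its length-normalized log-probability under the model and predict the highest-scoring step.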

In [None]:
import torch.nn.functional as F

from video_blip.data.utils import (
    DataCollatorForVideoSeq2Seq,
    generate_input_ids_and_labels,
)

# Step detection as scoring: for every annotated clip, each candidate step
# description in `step_list` is scored by the model, and the description with
# the highest length-normalized log-probability is the predicted step.

collator = DataCollatorForVideoSeq2Seq(processor.tokenizer)
# One tokenized (prompt, candidate answer) pair per step description. These
# are built once and reused for every clip; only "pixel_values" is replaced.
# NOTE(review): this prompt starts with "What" while the sanity-check cell
# uses "what" - confirm which casing matches training.
input_list = [
    generate_input_ids_and_labels(
        processor.tokenizer,
        "Question: What is the camera wearer doing? Answer:",
        text,
        model.config.use_decoder_only_language_model,
    )
    for text in step_list
]

for clip_id, annotated_clip in annotation["metadata"].items():
    start, end = annotated_clip["z"]
    # sample a frame every 30 frames, i.e., 1 FPS (assuming a 30 FPS source)
    # (channel, time, height, width)
    clip = video.get_clip(start, end)["video"][:, ::30, ...]
    display_gif(clip.permute(1, 0, 2, 3), f"{clip_id}.gif")
    # process the clip: the image processor receives the frames time-major,
    # and its output has the first two axes swapped back afterwards
    clip = processor.image_processor(
        clip.permute(1, 0, 2, 3), return_tensors="pt"
    ).pixel_values.permute(1, 0, 2, 3)
    # attach the same clip to every candidate so that one batch scores all
    # step descriptions against this clip (mutates the shared dicts in place)
    for item in input_list:
        item["pixel_values"] = clip
    inputs = collator(input_list)
    # the return value is discarded, so this relies on `.to` moving the batch
    # in place - NOTE(review): confirm for the type returned by the collator
    inputs.to(device)

    # calculate lengths of generated texts: number of label positions per
    # candidate that are not -100. This runs BEFORE the EOS masking below, so
    # EOS label positions are still counted here.
    # NOTE(review): confirm that counting EOS in the length is intended.
    gen_lengths = torch.sum(inputs.labels != -100, dim=-1)

    # ignore eos token when calculating log probs
    inputs["labels"][inputs["labels"] == processor.tokenizer.eos_token_id] = -100

    with torch.no_grad():
        output = model(**inputs)
    # Per-token negative log-likelihood; with reduction="none" the positions
    # labelled -100 (cross_entropy's default ignore_index) contribute 0.
    # NOTE(review): logits and labels are compared position by position with
    # no shift - this assumes the model returns logits already aligned with
    # the labels; confirm for decoder-only language models.
    log_probs = F.cross_entropy(
        output.logits.flatten(end_dim=1),
        inputs.labels.flatten(end_dim=1),
        reduction="none",
    )
    # Sum the per-token values for each candidate, negate to obtain a
    # log-probability, and divide by the candidate's length.
    normalized_log_probs = -log_probs.view(len(step_list), -1).sum(dim=-1) / gen_lengths
    # the annotated step id is used directly as an index into step_list
    ground_truth_step = int(annotated_clip["av"]["1"])
    print(f"Ground-truth step: {ground_truth_step} - {step_list[ground_truth_step]}")
    predicted_step = normalized_log_probs.argmax().item()
    print(f"Predicted step: {predicted_step} - {step_list[predicted_step]}")
    # with the current format for the generated text,
    # this doesn't seem to matter as much.
    # predicted_step_wout_start_end = normalized_log_probs[1:-2].argmax().item() + 1
    # print(f'Predicted step w/out start, end: {predicted_step_wout_start_end}'
    # f' - {step_list[predicted_step_wout_start_end]}')
    # print the score of every candidate for inspection
    for i, log_prob in enumerate(normalized_log_probs.tolist()):
        print(f"{log_prob:.2f}: {step_list[i]}")