# Dataset Handling Using Hugging Face Datasets

Load `fho_main.json`.

In [None]:
import json

# Parse the annotation file into `fho_main`.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so that decoding
# does not depend on the platform's locale default.
with open("../../ego4d/v2/annotations/fho_main.json", encoding="utf-8") as f:
    fho_main = json.load(f)

Create a dataset from all the narrated actions.

In [None]:
from datasets import Dataset

def _iter_action_records(annotations):
    """Yield one flat record per narrated action.

    Walks `videos` -> `annotated_intervals` -> `narrated_actions` and copies
    every action, with the `clip_uid` of its enclosing interval as first key.
    """
    for video in annotations["videos"]:
        for interval in video["annotated_intervals"]:
            for action in interval["narrated_actions"]:
                record = {"clip_uid": interval["clip_uid"]}
                # Same precedence as `{..., **action}`: keys of the action win.
                record.update(action)
                yield record


dataset = Dataset.from_list(list(_iter_action_records(fho_main)))
dataset

Filter out rejected and invalid actions, and keep only the actions whose narration starts with `#C C`.

In [None]:
import re

# Matches narrations that begin with "#C C" (case-insensitive).
C_REGEX = re.compile(r"^\#C C", re.IGNORECASE)


def _keep_action(is_rejected, is_valid_action, narration_text):
    """Decide whether a narrated action stays in the dataset.

    Args:
        is_rejected: Value of the `is_rejected` column for the action.
        is_valid_action: Value of the `is_valid_action` column for the action.
        narration_text: Value of the `narration_text` column; may be None.

    Returns:
        True only if the action is not rejected, is valid, and its narration
        starts with "#C C". Always a real `bool`, which is what
        `Dataset.filter` expects from its predicate, instead of a
        `re.Match`/`None`/raw column value.
    """
    if is_rejected or not is_valid_action:
        return False
    # A missing narration cannot match the prefix; drop it instead of letting
    # `re.match` raise TypeError on None.
    if narration_text is None:
        return False
    return C_REGEX.match(narration_text) is not None


dataset = dataset.filter(
    _keep_action,
    input_columns=["is_rejected", "is_valid_action", "narration_text"],
)
print(f"After filtering actions: {len(dataset)}")

Remove unused columns.

In [None]:
# Names of the columns to drop, listed in the order they are passed on.
columns_to_drop = [
    "warnings",
    "uid",
    "start_sec",
    "end_sec",
    "start_frame",
    "end_frame",
    "is_valid_action",
    "is_partial",
    "clip_start_frame",
    "clip_end_frame",
    "narration_timestamp_sec",
    "clip_narration_timestamp_sec",
    "narration_annotation_uid",
    "structured_verb",
    "freeform_verb",
    "state_transition",
    "critical_frames",
    "clip_critical_frames",
    "frames",
    "is_rejected",
    "is_invalid_annotation",
    "reject_reason",
    "stage",
]

dataset = dataset.remove_columns(columns_to_drop)
dataset

Now, let's tokenize the prompt into the `input_ids` column and `narration_text` into the `labels` column.

In [None]:
from functools import partial

from transformers import Blip2Processor

# Prompt that is tokenized into `input_ids` for every example.
INSTR_PROMPT = "What is the camera wearer doing?"


def batch_tokenize(tokenizer, examples):
    """Tokenize one batch into `input_ids` and `labels`.

    Args:
        tokenizer: Callable applied to a list of strings with
            `return_attention_mask=False`; the `input_ids` attribute of its
            result is used.
        examples: Batch mapping that holds a `narration_text` list.

    Returns:
        Dict with `input_ids` (the tokenized `INSTR_PROMPT`, repeated once per
        example in the batch) and `labels` (the tokenized narrations).
    """
    narrations = examples["narration_text"]
    # One copy of the prompt per narration so both outputs have equal length.
    prompts = [INSTR_PROMPT] * len(narrations)
    prompt_ids = tokenizer(prompts, return_attention_mask=False).input_ids
    label_ids = tokenizer(narrations, return_attention_mask=False).input_ids
    return {"input_ids": prompt_ids, "labels": label_ids}


# Load the processor for the pretrained checkpoint; only its tokenizer is
# used in this cell.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")

# Bind the tokenizer as the first argument so that `map` only has to supply
# the batch of examples.
tokenize_batch = partial(batch_tokenize, processor.tokenizer)

# Tokenize in batches and drop the raw text column from the result.
dataset = dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns="narration_text",
)

# Show the resulting schema and one sample row.
print(dataset)
print(dataset[10])