# `fho_main.json` Data Exploration

In [None]:
# required imports
import json
import math
import re
import string

import matplotlib.pyplot as plt

%matplotlib inline


First load `fho_main.json`.

In [None]:
# Parse the FHO annotation file once; the cells below all iterate over
# fho_main["videos"].
# NOTE(review): this path climbs three directories, while the
# fho_hands_test_unannotated.json load further down climbs only two --
# confirm both resolve from the notebook's working directory.
with open("../../../ego4d/v2/annotations/fho_main.json") as annotation_file:
    fho_main = json.load(annotation_file)


Let's verify whether `is_rejected` is correlated with `is_valid_action`.

In [None]:
# Report every narrated action whose `is_valid_action` and `is_rejected`
# flags hold the same value.
for video in fho_main["videos"]:
    for interval in video["annotated_intervals"]:
        for action in interval["narrated_actions"]:
            if action["is_valid_action"] != action["is_rejected"]:
                # The flags disagree (valid and not rejected, or not valid
                # and rejected): nothing to report.
                continue
            # The flags are equal, so the action is either 1. valid yet
            # rejected, or 2. not valid yet not rejected.
            print(
                f'is_valid_action: {action["is_valid_action"]}',
                f'is_rejected: {action["is_rejected"]}',
                f'uid: {action["uid"]}',
                f'narration_text: {action["narration_text"]}',
                f'video_uid: {video["video_uid"]}',
                f'clip_uid: {interval["clip_uid"]}',
                "========================================================",
                sep="\n",
            )


How about actions that are not rejected, prefixed by `#C` but not valid?

In [None]:
# Report actions that were not rejected and whose narration starts with "#C",
# yet are flagged as not being a valid action.
for video in fho_main["videos"]:
    for interval in video["annotated_intervals"]:
        for action in interval["narrated_actions"]:
            # Guard clauses are checked in the same order as the three
            # conditions they replace.
            if action["is_rejected"]:
                continue
            if not action["narration_text"].startswith("#C"):
                continue
            if action["is_valid_action"]:
                continue
            print(
                f'is_valid_action: {action["is_valid_action"]}',
                f'is_rejected: {action["is_rejected"]}',
                f'uid: {action["uid"]}',
                f'narration_text: {action["narration_text"]}',
                f'video_uid: {video["video_uid"]}',
                f'clip_uid: {interval["clip_uid"]}',
                "========================================================",
                sep="\n",
            )


Is `(start_sec, end_sec)` the same as `(clip_start_sec, clip_end_sec)`?

In [None]:
# Report every action whose (start_sec, end_sec) pair differs from its
# (clip_start_sec, clip_end_sec) pair beyond a relative tolerance of 1e-5.
for video in fho_main["videos"]:
    for interval in video["annotated_intervals"]:
        for action in interval["narrated_actions"]:
            # `and` short-circuits, so the end times are only compared when
            # the start times already agree.
            timestamps_agree = math.isclose(
                action["start_sec"], action["clip_start_sec"], rel_tol=1e-5
            ) and math.isclose(action["end_sec"], action["clip_end_sec"], rel_tol=1e-5)
            if timestamps_agree:
                continue
            print(
                f'start_sec: {action["start_sec"]}',
                f'clip_start_sec: {action["clip_start_sec"]}',
                f'end_sec: {action["end_sec"]}',
                f'clip_end_sec: {action["clip_end_sec"]}',
                f'uid: {action["uid"]}',
                f'narration_text: {action["narration_text"]}',
                f'video_uid: {video["video_uid"]}',
                f'clip_uid: {interval["clip_uid"]}',
                "========================================================",
                sep="\n",
            )


`(start_sec, end_sec)` and `(clip_start_sec, clip_end_sec)` are not the same. The former denotes the times from the full video, while the latter denotes the times from clips.

How long are action clips?

In [None]:
# Actions whose clip-relative duration is below this many seconds are printed.
SHORT_ACTION_SEC = 3

# Gather clip_end_sec - clip_start_sec for every narrated action.
action_times = []
for video in fho_main["videos"]:
    for interval in video["annotated_intervals"]:
        for action in interval["narrated_actions"]:
            clip_duration = action["clip_end_sec"] - action["clip_start_sec"]
            if clip_duration < SHORT_ACTION_SEC:
                print(
                    f'start_frame: {action["start_frame"]}',
                    f'end_frame: {action["end_frame"]}',
                    f'uid: {action["uid"]}',
                    f'narration_text: {action["narration_text"]}',
                    f'video_uid: {video["video_uid"]}',
                    f'clip_uid: {interval["clip_uid"]}',
                    "========================================================",
                    sep="\n",
                )
            action_times.append(clip_duration)

# Histogram of the durations gathered in `action_times`.
n, bins, patches = plt.hist(action_times)

# Write each bin's count just above its bar, centred between the bin edges.
for count, left_edge, right_edge in zip(n, bins[:-1], bins[1:]):
    plt.annotate(
        f"{count:.0f}",
        xy=((left_edge + right_edge) / 2, count),
        xytext=(0, 5),
        textcoords="offset points",
        ha="center",
        va="bottom",
    )

# Axis labels and title
plt.xlabel("Seconds")
plt.ylabel("Number of Actions")
plt.title("Action Duration")

# Render the figure
plt.show()


How long are actual actions?

In [None]:
import csv
from fractions import Fraction

# Duration of each action measured between its `pre_frame` and `post_frame`
# critical frames. Actions with no critical frames are listed in
# no_critical_frames.csv and recorded as -1, so they still appear in the
# histogram plotted from `action_times`.
action_times = []

# The `with` block closes the CSV file even if a row raises part-way through.
with open("no_critical_frames.csv", "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(
        ["video_uid", "action_index", "start_sec", "end_sec", "narration_text"]
    )
    for video in fho_main["videos"]:
        for interval in video["annotated_intervals"]:
            for i, action in enumerate(interval["narrated_actions"]):
                critical_frames = action["critical_frames"]
                if critical_frames is None:
                    action_times.append(-1)
                    csv_writer.writerow(
                        [
                            video["video_uid"],
                            i,
                            action["start_sec"],
                            action["end_sec"],
                            action["narration_text"],
                        ]
                    )
                    continue
                # NOTE(review): int() truncates a fractional fps (e.g. 29.97
                # becomes 29) -- confirm the metadata always holds whole
                # frame rates.
                fps = int(video["video_metadata"]["fps"])
                # Fraction keeps the frames / fps division exact until the
                # final conversion to float.
                elapsed_frames = (
                    critical_frames["post_frame"] - critical_frames["pre_frame"]
                )
                action_times.append(float(Fraction(elapsed_frames, fps)))

# Histogram of the durations gathered in `action_times`.
n, bins, patches = plt.hist(action_times)

# Label every bar with its count, placed at the midpoint of the bin.
for bar_count, bin_start, bin_end in zip(n, bins[:-1], bins[1:]):
    bin_centre = (bin_start + bin_end) / 2
    plt.annotate(
        f"{bar_count:.0f}",
        xy=(bin_centre, bar_count),
        xytext=(0, 5),
        textcoords="offset points",
        ha="center",
        va="bottom",
    )

# Axis labels and title
plt.xlabel("Seconds")
plt.ylabel("Number of Actions")
plt.title("Action Duration")

# Render the figure
plt.show()


Do all `narration_text`s end with a punctuation mark?

In [None]:
from eilev.data.utils import clean_narration_text

# One counter per character in string.punctuation, plus two extra buckets:
# "OTHERS" for any other final character and "EMPTY" for narrations that are
# empty after cleaning.
punc_counts = dict.fromkeys(string.punctuation, 0)
punc_counts["OTHERS"] = 0
punc_counts["EMPTY"] = 0

for video in fho_main["videos"]:
    for interval in video["annotated_intervals"]:
        for action in interval["narrated_actions"]:
            cleaned = clean_narration_text(action["narration_text"])
            # Choose the bucket for this narration, then bump it once.
            if len(cleaned) == 0:
                bucket = "EMPTY"
            elif cleaned[-1] in punc_counts:
                bucket = cleaned[-1]
            else:
                bucket = "OTHERS"
            punc_counts[bucket] += 1

# Keep only the buckets that were hit at least once.
filtered_counts = {
    ending: count for ending, count in punc_counts.items() if count > 0
}

# One bar per remaining bucket
bars = plt.bar(filtered_counts.keys(), filtered_counts.values())

# Axis labels and title
plt.xlabel("Punctuation")
plt.ylabel("Frequency")
plt.title("Frequency of Sentence Endings with Punctuation")

# Write each count just above the horizontal centre of its bar.
for bar in bars:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    plt.annotate(
        f"{bar_height:.0f}",
        xy=(bar_centre, bar_height),
        xytext=(0, 3),
        textcoords="offset points",
        ha="center",
        va="bottom",
    )

# Render the figure
plt.show()


Any `#summary`s?

In [None]:
# Case-insensitive match for the "#summary" tag anywhere in a narration.
SUMMARY_REGEX = re.compile(r"\#summary", re.IGNORECASE)

# Print every narrated action whose text contains the tag.
for video in fho_main["videos"]:
    for interval in video["annotated_intervals"]:
        for action in interval["narrated_actions"]:
            if SUMMARY_REGEX.search(action["narration_text"]) is None:
                continue
            print(
                f'uid: {action["uid"]}',
                f'narration_text: {action["narration_text"]}',
                f'video_uid: {video["video_uid"]}',
                f'clip_uid: {interval["clip_uid"]}',
                "========================================================",
                sep="\n",
            )


Is `#unsure` always at the end?

In [None]:
# Case-insensitive patterns: "#unsure" anywhere, and "#unsure" as the last
# thing in the text.
UNSURE_REGEX = re.compile(r"\#unsure", re.IGNORECASE)
ENDS_WITH_UNSURE_REGEX = re.compile(r"\#unsure$", re.IGNORECASE)

# Print narrations that contain "#unsure" but, after stripping surrounding
# whitespace, do not end with it.
for video in fho_main["videos"]:
    for interval in video["annotated_intervals"]:
        for action in interval["narrated_actions"]:
            if UNSURE_REGEX.search(action["narration_text"]) is None:
                continue
            if ENDS_WITH_UNSURE_REGEX.search(action["narration_text"].strip()):
                continue
            # The literal "<|eos|>" suffix marks where the raw text ends,
            # presumably to make trailing whitespace visible.
            print(
                f'uid: {action["uid"]}',
                f'narration_text: {action["narration_text"]}<|eos|>',
                f'video_uid: {video["video_uid"]}',
                f'clip_uid: {interval["clip_uid"]}',
                "========================================================",
                sep="\n",
            )


What's the relationship between `fho_hands_test_unannotated.json` and `fho_main.json`?

In [None]:
# Load the unannotated hands test split.
with open("../../ego4d/v2/annotations/fho_hands_test_unannotated.json") as test_file:
    fho_hands_test_unannotated = json.load(test_file)

# Distinct video uids referenced by each annotation file
video_uids_test_unannotated = {
    clip["video_uid"] for clip in fho_hands_test_unannotated["clips"]
}
video_uids_main = {video["video_uid"] for video in fho_main["videos"]}

# Size of each set and of their overlap
shared_video_uids = video_uids_test_unannotated & video_uids_main
print(f"len(video_uids_test_unannotated) = {len(video_uids_test_unannotated)}")
print(f"len(video_uids_main) = {len(video_uids_main)}")
print(
    "len(video_uids_test_unannotated.intersection(video_uids_main)) = "
    f"{len(shared_video_uids)}"
)

# Compare the two files at frame level using (video_uid, pre_45 frame) pairs.
# Some frames in test unannotated don't have pre frames, so use pre_45.
pre_45_test_unannotated = {
    (clip["video_uid"], frame["pre_45"]["frame"])
    for clip in fho_hands_test_unannotated["clips"]
    for frame in clip["frames"]
}
# Actions without critical frames have no pre_45 value and are skipped.
pre_45_main = {
    (video["video_uid"], action["critical_frames"]["pre_45"])
    for video in fho_main["videos"]
    for interval in video["annotated_intervals"]
    for action in interval["narrated_actions"]
    if action["critical_frames"] is not None
}

# The labels name the pre_45 sets that are actually being measured.
print(f"len(pre_45_test_unannotated) = {len(pre_45_test_unannotated)}")
print(f"len(pre_45_main) = {len(pre_45_main)}")
print(
    "len(pre_45_test_unannotated.intersection(pre_45_main)) = "
    f"{len(pre_45_test_unannotated.intersection(pre_45_main))}"
)


List valid, non-rejected actions whose narration starts with `#C C` but that have no critical frames.

In [None]:
# Case-insensitive match for narrations that begin with "#C", whitespace,
# then "C".
C_REGEX = re.compile(r"^\#C\s+C", re.IGNORECASE)

# Print the video uid, (start_sec, end_sec) and narration of every action
# that is valid, not rejected and matches C_REGEX, yet has no critical frames.
for video in fho_main["videos"]:
    for interval in video["annotated_intervals"]:
        for action in interval["narrated_actions"]:
            if (
                action["critical_frames"] is None
                and not action["is_rejected"]
                and action["is_valid_action"]
                and C_REGEX.match(action["narration_text"]) is not None
            ):
                # Each bound is formatted separately; formatting the pair as
                # one tuple inside literal parentheses printed
                # "((start, end))".
                print(
                    f'{video["video_uid"]}: '
                    f'({action["start_sec"]}, {action["end_sec"]})'
                    f', {action["narration_text"]}'
                )
