# Held-out Verb and Noun Analyses

## Bar chart for actions (structured verb, structured noun)

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline
import json
from collections import Counter, defaultdict


def truncate_label(label, max_length=10):
    """Return ``label`` shortened for display.

    Labels of at most ``max_length`` characters are returned unchanged.
    Longer labels are cut to their first ``max_length`` characters and get
    "..." appended, so the result can be up to ``max_length + 3`` long.
    """
    if len(label) > max_length:
        return label[:max_length] + "..."
    return label


def read_narrations(fname):
    """Load a JSON Lines file and group its records by action.

    Each non-blank line of ``fname`` is parsed as one JSON object that must
    contain the keys ``"structured_verb"`` and ``"structured_noun"``.

    Args:
        fname: Path of the JSON Lines file to read.

    Returns:
        A ``defaultdict(list)`` mapping ``(structured_verb, structured_noun)``
        to the list of parsed records for that pair, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the two grouping keys.
    """
    narrations = defaultdict(list)
    # Open explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(fname, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no record;
            # json.loads would raise on them.
            if not line.strip():
                continue
            data = json.loads(line)
            narrations[(data["structured_verb"], data["structured_noun"])].append(data)
    return narrations


# Grouped narrations for the EILEV BLIP-2 OPT-2.7B result files, one list
# entry per "<N>-shot" file, in the order N = 0, 1, 2, 4, 8, 12, 16.
narrations_by_action_eilev_blip2_opt_27b = [
    read_narrations(
        "../eilev-blip2-opt-2.7b-ego4d-held-out-verb-noun-0.8-"
        f"{num_shots}-shot-with-in-context.jsonl"
    )
    for num_shots in (0, 1, 2, 4, 8, 12, 16)
]

# Grouped narrations for the EILEV BLIP-2 Flan-T5-xl result files, one list
# entry per "<N>-shot" file, in the order N = 0, 1, 2, 4, 8, 12, 16.
narrations_by_action_eilev_blip2_flan_t5_xl = [
    read_narrations(
        "../eilev-blip2-flan-t5-xl-ego4d-held-out-verb-noun-0.8-"
        f"{num_shots}-shot-with-in-context.jsonl"
    )
    for num_shots in (0, 1, 2, 4, 8, 12, 16)
]

# Number of narrations per (structured verb, structured noun) action, taken
# from the first OPT-2.7B result file (the "0-shot" one).
struct_verb_noun_counter = Counter(
    {
        action: len(narrations)
        for action, narrations in narrations_by_action_eilev_blip2_opt_27b[0].items()
    }
)

# Preview the six most frequent actions.
for pair, count in struct_verb_noun_counter.most_common(6):
    print(f"{pair}: {count}")

# most_common() yields (action, count) pairs sorted by descending count.
labels, values = zip(*struct_verb_noun_counter.most_common())
# Each of verb and noun gets its own replacement field; putting both in a
# single field would format a tuple and render as "(('verb', 'noun'))".
truncated_labels = [
    f"({truncate_label(verb)}, {truncate_label(noun)})" for verb, noun in labels
]

plt.figure(figsize=(20, 7))
# Place bars by index rather than by label string: with string x values,
# labels that become identical after truncation would share one x position,
# so their bars would overlap and no longer line up with the index-based
# ticks and limits set below.
bars = plt.bar(range(len(labels)), values)

plt.xlabel("(verb, noun)")
plt.ylabel("Count")
plt.title("(verb, noun) Count")

# Sparse labeling: Show every nth label
n = 200  # adjust this based on your data and preferences
sparse_labels = [
    label if i % n == 0 else "" for i, label in enumerate(truncated_labels)
]
plt.xticks(range(len(labels)), sparse_labels, rotation=45, ha="right", fontsize=10)

# Adjust x-axis limits to remove margins
plt.xlim(-0.5, len(labels) - 0.5)

# Display the count on top of every nth bar (the same bars that are labeled)
for i, bar in enumerate(bars):
    if i % n == 0:
        yval = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            yval + 0.5,
            str(yval),
            ha="center",
            va="bottom",
            fontsize=9,
        )

# Display the plot
plt.tight_layout()  # Adjust layout for better visibility
plt.show()


## Graphs for actions with at least 12 in-context examples

In [None]:
def draw_graphs(actions, metrics, fig_name, shots, legend_ncol):
    """Plot one line per action for each metric, then save and show the figure.

    The figure has one subplot per entry of ``metrics``, stacked vertically
    and sharing the x axis.

    Args:
        actions: Mapping from ``(verb, noun)`` to a dict holding, for every
            metric name, a sequence of y values, plus a ``"meta"`` dict with
            ``"shots"`` (x values) and ``"linestyle"``.
        metrics: Names of the metrics to plot; each is used as a key into
            the per-action dicts and as the subplot's y-axis label.
        fig_name: Output path passed to ``plt.savefig``.
        shots: X tick positions; their min and max define the x limits.
        legend_ncol: Number of columns of the shared figure legend.
    """
    # squeeze=False always returns a 2-D array of axes, so indexing also
    # works when only a single metric is requested.
    fig, axes = plt.subplots(
        len(metrics), 1, figsize=(24, 8 * len(metrics)), sharex=True, squeeze=False
    )

    legend_handles = []
    for row, metric in enumerate(metrics):
        ax = axes[row, 0]
        ax.tick_params(axis="both", which="major", labelsize=28)
        for (verb, noun), data in actions.items():
            action_shots = data["meta"]["shots"]
            (line,) = ax.plot(
                action_shots,
                # Only as many y values as there are x values for this action.
                data[metric][: len(action_shots)],
                label=f"({verb}, {noun})",
                linestyle=data["meta"]["linestyle"],
                linewidth=8,
                marker="D",
                markersize=16,
            )
            # One legend entry per action: keep the handles of the first
            # subplot only, so handles and labels have the same length.
            if row == 0:
                legend_handles.append(line)
        ax.set_ylabel(metric, fontsize=32, fontweight="bold")
        ax.set_xticks(shots)
        ax.set_xlim(min(shots), max(shots))
        ax.grid(True)

    # Single legend for the whole figure, anchored above the top edge.
    fig.legend(
        legend_handles,
        [f"({verb}, {noun})" for verb, noun in actions],
        loc="lower center",
        bbox_to_anchor=(0.5, 1),
        fontsize=32,
        ncol=legend_ncol,
        handlelength=3,
    )
    # Shared x-axis caption, anchored at the bottom edge of the figure.
    fig.text(0.5, 0.0, "Shots", ha="center", va="top", fontsize=32, fontweight="bold")

    # Adjust layout to prevent overlap
    plt.tight_layout()
    # bbox_inches="tight" ensures that all the visible content
    # is saved into the pdf file.
    plt.savefig(fig_name, bbox_inches="tight")
    plt.show()


## Top 5 "Common" Rare Actions

### EILEV-BLIP-2-OPT-2.7B

In [None]:
import numpy as np


def get_action_scores(action_names, narrations_by_action, shot_counts=None):
    """Compute mean STS scores per action for each narration file.

    Args:
        action_names: Iterable of entries whose first element is the action
            key (e.g. the ``((verb, noun), count)`` pairs returned by
            ``Counter.most_common``).
        narrations_by_action: Sequence of mappings from action key to a list
            of narration records. Each record must provide
            ``"sts_cross_encoder_score"`` and ``"sts_bi_encoder_cos_sim"``
            values convertible to ``float``.
        shot_counts: Value stored under ``meta["shots"]`` for every action.
            When omitted, the module-level ``shots`` variable is used, which
            must then be defined before this function is called.

    Returns:
        A dict mapping each action key to
        ``{"STS-CE": [...], "STS-BE": [...], "meta": {...}}``, where each
        list holds one mean per mapping in ``narrations_by_action`` (NaN
        when that mapping has no narrations for the action).
    """

    def _mean_per_file(action, score_key):
        # Mean of ``score_key`` over the action's narrations, per mapping.
        means = []
        for narrations in narrations_by_action:
            # .get() rather than indexing: the inputs may be defaultdicts,
            # and indexing a missing action would insert an empty list
            # into the caller's data.
            scores = [
                float(narration[score_key])
                for narration in narrations.get(action, ())
            ]
            # Explicit NaN avoids numpy's "Mean of empty slice" warning.
            means.append(np.mean(scores) if scores else np.nan)
        return means

    if shot_counts is None:
        shot_counts = shots  # module-level variable

    return {
        action_name[0]: {
            "STS-CE": _mean_per_file(action_name[0], "sts_cross_encoder_score"),
            "STS-BE": _mean_per_file(action_name[0], "sts_bi_encoder_cos_sim"),
            "meta": {"shots": shot_counts, "linestyle": "-"},
        }
        for action_name in action_names
    }


# Shot counts, in the same order as the per-shot result files were loaded.
shots = [0, 1, 2, 4, 8, 12, 16]
# The five most frequent actions as ((verb, noun), count) pairs.
action_names = struct_verb_noun_counter.most_common(5)
metrics = ["STS-CE", "STS-BE"]
actions = get_action_scores(
    action_names,
    narrations_by_action_eilev_blip2_opt_27b,
)
draw_graphs(
    actions=actions,
    metrics=metrics,
    fig_name="top-5-common-rare-actions-eilev-blip2-opt-2.7b.pdf",
    shots=shots,
    legend_ncol=1,
)


### EILEV-BLIP-2-Flan-T5-xl

In [None]:
# Same actions and metrics as the preceding cell, scored on the
# Flan-T5-xl results.
actions = get_action_scores(
    action_names,
    narrations_by_action_eilev_blip2_flan_t5_xl,
)
draw_graphs(
    actions=actions,
    metrics=metrics,
    fig_name="top-5-common-rare-actions-eilev-blip2-flan-t5-xl.pdf",
    shots=shots,
    legend_ncol=1,
)


## Bottom 5 "Common" Rare Actions

### EILEV-BLIP-2-OPT-2.7B

In [None]:
# The five least frequent actions, as ((verb, noun), count) pairs, after
# dropping every action whose verb is the "[other]" placeholder.
action_names = [
    entry
    for entry in struct_verb_noun_counter.most_common()
    if entry[0][0] != "[other]"
][-5:]
metrics = ["STS-CE", "STS-BE"]
actions = get_action_scores(
    action_names,
    narrations_by_action_eilev_blip2_opt_27b,
)
draw_graphs(
    actions=actions,
    metrics=metrics,
    fig_name="bottom-5-common-rare-actions-eilev-blip2-opt-2.7b.pdf",
    shots=shots,
    legend_ncol=1,
)


### EILEV-BLIP-2-Flan-T5-xl

In [None]:
# Same actions and metrics as the preceding cell, scored on the
# Flan-T5-xl results.
actions = get_action_scores(
    action_names,
    narrations_by_action_eilev_blip2_flan_t5_xl,
)
draw_graphs(
    actions=actions,
    metrics=metrics,
    fig_name="bottom-5-common-rare-actions-eilev-blip2-flan-t5-xl.pdf",
    shots=shots,
    legend_ncol=1,
)
