# Held-out Verb and Noun Analyses

## Bar chart for actions (structured verb, structured noun)

In [None]:
import json
from collections import Counter, defaultdict

import matplotlib.pyplot as plt

%matplotlib inline


def truncate_label(label, max_length=10):
    """Return ``label`` cut to ``max_length`` characters, with "..." appended if cut.

    Labels that already fit within ``max_length`` are returned unchanged.
    """
    if len(label) > max_length:
        return label[:max_length] + "..."
    return label


def read_narrations(fname):
    """Load a JSON Lines file of narrations and group the records by action.

    Args:
        fname: Path to a JSONL file. Every non-blank line must be a JSON
            object containing "structured_verb" and "structured_noun" keys.

    Returns:
        A ``defaultdict(list)`` mapping ``(structured_verb, structured_noun)``
        to the list of decoded records for that pair, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "structured_verb" or "structured_noun".
    """
    narrations = defaultdict(list)
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(fname, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. an extra newline at the end of the
            # file) instead of failing inside json.loads.
            if not line.strip():
                continue
            data = json.loads(line)
            narrations[(data["structured_verb"], data["structured_noun"])].append(data)
    return narrations


# Narrations grouped by action for the eilev-blip2-opt-2.7b result files,
# one entry per shot count in the order 0, 4, 8, 12, 16.
narrations_by_action_eilev_blip2_opt_27b = [
    read_narrations(
        "../eilev-blip2-opt-2.7b-ego4d-held-out-verb-noun-0.8-"
        f"{num_shots}-shot-with-in-context.jsonl"
    )
    for num_shots in (0, 4, 8, 12, 16)
]

# Narrations grouped by action for the eilev-blip2-flan-t5-xl result files,
# one entry per shot count in the order 0, 4, 8, 12, 16.
narrations_by_action_eilev_blip2_flan_t5_xl = [
    read_narrations(
        "../eilev-blip2-flan-t5-xl-ego4d-held-out-verb-noun-0.8-"
        f"{num_shots}-shot-with-in-context.jsonl"
    )
    for num_shots in (0, 4, 8, 12, 16)
]

# Number of narrations per (structured verb, structured noun) action, taken
# from the first (0-shot) entry of the OPT-2.7B narration list.
struct_verb_noun_counter = Counter(
    {
        action: len(narrations)
        for action, narrations in narrations_by_action_eilev_blip2_opt_27b[0].items()
    }
)

# Preview the six most frequent actions.
for pair, count in struct_verb_noun_counter.most_common(6):
    print(f"{pair}: {count}")

labels, values = zip(*struct_verb_noun_counter.most_common())
# Format each label as "(verb, noun)". Each part needs its own replacement
# field; a single field holding both would print the repr of a tuple.
truncated_labels = [
    f"({truncate_label(verb)}, {truncate_label(noun)})" for verb, noun in labels
]

plt.figure(figsize=(20, 7))
# Place the bars at integer positions rather than at the label strings:
# matplotlib merges identical category strings into one bar, and truncation
# can make distinct actions share a label. Integer positions also match the
# xticks/xlim calls below.
bar_positions = range(len(labels))
bars = plt.bar(bar_positions, values)

plt.xlabel("(verb, noun)")
plt.ylabel("Count")
plt.title("(verb, noun) Count")

# Sparse labeling: Show every nth label
n = 200  # adjust this based on your data and preferences
sparse_labels = [
    "" if i % n != 0 else label for i, label in enumerate(truncated_labels)
]
plt.xticks(bar_positions, sparse_labels, rotation=45, ha="right", fontsize=10)

# Adjust x-axis limits to remove margins
plt.xlim(-0.5, len(labels) - 0.5)

# Display the count on top of every nth bar (the same bars that get a label)
for i, bar in enumerate(bars):
    if i % n == 0:
        yval = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            yval + 0.5,
            str(yval),
            ha="center",
            va="bottom",
            fontsize=9,
        )

# Display the plot
plt.tight_layout()  # Adjust layout for better visibility
plt.show()

## Graphs for actions with at least 12 in-context examples

In [None]:
def draw_graphs(actions, metrics, shots, legend_ncol, fig_name=None):
    """Plot per-action metric curves, one vertically stacked subplot per metric.

    Args:
        actions: Mapping from ``(verb, noun)`` to a dict that holds, for each
            name in ``metrics``, a sequence of scores, plus a ``"meta"`` dict
            with ``"shots"`` (the x values) and ``"linestyle"``.
        metrics: Metric names; each one gets its own subplot, and the name is
            used as that subplot's y-axis label.
        shots: Values used for the x ticks and the x-axis limits.
        legend_ncol: Number of columns in the shared figure legend.
        fig_name: Optional output path. When given, the figure is saved there
            before being shown.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single metric;
    # otherwise plt.subplots returns a bare Axes and indexing it fails.
    fig, axes = plt.subplots(
        len(metrics),
        1,
        figsize=(24, 8 * len(metrics)),
        sharex=True,
        squeeze=False,
    )

    # Legend handles: one line per action. Every subplot draws the actions in
    # the same order, so the handles of the first subplot are sufficient.
    lines = []
    for row, metric in enumerate(metrics):
        ax = axes[row, 0]
        ax.tick_params(axis="both", which="major", labelsize=28)
        for (verb, noun), data in actions.items():
            action_shots = data["meta"]["shots"]
            (line,) = ax.plot(
                action_shots,
                # Only plot as many scores as there are shot values.
                data[metric][: len(action_shots)],
                label=f"({verb}, {noun})",
                linestyle=data["meta"]["linestyle"],
                linewidth=8,
                marker="D",
                markersize=16,
            )
            if row == 0:
                lines.append(line)
        ax.set_ylabel(metric, fontsize=32, fontweight="bold")
        ax.set_xticks(shots)
        ax.set_xlim(min(shots), max(shots))
        ax.grid(True)

    # Single legend above the figure, shared by all subplots.
    fig.legend(
        lines,
        [f"({verb}, {noun})" for verb, noun in actions],
        loc="lower center",
        bbox_to_anchor=(0.5, 1),
        fontsize=32,
        ncol=legend_ncol,
        handlelength=3,
    )
    # Shared x-axis label below the bottom subplot.
    fig.text(0.5, 0.0, "Shots", ha="center", va="top", fontsize=32, fontweight="bold")

    # Adjust layout to prevent overlap
    fig.tight_layout()
    if fig_name is not None:
        # bbox_inches="tight" ensures that all the visible content
        # is saved into the output file.
        fig.savefig(fig_name, bbox_inches="tight")
    plt.show()

## Top 5 "Common" Rare Actions

### EILEV-BLIP-2-OPT-2.7B

In [None]:
import numpy as np


def get_action_scores(action_names, narrations_by_action):
    """Compute mean STS scores per action across a sequence of result sets.

    Args:
        action_names: Iterable of action keys, as used in the mappings of
            ``narrations_by_action``.
        narrations_by_action: Sequence of mappings from action key to a list
            of narration records. Each record must provide
            "sts_cross_encoder_score" and "sts_bi_encoder_cos_sim" values
            convertible to ``float``.

    Returns:
        A dict mapping each action name to a dict with:
          * "STS-CE": mean cross-encoder score, one value per mapping;
          * "STS-BE": mean bi-encoder cosine similarity, one value per mapping;
          * "meta": ``{"shots": shots, "linestyle": "-"}``, where ``shots`` is
            read from the module-level ``shots`` variable at call time.
        An action with no narrations in a mapping yields ``nan`` there.
    """
    score_fields = {
        "STS-CE": "sts_cross_encoder_score",
        "STS-BE": "sts_bi_encoder_cos_sim",
    }

    def mean_score(narrations, action_name, field):
        # Use .get() rather than indexing: the mappings are defaultdicts, and
        # indexing a missing action would silently insert an empty entry
        # into the caller's data.
        values = [float(narration[field]) for narration in narrations.get(action_name, ())]
        # np.mean([]) would also give nan, but with a RuntimeWarning.
        return np.mean(values) if values else float("nan")

    return {
        action_name: {
            **{
                metric: [
                    mean_score(narrations, action_name, field)
                    for narrations in narrations_by_action
                ]
                for metric, field in score_fields.items()
            },
            "meta": {"shots": shots, "linestyle": "-"},
        }
        for action_name in action_names
    }


# Shot counts, in the same order as the per-shot narration lists.
shots = [0, 4, 8, 12, 16]
metrics = ["STS-CE", "STS-BE"]
# The five actions with the most narrations.
action_names = [action for action, _ in struct_verb_noun_counter.most_common(5)]
actions = get_action_scores(action_names, narrations_by_action_eilev_blip2_opt_27b)
draw_graphs(actions, metrics, shots, legend_ncol=1)

### EILEV-BLIP-2-Flan-T5-xl

In [None]:
# Same actions and metrics as the previous plot, scored on the Flan-T5-xl narrations.
actions = get_action_scores(
    action_names,
    narrations_by_action_eilev_blip2_flan_t5_xl,
)
draw_graphs(actions, metrics, shots, legend_ncol=1)

## Bottom 5 "Common" Rare Actions

### EILEV-BLIP-2-OPT-2.7B

In [None]:
# The five actions with the fewest narrations, ignoring any action whose
# verb is the "[other]" placeholder.
action_names = [
    action
    for action, _ in struct_verb_noun_counter.most_common()
    if action[0] != "[other]"
][-5:]
metrics = ["STS-CE", "STS-BE"]
actions = get_action_scores(action_names, narrations_by_action_eilev_blip2_opt_27b)
draw_graphs(actions, metrics, shots, legend_ncol=1)

### EILEV-BLIP-2-Flan-T5-xl

In [None]:
# Same actions and metrics as the previous plot, scored on the Flan-T5-xl narrations.
actions = get_action_scores(
    action_names,
    narrations_by_action_eilev_blip2_flan_t5_xl,
)
draw_graphs(actions, metrics, shots, legend_ncol=1)

## Verb/Noun Frequency in the Training Data vs Delta Model Performance Across Shots

In [None]:
import csv
from decimal import Decimal

import seaborn as sns
import statsmodels.api as sm

# Count how often each structured verb and structured noun occurs in the
# rows of the training split.
train_verb_freq, train_noun_freq = Counter(), Counter()
with open("../../ego4d/splits/held_out_verb_noun-0.8/train.csv", newline="") as f:
    for row in csv.DictReader(f):
        train_verb_freq.update((row["structured_verb"],))
        train_noun_freq.update((row["structured_noun"],))


def draw_scatter_plots(verb_freq, noun_freq, action_scores_per_model, metric, fig_name):
    """Plot the change in a metric against log class frequency, per model.

    For every model two panels are drawn side by side: one against the
    frequency of each action's verb and one against the frequency of its
    noun. Each panel shows a scatter plot with a fitted regression line and
    is annotated with the R-squared of an ordinary least squares fit.

    Args:
        verb_freq: Mapping from structured verb to its frequency.
        noun_freq: Mapping from structured noun to its frequency.
        action_scores_per_model: Mapping from model name (used as the panel
            title) to a mapping from ``(verb, noun)`` to a dict holding a
            sequence of scores under ``metric``.
        metric: Key of the score sequence to analyse. The plotted value is
            the last score minus the first score of that sequence.
        fig_name: Path the figure is saved to.
    """
    sns.set_theme(style="darkgrid")
    plt.rcParams["font.family"] = "stixgeneral"
    num_panels = len(action_scores_per_model) * 2
    fig, axs = plt.subplots(1, num_panels, figsize=(10 * num_panels, 10))

    def get_freq_metric_per_action(freq, action_scores, verb):
        # Pick the verb (index 0) or noun (index 1) of each action key.
        idx = 0 if verb else 1
        freq_per_action = []
        metric_per_action = []
        for action, scores in action_scores.items():
            class_freq = freq[action[idx]]
            # Classes with zero frequency are skipped; log(0) is undefined.
            if class_freq == 0:
                continue
            freq_per_action.append(class_freq)
            # calculate the difference between last shot and first shot
            metric_per_action.append(scores[metric][-1] - scores[metric][0])
        return np.log(freq_per_action), np.array(metric_per_action)

    def fit_regression(x, y):
        x = sm.add_constant(x)  # Adds a constant term to the predictor
        model = sm.OLS(y, x).fit()
        return model.rsquared, model.pvalues[1]  # R-squared and p-value for the slope

    line_kws = {"color": "r", "linewidth": 8}

    def draw_panel(ax, model_name, action_scores, freq, verb, xlabel, show_ylabel):
        # Draw one scatter/regression panel. R-squared is computed from the
        # same data that is plotted in this panel.
        freq_per_action, metric_per_action = get_freq_metric_per_action(
            freq, action_scores, verb
        )
        r_squared, _ = fit_regression(freq_per_action, metric_per_action)
        sns.regplot(
            x=freq_per_action,
            y=metric_per_action,
            ax=ax,
            line_kws=line_kws,
        )
        # Rasterize the scatter plot in order to reduce the file size and loading time
        ax.collections[0].set_rasterized(True)
        ax.tick_params(axis="both", which="major", labelsize=32)
        ax.set_xlabel(xlabel, fontsize=38, fontweight="bold")
        if show_ylabel:
            ax.set_ylabel(
                f"Δ{metric} from 16 to 0-shot", fontsize=38, fontweight="bold"
            )
        ax.set_title(model_name, fontsize=38, fontweight="bold")
        ax.text(
            0.05,
            0.95,
            f"$R^2$: {Decimal(r_squared):.2e}",
            transform=ax.transAxes,  # Position based on axis coordinates
            verticalalignment="top",  # Align the text to the top
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white"),
            fontsize=38,
            fontweight="bold",
        )

    for i, (model_name, action_scores) in enumerate(action_scores_per_model.items()):
        index = i * 2
        # Verb panel; only the left-most panel of the figure gets a y label.
        draw_panel(
            axs[index],
            model_name,
            action_scores,
            verb_freq,
            True,
            "Log Verb Class Freq in Training Data",
            show_ylabel=(i == 0),
        )
        # Noun panel, annotated with the R-squared of the noun data.
        draw_panel(
            axs[index + 1],
            model_name,
            action_scores,
            noun_freq,
            False,
            "Log Noun Class Freq in Training Data",
            show_ylabel=False,
        )

    fig.tight_layout()
    # bbox_inches="tight" ensures that all the visible content
    # is saved into the pdf file.
    fig.savefig(fig_name, bbox_inches="tight")
    plt.show()

In [None]:
# Scores for every action in struct_verb_noun_counter, keyed by the model
# name that is shown as the panel title.
action_scores_per_model = {
    model_name: get_action_scores(
        struct_verb_noun_counter.keys(), narrations_by_action
    )
    for model_name, narrations_by_action in (
        ("EILEV BLIP-2 OPT-2.7B", narrations_by_action_eilev_blip2_opt_27b),
        ("EILEV BLIP-2 Flan-T5-xl", narrations_by_action_eilev_blip2_flan_t5_xl),
    )
}

draw_scatter_plots(
    verb_freq=train_verb_freq,
    noun_freq=train_noun_freq,
    action_scores_per_model=action_scores_per_model,
    metric="STS-BE",
    fig_name="freq-delta-sts-be.pdf",
)

## Action Frequency Bar Chart

In [None]:
import csv


def count_actions(*files):
    """Count (structured_verb, structured_noun) pairs across CSV files.

    Each file is read with ``csv.DictReader`` and must have
    "structured_verb" and "structured_noun" columns. Returns one ``Counter``
    with the totals over all given files.
    """
    tally = Counter()
    for path in files:
        with open(path, newline="") as handle:
            tally.update(
                (record["structured_verb"], record["structured_noun"])
                for record in csv.DictReader(handle)
            )
    return tally


def truncate_action_label(action):
    """Build a two-line label from a ``(verb, noun)`` pair.

    Each part is cut at its first underscore; the shortened verb and noun
    are joined with a newline.
    """
    verb, noun = action
    short_verb, _, _ = verb.partition("_")
    short_noun, _, _ = noun.partition("_")
    return short_verb + "\n" + short_noun


def get_bar_labels_values(counter):
    """Split a counter into parallel label and count sequences.

    Entries are taken in ``most_common()`` order and each action key is
    turned into a label with ``truncate_action_label``. Returns a ``zip``
    iterator that yields the tuple of labels followed by the tuple of counts.
    """
    labelled_counts = (
        (truncate_action_label(action), count)
        for action, count in counter.most_common()
    )
    return zip(*labelled_counts)


# Action counts over the train and val splits.
common_actions_counter = count_actions(
    "../../ego4d/splits/held_out_verb_noun-0.8/train.csv",
    "../../ego4d/splits/held_out_verb_noun-0.8/val.csv",
)
# Action counts over the test split.
rare_actions_counter = count_actions(
    "../../ego4d/splits/held_out_verb_noun-0.8/test.csv"
)

# Sanity check: the least frequent common action must still be at least as
# numerous as the most frequent rare action.
fewest_common_count = common_actions_counter.most_common()[-1][1]
most_rare_count = rare_actions_counter.most_common(1)[0][1]
assert fewest_common_count >= most_rare_count


def draw_freq_graph(common_actions_counter, rare_actions_counter, fig_name):
    """Plot action frequencies on a log scale, common actions before rare ones.

    The actions of each counter are ordered by descending count and the two
    sequences are concatenated. A dashed vertical line marks the boundary
    between the common and the rare actions.

    Args:
        common_actions_counter: Counter of ``(verb, noun)`` actions drawn on
            the left of the boundary.
        rare_actions_counter: Counter of ``(verb, noun)`` actions drawn on
            the right of the boundary.
        fig_name: Path the figure is saved to.
    """
    # set_theme is the same call the other plotting code in this file uses.
    sns.set_theme(style="darkgrid")
    plt.rcParams["font.family"] = "stixgeneral"
    plt.figure(figsize=(20, 16))
    common_action_labels, common_action_counts = get_bar_labels_values(
        common_actions_counter
    )
    rare_action_labels, rare_action_counts = get_bar_labels_values(rare_actions_counter)
    all_labels = common_action_labels + rare_action_labels
    all_counts = common_action_counts + rare_action_counts
    positions = range(len(all_labels))

    ax = sns.lineplot(x=positions, y=all_counts, linewidth=8)

    plt.yscale("log")

    # Fill the area under the line
    plt.fill_between(positions, all_counts, alpha=0.3)

    # mark the cut-off for common actions
    plt.axvline(
        x=len(common_actions_counter) - 0.5,
        color="red",
        linestyle="--",
        linewidth=8,
        label="80% Threshold",
    )
    plt.legend(prop=dict(weight="bold", size=42))

    plt.xlim(0, len(all_labels))
    xticks, _ = plt.xticks()
    plt.tick_params(axis="both", which="major", labelsize=36)
    # The automatically chosen tick locations are not clipped to the axis
    # limits, so keep only those that index an existing action. This avoids
    # an IndexError (or a silent wrap-around for a negative tick).
    label_ticks = [xtick for xtick in xticks if 0 <= xtick < len(all_labels)]
    plt.xticks(label_ticks, [all_labels[int(xtick)] for xtick in label_ticks])
    plt.xlabel("Action (Verb, Noun)", fontsize=42, fontweight="bold", labelpad=20)
    plt.ylabel("Frequency", fontsize=42, fontweight="bold")

    # Region captions, positioned in axes coordinates.
    caption_style = dict(
        bbox=dict(boxstyle="round,pad=0.3", facecolor="white"),
        fontsize=42,
        fontweight="bold",
        verticalalignment="top",
        transform=ax.transAxes,
    )
    ax.text(0.025, 0.95, "Common", **caption_style)
    ax.text(0.205, 0.95, "Rare", **caption_style)

    plt.tight_layout()
    # bbox_inches="tight" ensures that all the visible content
    # is saved into the pdf file.
    plt.savefig(fig_name, bbox_inches="tight")
    plt.show()

In [None]:
# Render the action frequency plot and save it to action-freq.pdf.
draw_freq_graph(
    common_actions_counter,
    rare_actions_counter,
    fig_name="action-freq.pdf",
)