# Skewed Distributions Figures

In [None]:
import csv
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

def count_actions(*files):
    """Tally (verb, noun) actions across one or more CSV files.

    Every file is read with ``csv.DictReader``, so its first row is the
    header; the header must provide the ``structured_verb`` and
    ``structured_noun`` columns.

    Args:
        *files: Paths of the CSV files to read.

    Returns:
        A ``Counter`` mapping ``(structured_verb, structured_noun)`` tuples
        to the number of rows carrying that pair, summed over all files.
    """
    tally = Counter()
    for path in files:
        with open(path, newline="") as handle:
            # Counter.update adds one per element produced by the generator.
            tally.update(
                (row["structured_verb"], row["structured_noun"])
                for row in csv.DictReader(handle)
            )
    return tally


def truncate_action_label(action):
    """Build a short two-line label for a (verb, noun) action.

    Args:
        action: A two-item sequence ``(verb, noun)`` of strings.

    Returns:
        The verb and the noun, each cut off at its first underscore,
        joined by a newline.
    """
    verb, noun = action
    # Keep only the text before the first "_" of each part.
    short_verb = verb.partition("_")[0]
    short_noun = noun.partition("_")[0]
    return short_verb + "\n" + short_noun


def get_bar_labels_values(counter):
    """Split a counter into parallel label and count sequences.

    Actions are taken in ``counter.most_common()`` order (most frequent
    first) and each action is turned into a label with
    ``truncate_action_label``.

    Args:
        counter: A ``Counter`` keyed by ``(verb, noun)`` actions.

    Returns:
        A ``zip`` iterator that yields the tuple of labels followed by the
        tuple of counts, so it can be unpacked as ``labels, counts``.
    """
    ranked = counter.most_common()
    label_count_pairs = [
        (truncate_action_label(action), occurrences)
        for action, occurrences in ranked
    ]
    # Transpose the list of (label, count) pairs into (labels, counts).
    return zip(*label_count_pairs)


def draw_freq_graph(actions_counter, cutoffs, fig_name):
    """Plot action frequencies by rank on a log scale, then save and show it.

    Args:
        actions_counter: Counter of (verb, noun) actions; actions are plotted
            from most to least common.
        cutoffs: Iterable of ``(cutoff, label, color)`` triples. A dashed
            vertical line is drawn at ``x = cutoff - 0.5`` for each one and
            listed in the legend under ``label``.
        fig_name: Destination passed to ``plt.savefig``.
    """
    sns.set_theme(style="darkgrid")
    plt.figure(figsize=(30, 11))

    labels, counts = get_bar_labels_values(actions_counter)
    num_actions = len(labels)
    ranks = range(num_actions)

    # Frequency curve over the ranked actions, on a logarithmic y axis.
    sns.lineplot(x=ranks, y=counts, linewidth=8)
    plt.yscale("log")

    # Shade the region below the curve.
    plt.fill_between(ranks, counts, alpha=0.3)

    # One dashed vertical marker per cut-off.
    for position, legend_text, line_color in cutoffs:
        plt.axvline(
            x=position - 0.5,
            color=line_color,
            linestyle="--",
            linewidth=8,
            label=legend_text,
        )
    plt.legend(prop={"weight": "bold", "size": 72})

    plt.xlim(0, num_actions)
    tick_positions, _ = plt.xticks()
    plt.tick_params(axis="both", which="major", labelsize=36)
    # Drop the final tick before looking up labels (presumably it lies at or
    # past the right edge and has no matching action); the remaining ticks
    # are relabelled with the action found at that rank.
    shown_ticks = tick_positions[:-1]
    tick_labels = [labels[int(position)] for position in shown_ticks]
    plt.xticks(shown_ticks, tick_labels)
    plt.xlabel("Action (Verb, Noun)", fontsize=42, fontweight="bold", labelpad=20)
    plt.ylabel("Frequency", fontsize=42, fontweight="bold")

    plt.tight_layout()
    # bbox_inches="tight" keeps all visible content inside the saved file.
    plt.savefig(fig_name, bbox_inches="tight")
    plt.show()

In [None]:
# Count how often each (verb, noun) action occurs across the train and val
# CSVs of the held_out_verb_noun-0.8 split.
actions_counter = count_actions(
    "../../ego4d/splits/held_out_verb_noun-0.8/train.csv",
    "../../ego4d/splits/held_out_verb_noun-0.8/val.csv",
)

In [None]:
# Plot the ranked action frequencies and save the figure as "skewed-dist.pdf".
# Each cutoff entry is (rank, legend label, line color); draw_freq_graph
# draws the marker at x = rank - 0.5.
draw_freq_graph(
    actions_counter,
    (
        # NOTE(review): this places the "EILeV" marker two actions before the
        # end of the ranking — confirm that offset is the intended cut-off.
        (len(actions_counter) - 2, "EILeV", "green"),
        (100, "Top 100 (Ablation)", "red"),
        (500, "Top 500 (Ablation)", "blue"),
    ),
    "skewed-dist.pdf",
)