# In-Context Learning Evaluation Figures

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Shot counts shown on the x axis; also reused below as the "shots" metadata
# of every model that was evaluated at all of these settings.
shots = [0, 1, 2, 4, 8, 12, 16]
# Metric keys to plot; draw_graphs creates one subplot per entry.
metrics = ["STS-BE", "ROUGE-L"]


def draw_graphs(model_data, models, metrics, fig_name, shots, legend_ncol):
    """Draw one line chart per metric, stacked vertically, then save and show it.

    Args:
        model_data: Mapping from model key to a dict that holds one list of
            scores per metric name plus a "meta" dict. "meta" must contain
            "shots" (the x values for that model) and "linestyle"; it may
            contain "name" (legend label used instead of the model key) and
            "zorder" (forwarded to the line plot).
        models: Keys of ``model_data`` to plot, in drawing and legend order.
        metrics: Metric names to plot; each one gets its own subplot.
        fig_name: Output path handed to ``plt.savefig``.
        shots: Shot counts used as the x tick labels.
        legend_ncol: Number of columns of the shared figure legend.

    Raises:
        ValueError: If ``metrics`` is empty, or if a model has fewer scores
            for a metric than it has shots.
        KeyError: If a model key, a metric, or a required "meta" field is
            missing from ``model_data``.
    """
    # Validate everything before touching any global plotting state so that
    # bad input fails fast with a message naming the offending series.
    if not metrics:
        raise ValueError("metrics must contain at least one metric name")
    for model_name in models:
        entry = model_data[model_name]
        n_shots = len(entry["meta"]["shots"])
        for metric in metrics:
            n_scores = len(entry[metric])
            if n_scores < n_shots:
                raise ValueError(
                    f"{model_name!r} has {n_scores} value(s) for {metric!r} "
                    f"but {n_shots} shot(s)"
                )

    sns.set_theme(style="darkgrid")
    plt.rcParams["font.family"] = "Caladea"
    # squeeze=False always yields a 2-D array of axes, so the single-metric
    # case needs no special handling.
    fig, axes = plt.subplots(
        len(metrics),
        1,
        figsize=(24, 8 * len(metrics)),
        sharex=True,
        squeeze=False,
    )
    tick_labels = [str(shot) for shot in shots]

    # One subplot per metric.
    for ax, metric in zip(axes[:, 0], metrics):
        ax.tick_params(axis="both", which="major", labelsize=48)
        for model_name in models:
            data = model_data[model_name]
            meta = data["meta"]
            kwargs = {}
            if "zorder" in meta:
                kwargs["zorder"] = meta["zorder"]
            model_shots = meta["shots"]
            sns.lineplot(
                # Shots are passed as strings; the ticks set below are placed
                # at positions 0..len(shots)-1 to match.
                x=[str(shot) for shot in model_shots],
                # Extra trailing scores, if any, are ignored.
                y=data[metric][: len(model_shots)],
                ax=ax,
                label=meta.get("name", model_name),
                linestyle=meta["linestyle"],
                linewidth=12,
                marker="D",
                markersize=20,
                **kwargs,
            )
        ax.set_ylabel(metric, fontsize=54, fontweight="bold")
        ax.set_xticks(ticks=range(len(shots)), labels=tick_labels)
        ax.grid(True)
        # The per-axes legend is replaced by a single figure-level legend.
        # get_legend() returns None when nothing was plotted on this axes.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

    # Every subplot draws the same models, so the last axes' handles suffice.
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(
        handles,
        labels,
        loc="lower center",
        bbox_to_anchor=(0.5, 1),
        fontsize=48,
        ncol=legend_ncol,
        columnspacing=0.8,
    )
    fig.text(0.5, 0.0, "Shots", ha="center", va="top", fontsize=54, fontweight="bold")

    # Adjust layout to prevent overlap
    plt.tight_layout()
    # bbox_inches="tight" ensures that all the visible content
    # is saved into the pdf file.
    plt.savefig(fig_name, bbox_inches="tight")
    plt.show()

## Held-Out Evaluation (Ego4D)

In [None]:
# Results table for this section. Each model key maps metric names to a list
# of scores (one per entry of that model's meta["shots"]) plus a "meta" dict
# with plotting options read by draw_graphs.
model_data = {
    "EILEV BLIP-2 OPT-2.7B": {
        "STS-CE": [0.2098, 0.4754, 0.4897, 0.5569, 0.612, 0.6312, 0.6363],
        "STS-BE": [0.3278, 0.5495, 0.571, 0.6284, 0.6735, 0.6898, 0.6936],
        "BERTScore-F1": [0.5234, 0.6305, 0.6399, 0.6463, 0.6543, 0.6539, 0.6529],
        "ROUGE-L": [0.2315, 0.5013, 0.5396, 0.5785, 0.6102, 0.6249, 0.6296],
        "BLEU": [0.008795, 0.1376, 0.2015, 0.2443, 0.2741, 0.2968, 0.3049],
        "meta": {"shots": shots, "linestyle": "-"},
    },
    "EILEV BLIP-2 Flan-T5-xl": {
        "STS-CE": [0.3552, 0.5039, 0.5176, 0.5539, 0.6089, 0.6276, 0.6349],
        "STS-BE": [0.426, 0.5697, 0.5812, 0.613, 0.6689, 0.6886, 0.6948],
        "BERTScore-F1": [-1.84, 0.6291, 0.6394, 0.6477, 0.6527, 0.6561, 0.6572],
        "ROUGE-L": [0.3129, 0.5032, 0.5322, 0.5648, 0.607, 0.6203, 0.623],
        "BLEU": [0.06718, 0.1507, 0.1992, 0.2373, 0.2834, 0.2931, 0.2913],
        "meta": {"shots": shots, "linestyle": "-"},
    },
    "FT BLIP-2 OPT-2.7B": {
        "STS-CE": [0.4719, 0.4743, 0.4316, 0.4454, 0.4557, 0.4699, 0.4749],
        "STS-BE": [0.5313, 0.5434, 0.5148, 0.5266, 0.5332, 0.5432, 0.5482],
        "BERTScore-F1": [0.6432, 0.6435, 0.6456, 0.6464, 0.6446, 0.652, 0.6368],
        "ROUGE-L": [0.5368, 0.5139, 0.519, 0.5243, 0.5205, 0.5319, 0.5151],
        "BLEU": [0.1683, 0.1564, 0.1923, 0.1952, 0.1852, 0.1947, 0.1696],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    "FT BLIP-2 Flan-T5-xl": {
        "STS-CE": [0.4849, 0.4755, 0.4776, 0.4933, 0.4934, 0.4641, 0.3997],
        "STS-BE": [0.5486, 0.5432, 0.5502, 0.5565, 0.5489, 0.5152, 0.444],
        "BERTScore-F1": [0.6602, 0.64, 0.645, 0.6362, 0.3001, -1.083, -4.232],
        "ROUGE-L": [0.5404, 0.5037, 0.5242, 0.5303, 0.5261, 0.4947, 0.4253],
        "BLEU": [0.188, 0.155, 0.1941, 0.1923, 0.1907, 0.1791, 0.1473],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    "BLIP-2 OPT-2.7B": {
        "STS-CE": [0.1505, 0.3586, 0.4012, 0.432, 0.4575, 0.4469, 0.3809],
        "STS-BE": [0.3094, 0.4233, 0.4862, 0.5222, 0.5486, 0.5422, 0.4755],
        "BERTScore-F1": [-1.716, -5.206, -0.9048, 0.3596, 0.5919, 0.5613, -0.315],
        "ROUGE-L": [0.09565, 0.3769, 0.4612, 0.5006, 0.5204, 0.5019, 0.4271],
        "BLEU": [0.003134, 0.1139, 0.1639, 0.1828, 0.1926, 0.1766, 0.1203],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    "BLIP-2 Flan-T5-xl": {
        "STS-CE": [0.1327, 0.4551, 0.4371, 0.4532, 0.476, 0.4812, 0.5006],
        "STS-BE": [0.2861, 0.5235, 0.4877, 0.502, 0.5315, 0.545, 0.5681],
        "BERTScore-F1": [0.3864, 0.576, 0.5802, 0.5833, 0.5829, 0.5792, 0.5765],
        "ROUGE-L": [0.0598, 0.4606, 0.4291, 0.4403, 0.4684, 0.4846, 0.5056],
        "BLEU": [0.0005683, 0.1405, 0.1188, 0.1258, 0.1609, 0.1771, 0.2052],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    # Only has results for 0, 1 and 2 shots.
    "Kosmos-2": {
        "STS-CE": [0.3261, 0.3052, 0.17],
        "STS-BE": [0.4014, 0.3732, 0.2999],
        "BERTScore-F1": [0.4971, 0.364, 0.4123],
        "ROUGE-L": [0.3258, 0.176, 0.1376],
        "BLEU": [0.03476, 0.005267, 0.005217],
        "meta": {"shots": [0, 1, 2], "linestyle": "--"},
    },
    "Otter": {
        "STS-CE": [0.3514, 0.324, 0.2897, 0.271, 0.2836, 0.2912, 0.2961],
        "STS-BE": [0.4074, 0.417, 0.3905, 0.3758, 0.371, 0.3682, 0.364],
        "BERTScore-F1": [0.3534, 0.3165, 0.3044, 0.2895, 0.2884, 0.289, 0.2808],
        "ROUGE-L": [0.2555, 0.2337, 0.2234, 0.2038, 0.1906, 0.1902, 0.1846],
        "BLEU": [
            0.003195,
            0.001461,
            0.001018,
            0.001041,
            0.0008764,
            0.0004788,
            0.0005822,
        ],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    # The VideoMAE entries have a single 0-shot data point each.
    "VideoMAE-base": {
        "STS-CE": [0.3799],
        "STS-BE": [0.479],
        "BERTScore-F1": [0.6516],
        "ROUGE-L": [0.4375],
        "BLEU": [0.0987],
        "meta": {"shots": [0], "linestyle": "--"},
    },
    "VideoMAE-huge": {
        "STS-CE": [0.4405],
        "STS-BE": [0.5282],
        "BERTScore-F1": [0.645],
        "ROUGE-L": [0.4509],
        "BLEU": [0.1071],
        # "name" overrides the legend label; "zorder" is forwarded to the plot call.
        "meta": {"shots": [0], "linestyle": "--", "name": "VideoMAE", "zorder": 10},
    },
}
# Models actually drawn, in plotting/legend order. Kosmos-2, Otter and
# VideoMAE-base are defined above but not included in this figure.
models = [
    "EILEV BLIP-2 OPT-2.7B",
    "EILEV BLIP-2 Flan-T5-xl",
    "VideoMAE-huge",
    "BLIP-2 OPT-2.7B",
    "BLIP-2 Flan-T5-xl",
    "FT BLIP-2 OPT-2.7B",
    "FT BLIP-2 Flan-T5-xl",
]

# Saves the figure to held-out-eval.pdf with a 3-column legend.
draw_graphs(model_data, models, metrics, "held-out-eval.pdf", shots, 3)

## Out-of-Distribution Evaluation (EPIC-KITCHENS-100)

In [None]:
# Results table for this section. Each model key maps metric names to a list
# of scores (one per entry of that model's meta["shots"]) plus a "meta" dict
# with plotting options read by draw_graphs.
model_data = {
    "EILEV BLIP-2 OPT-2.7B": {
        "STS-CE": [0.2151, 0.6128, 0.6099, 0.7724, 0.8081, 0.819, 0.8234],
        "STS-BE": [0.3605, 0.7313, 0.7119, 0.841, 0.8642, 0.8675, 0.8704],
        "BERTScore-F1": [0.4697, 0.678, 0.679, 0.6873, 0.6904, 0.6915, 0.694],
        "ROUGE-L": [0.204, 0.724, 0.7549, 0.8483, 0.8665, 0.8714, 0.8725],
        "BLEU": [0.01428, 0.2566, 0.4666, 0.6352, 0.6608, 0.6645, 0.6677],
        "meta": {"shots": shots, "linestyle": "-"},
    },
    "EILEV BLIP-2 Flan-T5-xl": {
        "STS-CE": [0.3134, 0.5512, 0.565, 0.6396, 0.7813, 0.8251, 0.8399],
        "STS-BE": [0.3829, 0.6612, 0.6767, 0.7455, 0.8512, 0.8815, 0.8916],
        # NOTE(review): only 6 values here but 7 shots — one score appears to
        # be missing. Not plotted with the current `metrics`; confirm against
        # the raw results before adding BERTScore-F1 to a figure.
        "BERTScore-F1": [-0.7901, 0.6835, 0.686, 0.6995, 0.6915, 0.6922],
        "ROUGE-L": [0.3009, 0.6629, 0.7167, 0.7788, 0.8583, 0.8813, 0.8904],
        "BLEU": [0.03337, 0.2287, 0.4043, 0.5166, 0.6673, 0.7125, 0.7325],
        "meta": {"shots": shots, "linestyle": "-"},
    },
    # NOTE(review): apart from the BERTScore-F1 list flagged below, both
    # "FT BLIP-2" entries carry exactly the same numbers as the held-out
    # (Ego4D) cell — confirm these are the intended EPIC-KITCHENS-100 results.
    "FT BLIP-2 OPT-2.7B": {
        "STS-CE": [0.4719, 0.4743, 0.4316, 0.4454, 0.4557, 0.4699, 0.4749],
        "STS-BE": [0.5313, 0.5434, 0.5148, 0.5266, 0.5332, 0.5432, 0.5482],
        "BERTScore-F1": [0.6432, 0.6435, 0.6456, 0.6464, 0.6446, 0.652, 0.6368],
        "ROUGE-L": [0.5368, 0.5139, 0.519, 0.5243, 0.5205, 0.5319, 0.5151],
        "BLEU": [0.1683, 0.1564, 0.1923, 0.1952, 0.1852, 0.1947, 0.1696],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    "FT BLIP-2 Flan-T5-xl": {
        "STS-CE": [0.4849, 0.4755, 0.4776, 0.4933, 0.4934, 0.4641, 0.3997],
        "STS-BE": [0.5486, 0.5432, 0.5502, 0.5565, 0.5489, 0.5152, 0.444],
        # NOTE(review): 8 values here but 7 shots. draw_graphs slices each
        # list to the number of shots, so the last value would be dropped
        # silently and the rest may be shifted — verify which entry is extra.
        "BERTScore-F1": [0.6602, 0.645, 0.64, 0.645, 0.6362, 0.3001, -1.083, -4.232],
        "ROUGE-L": [0.5404, 0.5037, 0.5242, 0.5303, 0.5261, 0.4947, 0.4253],
        "BLEU": [0.188, 0.155, 0.1941, 0.1923, 0.1907, 0.1791, 0.1473],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    "BLIP-2 OPT-2.7B": {
        "STS-CE": [0.08659, 0.6325, 0.541, 0.5713, 0.6197, 0.545, 0.4651],
        "STS-BE": [0.2806, 0.7269, 0.6322, 0.6676, 0.7203, 0.6479, 0.5928],
        "BERTScore-F1": [0.3099, -0.6529, 0.4143, 0.673, 0.6883, 0.6381, 0.4607],
        "ROUGE-L": [0.04052, 0.7076, 0.6738, 0.7125, 0.7676, 0.6322, 0.5593],
        "BLEU": [0.001951, 0.3479, 0.4054, 0.4363, 0.5027, 0.2774, 0.1263],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    "BLIP-2 Flan-T5-xl": {
        "STS-CE": [0.08193, 0.5044, 0.4155, 0.4518, 0.5141, 0.5509, 0.5766],
        "STS-BE": [0.2809, 0.5937, 0.487, 0.5317, 0.6066, 0.6589, 0.6941],
        "BERTScore-F1": [0.3219, 0.5882, 0.5754, 0.5959, 0.6237, 0.6296, 0.6124],
        "ROUGE-L": [0.02271, 0.5629, 0.4685, 0.5268, 0.6235, 0.6853, 0.723],
        "BLEU": [0.0, 0.1956, 0.1762, 0.2238, 0.3021, 0.3599, 0.3958],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    # Only has results for 0, 1 and 2 shots.
    "Kosmos-2": {
        "STS-CE": [0.3879, 0.2909, 0.1445],
        "STS-BE": [0.4214, 0.3846, 0.2756],
        "BERTScore-F1": [0.555, 0.4582, 0.4502],
        "ROUGE-L": [0.4424, 0.2526, 0.1448],
        "BLEU": [0.04952, 0.007954, 0.004721],
        "meta": {"shots": [0, 1, 2], "linestyle": "--"},
    },
    "Otter": {
        "STS-CE": [0.3787, 0.3698, 0.357, 0.3353, 0.3211, 0.3286, 0.3102],
        "STS-BE": [0.416, 0.4589, 0.4496, 0.4199, 0.3777, 0.3592, 0.3446],
        "BERTScore-F1": [0.3554, 0.3554, 0.355, 0.3112, 0.2854, 0.2754, 0.2293],
        "ROUGE-L": [0.2318, 0.2513, 0.2499, 0.2164, 0.1827, 0.1731, 0.1497],
        "BLEU": [0.001755, 0.0005584, 0.0, 0.0004919, 0.0, 0.0003501, 0.000305],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    # The VideoMAE entries have a single 0-shot data point each.
    "VideoMAE-base": {
        "STS-CE": [0.3356],
        "STS-BE": [0.4407],
        "BERTScore-F1": [0.6934],
        "ROUGE-L": [0.491],
        "BLEU": [0.1953],
        "meta": {"shots": [0], "linestyle": "--"},
    },
    "VideoMAE-huge": {
        "STS-CE": [0.4078],
        "STS-BE": [0.5108],
        "BERTScore-F1": [0.6883],
        "ROUGE-L": [0.5062],
        "BLEU": [0.2096],
        # "name" overrides the legend label used by draw_graphs.
        "meta": {"shots": [0], "linestyle": "--", "name": "VideoMAE"},
    },
}
# Models actually drawn, in plotting/legend order. Kosmos-2, Otter and
# VideoMAE-base are defined above but not included in this figure.
models = [
    "EILEV BLIP-2 OPT-2.7B",
    "EILEV BLIP-2 Flan-T5-xl",
    "VideoMAE-huge",
    "BLIP-2 OPT-2.7B",
    "BLIP-2 Flan-T5-xl",
    "FT BLIP-2 OPT-2.7B",
    "FT BLIP-2 Flan-T5-xl",
]

# Saves the figure to out-of-dist-eval.pdf with a 3-column legend.
draw_graphs(model_data, models, metrics, "out-of-dist-eval.pdf", shots, 3)

## Novel, Rare Verb/Noun Pair Evaluation

In [None]:
# Results table for this section. Each model key maps metric names to a list
# of scores (one per entry of that model's meta["shots"]) plus a "meta" dict
# with plotting options read by draw_graphs.
model_data = {
    "EILEV BLIP-2 OPT-2.7B": {
        "STS-CE": [0.222, 0.5117, 0.51, 0.5939, 0.6602, 0.6775, 0.6814],
        "STS-BE": [0.3672, 0.5864, 0.5978, 0.6727, 0.7311, 0.7466, 0.7512],
        "BERTScore-F1": [0.5451, 0.6229, 0.6323, 0.639, 0.6444, 0.6449, 0.6448],
        "ROUGE-L": [0.1916, 0.5245, 0.5601, 0.6069, 0.6424, 0.6547, 0.6606],
        "BLEU": [0.01493, 0.1183, 0.2028, 0.2526, 0.2888, 0.3037, 0.3145],
        "meta": {"shots": shots, "linestyle": "-"},
    },
    "EILEV BLIP-2 Flan-T5-xl": {
        "STS-CE": [0.3368, 0.5243, 0.5319, 0.5983, 0.6553, 0.6794, 0.6889],
        "STS-BE": [0.4282, 0.595, 0.6037, 0.6606, 0.726, 0.7484, 0.7569],
        "BERTScore-F1": [0.5189, 0.6147, 0.6184, 0.6258, 0.6347, 0.6396, 0.642],
        "ROUGE-L": [0.3103, 0.5236, 0.5448, 0.592, 0.644, 0.6605, 0.6653],
        "BLEU": [0.05684, 0.1503, 0.1947, 0.258, 0.3148, 0.3278, 0.3266],
        "meta": {"shots": shots, "linestyle": "-"},
    },
    "BLIP-2 OPT-2.7B": {
        "STS-CE": [0.1421, 0.3897, 0.4104, 0.432, 0.4536, 0.4386, 0.3707],
        "STS-BE": [0.3071, 0.4558, 0.507, 0.5324, 0.5551, 0.542, 0.475],
        "BERTScore-F1": [-1.763, -4.801, -0.3869, 0.3916, 0.5844, 0.5066, -0.6429],
        "ROUGE-L": [0.1011, 0.3885, 0.4722, 0.5006, 0.5223, 0.4982, 0.4214],
        "BLEU": [0.002703, 0.0845, 0.1575, 0.1663, 0.1752, 0.1583, 0.109],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    "BLIP-2 Flan-T5-xl": {
        "STS-CE": [0.1224, 0.4742, 0.4243, 0.4405, 0.4649, 0.4819, 0.4915],
        "STS-BE": [0.284, 0.5453, 0.4864, 0.4999, 0.5323, 0.5552, 0.5683],
        "BERTScore-F1": [0.3838, 0.5617, 0.5734, 0.5749, 0.5726, 0.5693, 0.5662],
        "ROUGE-L": [0.05918, 0.4672, 0.4285, 0.439, 0.4708, 0.492, 0.5064],
        "BLEU": [0.0006598, 0.106, 0.1092, 0.1142, 0.1505, 0.1752, 0.1923],
        "meta": {"shots": shots, "linestyle": "--"},
    },
    # Only has results for 0, 1 and 2 shots.
    "Kosmos-2": {
        "STS-CE": [0.318, 0.2947, 0.159],
        "STS-BE": [0.4035, 0.3773, 0.3012],
        "BERTScore-F1": [0.4957, 0.3589, 0.4132],
        "ROUGE-L": [0.3234, 0.1737, 0.1344],
        "BLEU": [0.03195, 0.005029, 0.004357],
        "meta": {"shots": [0, 1, 2], "linestyle": "--"},
    },
    "Otter": {
        "STS-CE": [0.3535, 0.3718, 0.3636, 0.3549, 0.3202, 0.3316, 0.3266],
        "STS-BE": [0.4189, 0.4439, 0.4315, 0.4318, 0.4154, 0.4249, 0.4191],
        "BERTScore-F1": [0.3509, 0.3798, 0.4137, 0.4486, 0.5063, 0.473, 0.47],
        "ROUGE-L": [0.2517, 0.2801, 0.303, 0.3121, 0.3278, 0.323, 0.3181],
        "BLEU": [0.003309, 0.003763, 0.004245, 0.004048, 0.004331, 0.003708, 0.003702],
        "meta": {"shots": shots, "linestyle": "--"},
    },
}

# Models actually drawn, in plotting/legend order. The two plain BLIP-2
# entries are defined above but not included in this figure.
models = ["EILEV BLIP-2 OPT-2.7B", "EILEV BLIP-2 Flan-T5-xl", "Kosmos-2", "Otter"]
# Saves the figure to novel-rare-eval.pdf with a 2-column legend.
draw_graphs(model_data, models, metrics, "novel-rare-eval.pdf", shots, 2)