# Shuffled In-Context Video Clips Evaluation Figures

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Shot counts shown as the x tick labels; also stored as each model's
# meta["shots"], which supplies the x values of its plotted line.
shots = [2, 4, 8, 12, 16]
# Metrics to draw, one subplot row each. Every name listed here must be a key
# of the per-model score dictionaries handed to draw_graphs.
metrics = ["STS-BE"]


def calc_percent_diff(new, ref):
    """Return the change from ``ref`` to ``new`` as a percentage of ``ref``.

    Only ``-``, ``/`` and ``*`` are applied, so scalars and NumPy arrays
    (element-wise) both work. The divisor is ``ref`` itself rather than its
    magnitude, so a negative ``ref`` flips the sign of the result, and a zero
    ``ref`` is not handled specially.
    """
    delta = new - ref
    relative_change = delta / ref
    return relative_change * 100


def draw_graphs(variants, metrics, fig_name, shots):
    """Draw a grid of line charts, save it to ``fig_name`` and show it.

    The grid has one row per metric and one column per variant, and all
    subplots share both axes. Within a subplot, one line is drawn per model.

    Args:
        variants: Mapping of variant name (used as the column title) to a
            mapping of model name to data. Each data mapping must contain,
            for every plotted metric, a sequence of y values, plus a
            ``"meta"`` mapping with ``"shots"`` (the x values) and
            ``"linestyle"``.
        metrics: Names of the metrics to plot; each is looked up as a key of
            the per-model data mappings.
        fig_name: Output path handed to ``plt.savefig``.
        shots: Shot counts used for the x tick labels.
    """
    sns.set_theme(style="darkgrid")
    plt.rcParams["font.family"] = "Caladea"
    # squeeze=False keeps ``axs`` two-dimensional for every grid shape, so a
    # subplot is always addressed as axs[row, col], including 1xN, Nx1 and
    # NxM grids.
    fig, axs = plt.subplots(
        len(metrics),
        len(variants),
        figsize=(8 * len(variants), 8 * len(metrics)),
        sharex=True,
        sharey=True,
        squeeze=False,
    )

    # One subplot per (metric, variant) pair.
    for i, metric in enumerate(metrics):
        for j, (variant_name, variant_data) in enumerate(variants.items()):
            ax = axs[i, j]
            ax.tick_params(axis="both", which="major", labelsize=30)
            # Dashed zero line marks "no difference".
            ax.axhline(0, color="black", linewidth=6, linestyle="--")
            if i == 0:
                # Column titles only on the top row.
                ax.set_title(variant_name, fontsize=36, fontweight="bold")
            for model_name, data in variant_data.items():
                model_shots = data["meta"]["shots"]
                sns.lineplot(
                    # String x values so the shots are spaced evenly.
                    x=[str(shot) for shot in model_shots],
                    # Drop any values beyond the number of shots.
                    y=data[metric][: len(model_shots)],
                    ax=ax,
                    label=f"{model_name}",
                    linestyle=data["meta"]["linestyle"],
                    linewidth=8,
                    marker="D",
                    markersize=16,
                )
            if i == len(metrics) - 1:
                # x label only on the bottom row.
                ax.set_xlabel("Shot", fontsize=36, fontweight="bold")
            if j == 0:
                # y label only on the first column.
                ax.set_ylabel(f"% Diff. {metric}", fontsize=36, fontweight="bold")
            ax.set_xticks(ticks=range(len(shots)), labels=[str(shot) for shot in shots])
            ax.grid(True)
            # The per-axes legend is replaced by a single figure-level one.
            # A subplot without labelled lines has no legend to remove.
            legend = ax.get_legend()
            if legend is not None:
                legend.remove()

    # The shared legend is built from the last subplot's lines.
    handles, labels = axs[-1, -1].get_legend_handles_labels()
    fig.legend(
        handles,
        labels,
        loc="lower center",
        bbox_to_anchor=(0.5, 1),
        fontsize=36,
        ncols=2,
    )

    # Adjust layout to prevent overlap
    plt.tight_layout()
    # bbox_inches="tight" ensures that all the visible content
    # is saved into the pdf file.
    plt.savefig(fig_name, bbox_inches="tight")
    plt.show()

## Shuffle In-Context Video Clips

In [None]:
import numpy as np

# Per-model evaluation scores. Each dictionary maps a metric name to an array
# of five values; position k is plotted against shots[k] by draw_graphs.
# Dictionaries come in pairs: the ``shuffled_*`` one is passed as ``new`` and
# its unprefixed counterpart as ``ref`` to calc_percent_diff below.
# NOTE(review): presumably the shuffled_* scores were measured with the
# in-context video clips shuffled and the unprefixed ones without shuffling -
# inferred from the names and section title only; confirm against the
# evaluation logs.

# BLIP-2 with OPT-2.7B.
shuffled_blip_2_opt_27b = {
    "STS-CE": np.array([0.4004, 0.429, 0.4535, 0.4438, 0.3765]),
    "STS-BE": np.array([0.486, 0.5202, 0.546, 0.541, 0.4715]),
    "BERTScore-F1": np.array([-0.8172, 0.3585, 0.5949, 0.5467, -0.3248]),
    "ROUGE-L": np.array([0.4626, 0.4966, 0.5135, 0.4975, 0.4221]),
    "BLEU": np.array([0.1607, 0.1738, 0.1819, 0.1695, 0.1141]),
}
blip_2_opt_27b = {
    "STS-CE": np.array([0.4012, 0.432, 0.4575, 0.4469, 0.3809]),
    "STS-BE": np.array([0.4862, 0.5222, 0.5486, 0.5422, 0.4755]),
    "BERTScore-F1": np.array([-0.9048, 0.3596, 0.5919, 0.5613, -0.315]),
    "ROUGE-L": np.array([0.4612, 0.5006, 0.5204, 0.5019, 0.4271]),
    "BLEU": np.array([0.1639, 0.1828, 0.1926, 0.1766, 0.1203]),
}

# BLIP-2 with Flan-T5-xl.
shuffled_blip_2_flan_t5_xl = {
    "STS-CE": np.array([0.4303, 0.4506, 0.4723, 0.4825, 0.489]),
    "STS-BE": np.array([0.4829, 0.5013, 0.5315, 0.549, 0.5586]),
    "BERTScore-F1": np.array([0.5803, 0.5819, 0.5787, 0.5745, 0.5724]),
    "ROUGE-L": np.array([0.4335, 0.4453, 0.472, 0.489, 0.4985]),
    "BLEU": np.array([0.123, 0.1309, 0.1644, 0.1874, 0.1966]),
}
blip_2_flan_t5_xl = {
    "STS-CE": np.array([0.4371, 0.4532, 0.476, 0.4812, 0.5006]),
    "STS-BE": np.array([0.4877, 0.502, 0.5315, 0.545, 0.5681]),
    "BERTScore-F1": np.array([0.5802, 0.5833, 0.5829, 0.5792, 0.5765]),
    "ROUGE-L": np.array([0.4291, 0.4403, 0.4684, 0.4846, 0.5056]),
    "BLEU": np.array([0.1188, 0.1258, 0.1609, 0.1771, 0.2052]),
}

# EILEV BLIP-2 with OPT-2.7B.
shuffled_eilev_blip_2_opt_27b = {
    "STS-CE": np.array([0.46, 0.5263, 0.5916, 0.6178, 0.6294]),
    "STS-BE": np.array([0.5483, 0.6042, 0.6572, 0.6776, 0.6879]),
    "BERTScore-F1": np.array([0.6377, 0.6442, 0.6531, 0.6538, 0.6529]),
    "ROUGE-L": np.array([0.5328, 0.5702, 0.6034, 0.6198, 0.6264]),
    "BLEU": np.array([0.2012, 0.24, 0.2689, 0.2904, 0.301]),
}
eilev_blip_2_opt_27b = {
    "STS-CE": np.array([0.4897, 0.5569, 0.612, 0.6312, 0.6363]),
    "STS-BE": np.array([0.571, 0.6284, 0.6735, 0.6898, 0.6936]),
    "BERTScore-F1": np.array([0.6399, 0.6463, 0.6543, 0.6539, 0.6529]),
    "ROUGE-L": np.array([0.5396, 0.5785, 0.6102, 0.6249, 0.6296]),
    "BLEU": np.array([0.2015, 0.2443, 0.2741, 0.2968, 0.3049]),
}

# EILEV BLIP-2 with Flan-T5-xl.
shuffled_eilev_blip_2_flan_t5_xl = {
    "STS-CE": np.array([0.4978, 0.5404, 0.598, 0.6188, 0.6272]),
    "STS-BE": np.array([0.5682, 0.6052, 0.6613, 0.6817, 0.6893]),
    "BERTScore-F1": np.array([0.6419, 0.6494, 0.6538, 0.6579, 0.6594]),
    "ROUGE-L": np.array([0.5353, 0.5674, 0.6058, 0.6188, 0.6223]),
    "BLEU": np.array([0.2148, 0.2452, 0.2831, 0.2892, 0.2877]),
}
eilev_blip_2_flan_t5_xl = {
    "STS-CE": np.array([0.5176, 0.5539, 0.6089, 0.6276, 0.6349]),
    "STS-BE": np.array([0.5812, 0.613, 0.6689, 0.6886, 0.6948]),
    "BERTScore-F1": np.array([0.6394, 0.6477, 0.6527, 0.6561, 0.6572]),
    "ROUGE-L": np.array([0.5322, 0.5648, 0.607, 0.6203, 0.623]),
    "BLEU": np.array([0.1992, 0.2373, 0.2834, 0.2931, 0.2913]),
}

def _percent_diff_entry(shuffled_scores, reference_scores, linestyle="-"):
    """Build one per-model entry in the shape ``draw_graphs`` reads.

    Args:
        shuffled_scores: Mapping of metric name to scores, used as ``new``.
        reference_scores: Mapping with the same metric names, used as ``ref``.
        linestyle: Line style stored in the entry's ``"meta"`` mapping.

    Returns:
        A dict mapping every metric in ``shuffled_scores`` to its percent
        difference against ``reference_scores``, plus a ``"meta"`` key holding
        the module-level ``shots`` and the line style.
    """
    entry = {
        metric: calc_percent_diff(scores, reference_scores[metric])
        for metric, scores in shuffled_scores.items()
    }
    entry["meta"] = {"shots": shots, "linestyle": linestyle}
    return entry


# Variant name (subplot title) -> model name (legend label) -> percent
# difference of the shuffled scores relative to the unprefixed scores.
models = {
    "BLIP-2": {
        "OPT-2.7B": _percent_diff_entry(shuffled_blip_2_opt_27b, blip_2_opt_27b),
        "Flan-T5-xl": _percent_diff_entry(
            shuffled_blip_2_flan_t5_xl, blip_2_flan_t5_xl
        ),
    },
    "EILEV BLIP-2": {
        "OPT-2.7B": _percent_diff_entry(
            shuffled_eilev_blip_2_opt_27b, eilev_blip_2_opt_27b
        ),
        "Flan-T5-xl": _percent_diff_entry(
            shuffled_eilev_blip_2_flan_t5_xl, eilev_blip_2_flan_t5_xl
        ),
    },
}

draw_graphs(models, metrics, "shuffle-in-context.pdf", shots)