# Ego4D Eval

## Semantic Similarity Metrics

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Shot counts (number of in-context examples) used as the shared x-axis.
shots = [0, 1, 2, 4, 8, 12, 16]

# Every dict below maps a metric name to its scores, listed in the same order
# as `shots`. The BERTScore-F1 rows are kept for reference only: they are
# commented out and therefore never plotted.

# BLIP-2 OPT-2.7B (baseline)
# 0, 1, 2, 4, 8, 12, 16 shots
blip2_opt_2_7b = {
    "STS-CE": [0.1505, 0.3586, 0.4012, 0.432, 0.4575, 0.4469, 0.3809],
    "STS-BE": [0.3094, 0.4233, 0.4862, 0.5222, 0.5486, 0.5422, 0.4755],
    # 'BERTScore-F1': [-1.716, -5.206, -0.9048, 0.3596, 0.5919, 0.5613, -0.315],
}

# BLIP-2 Flan-T5-xl (baseline)
# 0, 1, 2, 4, 8, 12, 16 shots
blip2_flan_t5_xl = {
    "STS-CE": [0.1327, 0.4551, 0.4371, 0.4532, 0.476, 0.4812, 0.5006],
    "STS-BE": [0.2861, 0.5235, 0.4877, 0.502, 0.5315, 0.545, 0.5681],
    # 'BERTScore-F1': [0.3864, 0.576, 0.5802, 0.5833, 0.5829, 0.5792, 0.5765],
}

# EILEV-trained BLIP-2 OPT-2.7B
# 0, 1, 2, 4, 8, 12, 16 shots
eilev_blip2_opt_2_7b = {
    "STS-CE": [0.2098, 0.4754, 0.4897, 0.5569, 0.612, 0.6312, 0.6363],
    "STS-BE": [0.3278, 0.5495, 0.571, 0.6284, 0.6735, 0.6898, 0.6936],
    # 'BERTScore-F1': [0.5234, 0.6305, 0.6399, 0.6463, 0.6543, 0.6539, 0.6529],
}

# EILEV Flan-T5-xl
# 0, 1, 2, 4, 8, 12, 16 shots
eilev_flan_t5_xl = {
    "STS-CE": [0.3552, 0.5039, 0.5176, 0.5539, 0.6089, 0.6276, 0.6349],
    "STS-BE": [0.426, 0.5697, 0.5812, 0.613, 0.6689, 0.6886, 0.6948],
    # 'BERTScore-F1': [-1.84, 0.6291, 0.6394, 0.6477, 0.6527, 0.6561, 0.6572],
}

# Kosmos-2
# 0, 1, 2 shots only, so its curves stop after the third x position.
kosmos2 = {
    "STS-CE": [0.3261, 0.3052, 0.17],
    "STS-BE": [0.4014, 0.3732, 0.2999],
    # 'BERTScore-F1': [0.4971, 0.364, 0.4123],
}

# Otter
# 0, 1, 2, 4, 8, 12, 16 shots
otter = {
    "STS-CE": [0.3514, 0.324, 0.2897, 0.271, 0.2836, 0.2912, 0.2961],
    "STS-BE": [0.4074, 0.417, 0.3905, 0.3758, 0.371, 0.3682, 0.364],
    # 'BERTScore-F1': [0.3534, 0.3165, 0.3044, 0.2895, 0.2884, 0.289, 0.2808],
}

# Plotting
plt.figure(figsize=(10, 5))

# The title is raised (y=1.25) because the legend is anchored directly on top
# of the axes (see plt.legend below).
plt.title("Ego4D: Semantic-similarity-based Metrics", y=1.25, fontweight="bold")

# One row per model, in drawing (and legend) order:
# (legend prefix, metric -> scores, line style).
# EILEV models keep matplotlib's default solid line; baselines are dashed.
# Each model has its own marker: 12 curves are drawn, and with matplotlib's
# stock 10-entry colour cycle the last two (Otter) reuse the colours of the
# first two, so colour alone cannot identify a series.
series = [
    ("EILEV BLIP-2 OPT-2.7B", eilev_blip2_opt_2_7b, {"marker": "o"}),
    ("EILEV BLIP-2 Flan-T5-xl", eilev_flan_t5_xl, {"marker": "s"}),
    ("BLIP-2 OPT-2.7B", blip2_opt_2_7b, {"linestyle": "--", "marker": "^"}),
    ("BLIP-2 Flan-T5-xl", blip2_flan_t5_xl, {"linestyle": "--", "marker": ">"}),
    ("Kosmos-2", kosmos2, {"linestyle": "--", "marker": "v"}),
    ("Otter", otter, {"linestyle": "--", "marker": "D"}),
]

for model, scores, style in series:
    for metric, values in scores.items():
        # Pair the scores with the leading shot counts so that a model with
        # fewer recorded settings (Kosmos-2) still lines up on the x-axis.
        plt.plot(shots[: len(values)], values, label=f"{model} {metric}", **style)

plt.xlabel("Shots")
plt.ylabel("Score")
# Tick exactly at the evaluated shot counts rather than at automatic positions.
plt.xticks(shots, labels=[str(s) for s in shots])
plt.xlim(0, max(shots))
plt.legend(loc="lower center", bbox_to_anchor=(0.5, 1), ncol=3)
plt.grid(True)
plt.show()


## N-gram Metrics

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Shot counts (number of in-context examples) used as the shared x-axis.
shots = [0, 1, 2, 4, 8, 12, 16]

# Every dict below maps a metric name to its scores, listed in the same order
# as `shots`.

# BLIP-2 OPT-2.7B (baseline)
# 0, 1, 2, 4, 8, 12, 16 shots
blip2_opt_2_7b = {
    "RougeL": [0.09565, 0.3769, 0.4612, 0.5006, 0.5204, 0.5019, 0.4271],
    "BLEU": [0.003134, 0.1139, 0.1639, 0.1828, 0.1926, 0.1766, 0.1203],
}

# BLIP-2 Flan-T5-xl (baseline)
# 0, 1, 2, 4, 8, 12, 16 shots
blip2_flan_t5_xl = {
    "RougeL": [0.0598, 0.4606, 0.4291, 0.4403, 0.4684, 0.4846, 0.5056],
    "BLEU": [0.0005683, 0.1405, 0.1188, 0.1258, 0.1609, 0.1771, 0.2052],
}

# EILEV-trained BLIP-2 OPT-2.7B
# 0, 1, 2, 4, 8, 12, 16 shots
eilev_blip2_opt_2_7b = {
    "RougeL": [0.2315, 0.5013, 0.5396, 0.5785, 0.6102, 0.6249, 0.6296],
    "BLEU": [0.008795, 0.1376, 0.2015, 0.2443, 0.2741, 0.2968, 0.3049],
}

# EILEV Flan-T5-xl
# 0, 1, 2, 4, 8, 12, 16 shots
eilev_flan_t5_xl = {
    "RougeL": [0.3129, 0.5032, 0.5322, 0.5648, 0.607, 0.6203, 0.623],
    "BLEU": [0.06718, 0.1507, 0.1992, 0.2373, 0.2834, 0.2931, 0.2913],
}

# Kosmos-2
# 0, 1, 2 shots only, so its curves stop after the third x position.
kosmos2 = {
    "RougeL": [0.3258, 0.176, 0.1376],
    "BLEU": [0.03476, 0.005267, 0.005217],
}

# Otter
# 0, 1, 2, 4, 8, 12, 16 shots
otter = {
    "RougeL": [0.2555, 0.2337, 0.2234, 0.2038, 0.1906, 0.1902, 0.1846],
    "BLEU": [0.003195, 0.001461, 0.001018, 0.001041, 0.0008764, 0.0004788, 0.0005822],
}

# Plotting
plt.figure(figsize=(10, 5))

# The title is raised (y=1.25) because the legend is anchored directly on top
# of the axes (see plt.legend below).
plt.title("Ego4D: N-gram-based Metrics", y=1.25, fontweight="bold")

# One row per model, in drawing (and legend) order:
# (legend prefix, metric -> scores, line style).
# EILEV models keep matplotlib's default solid line; baselines are dashed.
# Each model has its own marker: 12 curves are drawn, and with matplotlib's
# stock 10-entry colour cycle the last two (Otter) reuse the colours of the
# first two, so colour alone cannot identify a series.
series = [
    ("EILEV BLIP-2 OPT-2.7B", eilev_blip2_opt_2_7b, {"marker": "o"}),
    ("EILEV BLIP-2 Flan-T5-xl", eilev_flan_t5_xl, {"marker": "s"}),
    ("BLIP-2 OPT-2.7B", blip2_opt_2_7b, {"linestyle": "--", "marker": "^"}),
    ("BLIP-2 Flan-T5-xl", blip2_flan_t5_xl, {"linestyle": "--", "marker": ">"}),
    ("Kosmos-2", kosmos2, {"linestyle": "--", "marker": "v"}),
    ("Otter", otter, {"linestyle": "--", "marker": "D"}),
]

for model, scores, style in series:
    for metric, values in scores.items():
        # Pair the scores with the leading shot counts so that a model with
        # fewer recorded settings (Kosmos-2) still lines up on the x-axis.
        plt.plot(shots[: len(values)], values, label=f"{model} {metric}", **style)

plt.xlabel("Shots")
plt.ylabel("Score")
# Tick exactly at the evaluated shot counts rather than at automatic positions.
plt.xticks(shots, labels=[str(s) for s in shots])
plt.xlim(0, max(shots))
plt.legend(loc="lower center", bbox_to_anchor=(0.5, 1), ncol=3)
plt.grid(True)
plt.show()


# EPIC-KITCHENS Eval

## Semantic Similarity Metrics

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Shot counts (number of in-context examples) used as the shared x-axis.
shots = [0, 1, 2, 4, 8, 12, 16]

# Every dict below maps a metric name to its scores, listed in the same order
# as `shots`. The BERTScore-F1 rows are kept for reference only: they are
# commented out and therefore never plotted.

# BLIP-2 OPT-2.7B (baseline)
# 0, 1, 2, 4, 8, 12, 16 shots
blip2_opt_2_7b = {
    "STS-CE": [0.08659, 0.6325, 0.541, 0.5713, 0.6197, 0.545, 0.4651],
    "STS-BE": [0.2806, 0.7269, 0.6322, 0.6676, 0.7203, 0.6479, 0.5928],
    # 'BERTScore-F1': [0.3099, -0.6529, 0.4143, 0.673, 0.6883, 0.6381, 0.4607],
}

# BLIP-2 Flan-T5-xl (baseline)
# 0, 1, 2, 4, 8, 12, 16 shots
blip2_flan_t5_xl = {
    "STS-CE": [0.08193, 0.5044, 0.4155, 0.4518, 0.5141, 0.5509, 0.5766],
    "STS-BE": [0.2809, 0.5937, 0.487, 0.5317, 0.6066, 0.6589, 0.6941],
    # 'BERTScore-F1': [0.3219, 0.5882, 0.5754, 0.5959, 0.6237, 0.6296, 0.6124],
}

# EILEV-trained BLIP-2 OPT-2.7B
# 0, 1, 2, 4, 8, 12, 16 shots
eilev_blip2_opt_2_7b = {
    "STS-CE": [0.2151, 0.6128, 0.6099, 0.7724, 0.8081, 0.819, 0.8234],
    "STS-BE": [0.3605, 0.7313, 0.7119, 0.841, 0.8642, 0.8675, 0.8704],
    # 'BERTScore-F1': [0.4697, 0.678, 0.679, 0.6873, 0.6904, 0.6915, 0.694],
}

# EILEV Flan-T5-xl
# 0, 1, 2, 4, 8, 12, 16 shots
eilev_flan_t5_xl = {
    "STS-CE": [0.3134, 0.5512, 0.565, 0.6396, 0.7813, 0.8251, 0.8399],
    "STS-BE": [0.3829, 0.6612, 0.6767, 0.7455, 0.8512, 0.8815, 0.8916],
    # NOTE: the list below holds only 6 values for 7 shot settings, so it
    # cannot be plotted against `shots` until the missing entry is restored.
    # 'BERTScore-F1': [-0.7901, 0.6835, 0.686, 0.6995, 0.6915, 0.6922],
}

# Kosmos-2
# 0, 1, 2 shots only, so its curves stop after the third x position.
kosmos2 = {
    "STS-CE": [0.3879, 0.2909, 0.1445],
    "STS-BE": [0.4214, 0.3846, 0.2756],
    # 'BERTScore-F1': [0.555, 0.4582, 0.4502],
}

# Otter
# 0, 1, 2, 4, 8, 12, 16 shots
otter = {
    "STS-CE": [0.3787, 0.3698, 0.357, 0.3353, 0.3211, 0.3286, 0.3102],
    "STS-BE": [0.416, 0.4589, 0.4496, 0.4199, 0.3777, 0.3592, 0.3446],
    # 'BERTScore-F1': [0.3554, 0.3554, 0.355, 0.3112, 0.2854, 0.2754, 0.2293],
}

# Plotting
plt.figure(figsize=(10, 5))

# The title is raised (y=1.25) because the legend is anchored directly on top
# of the axes (see plt.legend below).
plt.title("EPIC-KITCHENS: Semantic-similarity-based Metrics", y=1.25, fontweight="bold")

# One row per model, in drawing (and legend) order:
# (legend prefix, metric -> scores, line style).
# EILEV models keep matplotlib's default solid line; baselines are dashed.
# Each model has its own marker: 12 curves are drawn, and with matplotlib's
# stock 10-entry colour cycle the last two (Otter) reuse the colours of the
# first two, so colour alone cannot identify a series.
series = [
    ("EILEV BLIP-2 OPT-2.7B", eilev_blip2_opt_2_7b, {"marker": "o"}),
    ("EILEV BLIP-2 Flan-T5-xl", eilev_flan_t5_xl, {"marker": "s"}),
    ("BLIP-2 OPT-2.7B", blip2_opt_2_7b, {"linestyle": "--", "marker": "^"}),
    ("BLIP-2 Flan-T5-xl", blip2_flan_t5_xl, {"linestyle": "--", "marker": ">"}),
    ("Kosmos-2", kosmos2, {"linestyle": "--", "marker": "v"}),
    ("Otter", otter, {"linestyle": "--", "marker": "D"}),
]

for model, scores, style in series:
    for metric, values in scores.items():
        # Pair the scores with the leading shot counts so that a model with
        # fewer recorded settings (Kosmos-2) still lines up on the x-axis.
        plt.plot(shots[: len(values)], values, label=f"{model} {metric}", **style)

plt.xlabel("Shots")
plt.ylabel("Score")
# Tick exactly at the evaluated shot counts rather than at automatic positions.
plt.xticks(shots, labels=[str(s) for s in shots])
plt.xlim(0, max(shots))
plt.legend(loc="lower center", bbox_to_anchor=(0.5, 1), ncol=3)
plt.grid(True)
plt.show()


## N-gram Metrics

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Shot counts (number of in-context examples) used as the shared x-axis.
shots = [0, 1, 2, 4, 8, 12, 16]

# Every dict below maps a metric name to its scores, listed in the same order
# as `shots`.

# BLIP-2 OPT-2.7B (baseline)
# 0, 1, 2, 4, 8, 12, 16 shots
blip2_opt_2_7b = {
    "RougeL": [0.04052, 0.7076, 0.6738, 0.7125, 0.7676, 0.6322, 0.5593],
    "BLEU": [0.001951, 0.3479, 0.4054, 0.4363, 0.5027, 0.2774, 0.1263],
}

# BLIP-2 Flan-T5-xl (baseline)
# 0, 1, 2, 4, 8, 12, 16 shots
blip2_flan_t5_xl = {
    "RougeL": [0.02271, 0.5629, 0.4685, 0.5268, 0.6235, 0.6853, 0.723],
    "BLEU": [0.0, 0.1956, 0.1762, 0.2238, 0.3021, 0.3599, 0.3958],
}

# EILEV-trained BLIP-2 OPT-2.7B
# 0, 1, 2, 4, 8, 12, 16 shots
eilev_blip2_opt_2_7b = {
    "RougeL": [0.204, 0.724, 0.7549, 0.8483, 0.8665, 0.8714, 0.8725],
    "BLEU": [0.01428, 0.2566, 0.4666, 0.6352, 0.6608, 0.6645, 0.6677],
}

# EILEV Flan-T5-xl
# 0, 1, 2, 4, 8, 12, 16 shots
eilev_flan_t5_xl = {
    "RougeL": [0.3009, 0.6629, 0.7167, 0.7788, 0.8583, 0.8813, 0.8904],
    "BLEU": [0.03337, 0.2287, 0.4043, 0.5166, 0.6673, 0.7125, 0.7325],
}

# Kosmos-2
# 0, 1, 2 shots only, so its curves stop after the third x position.
kosmos2 = {
    "RougeL": [0.4424, 0.2526, 0.1448],
    "BLEU": [0.04952, 0.007954, 0.004721],
}

# Otter
# 0, 1, 2, 4, 8, 12, 16 shots
otter = {
    "RougeL": [0.2318, 0.2513, 0.2499, 0.2164, 0.1827, 0.1731, 0.1497],
    "BLEU": [0.001755, 0.0005584, 0.0, 0.0004919, 0.0, 0.0003501, 0.000305],
}

# Plotting
plt.figure(figsize=(10, 5))

# The title is raised (y=1.25) because the legend is anchored directly on top
# of the axes (see plt.legend below).
plt.title("EPIC-KITCHENS: N-gram-based Metrics", y=1.25, fontweight="bold")

# One row per model, in drawing (and legend) order:
# (legend prefix, metric -> scores, line style).
# EILEV models keep matplotlib's default solid line; baselines are dashed.
# Each model has its own marker: 12 curves are drawn, and with matplotlib's
# stock 10-entry colour cycle the last two (Otter) reuse the colours of the
# first two, so colour alone cannot identify a series.
series = [
    ("EILEV BLIP-2 OPT-2.7B", eilev_blip2_opt_2_7b, {"marker": "o"}),
    ("EILEV BLIP-2 Flan-T5-xl", eilev_flan_t5_xl, {"marker": "s"}),
    ("BLIP-2 OPT-2.7B", blip2_opt_2_7b, {"linestyle": "--", "marker": "^"}),
    ("BLIP-2 Flan-T5-xl", blip2_flan_t5_xl, {"linestyle": "--", "marker": ">"}),
    ("Kosmos-2", kosmos2, {"linestyle": "--", "marker": "v"}),
    ("Otter", otter, {"linestyle": "--", "marker": "D"}),
]

for model, scores, style in series:
    for metric, values in scores.items():
        # Pair the scores with the leading shot counts so that a model with
        # fewer recorded settings (Kosmos-2) still lines up on the x-axis.
        plt.plot(shots[: len(values)], values, label=f"{model} {metric}", **style)

plt.xlabel("Shots")
plt.ylabel("Score")
# Tick exactly at the evaluated shot counts rather than at automatic positions.
plt.xticks(shots, labels=[str(s) for s in shots])
plt.xlim(0, max(shots))
plt.legend(loc="lower center", bbox_to_anchor=(0.5, 1), ncol=3)
plt.grid(True)
plt.show()
