# Held-out Verb/Noun Pair Evaluation

## Semantic Similarity Metrics

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Numbers of shots evaluated; every score list below has one value per entry,
# in this order.
shots = [0, 1, 2, 4, 8, 12, 16]

# Each dictionary maps a metric name to its per-shot scores for one model.
# The BERTScore-F1 rows are left commented out, so they are not part of the
# dictionaries.

# EILEV-trained BLIP-2 OPT-2.7B
eilev_blip2_opt_2_7b = {
    # shots:   0       1       2       4       8       12      16
    "STS-CE": [0.222,  0.5117, 0.51,   0.5939, 0.6602, 0.6775, 0.6814],
    "STS-BE": [0.3672, 0.5864, 0.5978, 0.6727, 0.7311, 0.7466, 0.7512],
    # 'BERTScore-F1': [0.5451, 0.6229, 0.6323, 0.639, 0.6444, 0.6449, 0.6448],
}

# EILEV Flan-T5-xl
eilev_flan_t5_xl = {
    # shots:   0       1       2       4       8       12      16
    "STS-CE": [0.3368, 0.5243, 0.5319, 0.5983, 0.6553, 0.6794, 0.6889],
    "STS-BE": [0.4282, 0.595,  0.6037, 0.6606, 0.726,  0.7484, 0.7569],
    # 'BERTScore-F1': [0.5189, 0.6147, 0.6184, 0.6258, 0.6347, 0.6396, 0.642],
}

# BLIP-2 OPT-2.7B (baseline)
blip2_opt_2_7b = {
    # shots:   0       1       2       4       8       12      16
    "STS-CE": [0.1421, 0.3897, 0.4104, 0.432,  0.4536, 0.4386, 0.3707],
    "STS-BE": [0.3071, 0.4558, 0.507,  0.5324, 0.5551, 0.542,  0.475],
    # 'BERTScore-F1': [-1.763, -4.801, -0.3869, 0.3916, 0.5844, 0.5066, -0.6429],
}

# BLIP-2 Flan-T5-xl (baseline)
blip2_flan_t5_xl = {
    # shots:   0       1       2       4       8       12      16
    "STS-CE": [0.1224, 0.4742, 0.4243, 0.4405, 0.4649, 0.4819, 0.4915],
    "STS-BE": [0.284,  0.5453, 0.4864, 0.4999, 0.5323, 0.5552, 0.5683],
    # 'BERTScore-F1': [0.3838, 0.5617, 0.5734, 0.5749, 0.5726, 0.5693, 0.5662],
}

# Plotting: one line per (model, metric) pair, score against number of shots.
plt.figure(figsize=(10, 5))

plt.title(
    "Held-out Verb/Noun Pair Eval: Semantic-similarity-based Metrics",
    y=1.25,  # lifted above the axes; the legend is anchored at the axes' top edge
    fontweight="bold",
)

# (scores, legend prefix, extra plot keywords) in drawing order. The EILEV
# models keep the default line style; the baselines are drawn dashed.
series = (
    (eilev_blip2_opt_2_7b, "EILEV BLIP-2 OPT-2.7B", {"marker": "o"}),
    (eilev_flan_t5_xl, "EILEV BLIP-2 Flan-T5-xl", {"marker": "s"}),
    (blip2_opt_2_7b, "BLIP-2 OPT-2.7B", {"linestyle": "--", "marker": "^"}),
    (blip2_flan_t5_xl, "BLIP-2 Flan-T5-xl", {"linestyle": "--", "marker": ">"}),
)
for scores, model_name, line_style in series:
    for metric, values in scores.items():
        plt.plot(shots, values, label=f"{model_name} {metric}", **line_style)

plt.xlabel("Shots")
plt.ylabel("Score")
# Tick exactly at the evaluated shot counts rather than at automatic positions.
plt.xticks(shots, labels=list(map(str, shots)))
plt.xlim(0, max(shots))
# Lower-centre of the legend sits on the top edge of the axes, two columns wide.
plt.legend(loc="lower center", bbox_to_anchor=(0.5, 1), ncol=2)
plt.grid(True)
plt.show()

## N-gram Metrics

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Numbers of shots evaluated; every score list below has one value per entry,
# in this order.
shots = [0, 1, 2, 4, 8, 12, 16]

# Each dictionary maps a metric name to its per-shot scores for one model.

# EILEV-trained BLIP-2 OPT-2.7B
eilev_blip2_opt_2_7b = {
    # shots:   0        1       2       4       8       12      16
    "RougeL": [0.1916,  0.5245, 0.5601, 0.6069, 0.6424, 0.6547, 0.6606],
    "BLEU":   [0.01493, 0.1183, 0.2028, 0.2526, 0.2888, 0.3037, 0.3145],
}

# EILEV Flan-T5-xl
eilev_flan_t5_xl = {
    # shots:   0        1       2       4       8       12      16
    "RougeL": [0.3103,  0.5236, 0.5448, 0.592,  0.644,  0.6605, 0.6653],
    "BLEU":   [0.05684, 0.1503, 0.1947, 0.258,  0.3148, 0.3278, 0.3266],
}

# BLIP-2 OPT-2.7B (baseline)
blip2_opt_2_7b = {
    # shots:   0         1       2       4       8       12      16
    "RougeL": [0.1011,   0.3885, 0.4722, 0.5006, 0.5223, 0.4982, 0.4214],
    "BLEU":   [0.002703, 0.0845, 0.1575, 0.1663, 0.1752, 0.1583, 0.109],
}

# BLIP-2 Flan-T5-xl (baseline)
blip2_flan_t5_xl = {
    # shots:   0          1       2       4       8       12      16
    "RougeL": [0.05918,   0.4672, 0.4285, 0.439,  0.4708, 0.492,  0.5064],
    "BLEU":   [0.0006598, 0.106,  0.1092, 0.1142, 0.1505, 0.1752, 0.1923],
}

# Plotting: one line per (model, metric) pair, score against number of shots.
plt.figure(figsize=(10, 5))

plt.title(
    "Held-out Verb/Noun Pair Eval: N-gram-based Metrics",
    y=1.25,  # lifted above the axes; the legend is anchored at the axes' top edge
    fontweight="bold",
)

# (scores, legend prefix, extra plot keywords) in drawing order. The EILEV
# models keep the default line style; the baselines are drawn dashed.
series = (
    (eilev_blip2_opt_2_7b, "EILEV BLIP-2 OPT-2.7B", {"marker": "o"}),
    (eilev_flan_t5_xl, "EILEV BLIP-2 Flan-T5-xl", {"marker": "s"}),
    (blip2_opt_2_7b, "BLIP-2 OPT-2.7B", {"linestyle": "--", "marker": "^"}),
    (blip2_flan_t5_xl, "BLIP-2 Flan-T5-xl", {"linestyle": "--", "marker": ">"}),
)
for scores, model_name, line_style in series:
    for metric, values in scores.items():
        plt.plot(shots, values, label=f"{model_name} {metric}", **line_style)

plt.xlabel("Shots")
plt.ylabel("Score")
# Tick exactly at the evaluated shot counts rather than at automatic positions.
plt.xticks(shots, labels=list(map(str, shots)))
plt.xlim(0, max(shots))
# Lower-centre of the legend sits on the top edge of the axes, three columns wide.
plt.legend(loc="lower center", bbox_to_anchor=(0.5, 1), ncol=3)
plt.grid(True)
plt.show()