Import Libraries

In [None]:
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
from tabulate import tabulate

Metadata: there are 4 datasets, 25 base LLMs, and 18 evaluation protocols in total (15 evaluation protocols plus 3 benchmark protocols).

In [30]:
# Dataset names. Each one is used as a key into both `src_data` and
# `predictions`, and as a column of the result tables printed below.
DATASETS = [
    "llmbar_natural",
    "llmbar_adversarial",
    "mtbench",
    "instrusum",
]  # 4 datasets

# Base LLM identifiers, matched against the `base_LLM` field of each
# prediction record when building the per-LLM table.
LLMS = [
    "llama-3.1-405b",
    "llama-3.1-70b",
    "llama-3.1-8b",
    "llama-3-70b",
    "llama-3-8b",
    "llama-2-70b",
    "llama-2-13b",
    "llama-2-7b",
    "tulu-2-dpo-70b",
    "tulu-2-70b",
    "tulu-2-dpo-13b",
    "tulu-2-13b",
    "tulu-2-dpo-7b",
    "tulu-2-7b",
    "mixtral-8x7b",
    "mistral-7b-v0.3",
    "qwen-2.5-72b",
    "qwen-2-72b",
    "qwen-1.5-72b",
    "qwen-1.5-32b",
    "gemma-7b",
    "gemma-2b",
    "glm-4-9b",
    "yi-1.5-34b",
    "yi-1.5-9b",
]  # 25 llms

# Evaluation-protocol identifiers, matched against the `evaluation_protocol`
# field of each prediction record. Only these protocols are included when
# averaging the per-LLM results.
PROTOCOLS = [
    "base",
    "cot",
    "metric",
    "reference",
    "metric+reference",
    "swap&synthesize",
    "fine-grained-diff",
    "multi-role-round1",
    "multi-role-round2",
    "multi-aspect-two",
    "multi-aspect-single",
    "gpt4-reference",
    "prepair",
    "cot&consistency",
    "protocol-consistency",
]  # 15 protocols

# Additional benchmark protocols. They are appended to PROTOCOLS for the
# per-protocol table only and are excluded from the per-LLM averages.
BENCHMARK_PROTOCOLS = [
    "alpacaeval",
    "arena_hard",
    "wildbench",
]  # 3 benchmark protocols

Load datasets

In [None]:
# Both configurations live in the same Hugging Face dataset repository.
REIFE_REPO_ID = "yale-nlp/ReIFE"

# "src": source examples (instruction, two outputs, human-annotated winner).
src_data = load_dataset(REIFE_REPO_ID, "src")
# "predictions": evaluator outputs, one record per (base LLM, protocol) pair.
predictions = load_dataset(REIFE_REPO_ID, "predictions")

Inspect the data

In [32]:

# Show the field names of the first source example in the llmbar_natural split.
src_data["llmbar_natural"][0].keys()

dict_keys(['winner', 'output_1', 'instruction', 'output_2'])

In [33]:
# Show the field names of the first prediction record in the llmbar_natural split.
predictions["llmbar_natural"][0].keys()

dict_keys(['base_LLM', 'evaluation_protocol', 'predictions', 'swap_predictions'])

Perform meta-evaluation

In [34]:
def pairwise_compare(
    evaluator1_responses: list,
    evaluator2_responses: list,
) -> float:
    """
    Measure how often two pairwise evaluators pick the same winner.

    Args:
        evaluator1_responses: The winners chosen by the first evaluator.
        evaluator2_responses: The winners chosen by the second evaluator,
            aligned item-by-item with `evaluator1_responses`.

    Returns:
        float: The fraction of positions where both evaluators agree.
    """
    # Both evaluators must have judged the same number of items.
    assert len(evaluator1_responses) == len(evaluator2_responses)
    # Element-wise agreement mask; its mean is the agreement rate.
    agreement = np.array(evaluator1_responses) == np.array(evaluator2_responses)
    # `.item()` converts the NumPy scalar into a plain Python float.
    return agreement.mean().item()


def pairwise_meta_eval(
    human: list,
    model: list,
    model_swap: list | None = None,
) -> float:
    """
    Evaluate a pairwise evaluator against human annotations.

    Args:
        human: The winners chosen by the human evaluator.
        model: The winners chosen by the model evaluator, aligned item-by-item
            with `human`. Must be a sequence of labels: it is length-checked
            and compared element-wise against `human`.
        model_swap: Optional second set of model predictions (the
            `swap_predictions` field of a prediction record), aligned with
            `human`. Presumably obtained with the two outputs presented in
            swapped order -- confirm against the dataset documentation.

    Returns:
        float: The agreement rate between `human` and `model`. When
        `model_swap` is given, the mean of the agreement rates of `model`
        and `model_swap` with `human`.
    """
    acc = pairwise_compare(human, model)
    if model_swap is None:
        return acc
    # Average the two passes so that each contributes equally to the score.
    swap_acc = pairwise_compare(human, model_swap)
    return (acc + swap_acc) / 2

In [None]:
# Score every prediction record (one per base LLM / evaluation protocol pair)
# against the human annotations, dataset by dataset.
results = {}

for dataset in tqdm(DATASETS):
    human_annotations = src_data[dataset]["winner"]
    results[dataset] = [
        {
            "acc": pairwise_meta_eval(
                human_annotations,
                record["predictions"],
                record["swap_predictions"],
            ),
            "base_LLM": record["base_LLM"],
            "evaluation_protocol": record["evaluation_protocol"],
        }
        for record in tqdm(predictions[dataset])
    ]

Average performance of the 25 base LLMs across the 15 evaluation protocols

In [36]:
# Only the evaluation protocols (not the benchmark protocols) count here.
protocols = set(PROTOCOLS)
llm_results = {}

for llm in tqdm(LLMS):
    dataset_means = {}
    for dataset in DATASETS:
        # Accuracies (in percent) of this LLM on this dataset, one per protocol.
        scores = [
            result["acc"] * 100
            for result in results[dataset]
            if result["base_LLM"] == llm and result["evaluation_protocol"] in protocols
        ]
        dataset_means[dataset] = np.mean(scores)
    # Macro-average over the datasets; computed before "avg" itself is stored.
    dataset_means["avg"] = np.mean(list(dataset_means.values()))
    llm_results[llm] = dataset_means

columns = DATASETS + ["avg"]
# One row per LLM, best overall average first.
table = sorted(
    ([llm] + [llm_results[llm][column] for column in columns] for llm in LLMS),
    key=lambda row: row[-1],
    reverse=True,
)
print(tabulate(table, floatfmt=".02f", headers=["Model"] + columns))

100%|██████████| 25/25 [00:00<00:00, 4652.89it/s]

Model              llmbar_natural    llmbar_adversarial    mtbench    instrusum    avg
---------------  ----------------  --------------------  ---------  -----------  -----
llama-3.1-405b              94.13                 81.31      81.42        80.05  84.23
llama-3.1-70b               91.73                 80.23      81.02        75.30  82.07
qwen-2-72b                  91.57                 69.85      82.20        72.77  79.10
qwen-2.5-72b                89.57                 71.01      81.20        72.44  78.56
llama-3-70b                 88.20                 71.49      80.03        74.35  78.52
qwen-1.5-72b                86.10                 56.02      76.40        67.98  71.62
yi-1.5-34b                  86.53                 57.61      73.85        66.03  71.01
tulu-2-dpo-70b              84.23                 56.13      73.70        66.53  70.15
mixtral-8x7b                82.20                 54.73      73.78        66.44  69.29
tulu-2-70b                  83.60          




Average performance of the 15 evaluation protocols + 3 benchmark protocols across the 25 base LLMs

In [37]:
# Evaluation protocols and benchmark protocols are both reported here.
protocols = PROTOCOLS + BENCHMARK_PROTOCOLS
protocol_results = {}

for protocol in tqdm(protocols):
    dataset_means = {}
    for dataset in DATASETS:
        # Accuracies (in percent) of this protocol on this dataset, one per record.
        scores = [
            result["acc"] * 100
            for result in results[dataset]
            if result["evaluation_protocol"] == protocol
        ]
        dataset_means[dataset] = np.mean(scores)
    # Macro-average over the datasets; computed before "avg" itself is stored.
    dataset_means["avg"] = np.mean(list(dataset_means.values()))
    protocol_results[protocol] = dataset_means

columns = DATASETS + ["avg"]
# One row per protocol, best overall average first.
table = sorted(
    (
        [protocol] + [protocol_results[protocol][column] for column in columns]
        for protocol in protocols
    ),
    key=lambda row: row[-1],
    reverse=True,
)
print(tabulate(table, floatfmt=".02f", headers=["Protocol"] + columns))

100%|██████████| 18/18 [00:00<00:00, 5247.62it/s]

Protocol                llmbar_natural    llmbar_adversarial    mtbench    instrusum    avg
--------------------  ----------------  --------------------  ---------  -----------  -----
prepair                          76.44                 61.77      69.72        63.82  67.94
gpt4-reference                   76.66                 57.97      70.12        65.99  67.68
metric+reference                 76.64                 58.25      69.98        65.62  67.62
protocol-consistency             76.34                 55.94      70.89        66.09  67.31
metric                           75.84                 56.22      70.74        65.67  67.12
reference                        76.16                 57.47      69.44        65.22  67.07
swap&synthesize                  75.62                 54.38      70.82        66.22  66.76
cot&consistency                  74.94                 54.08      70.55        65.38  66.24
base                             74.72                 53.53      70.67        6


