In [1]:
import json
import os

def load_results(workdir: str):
    """Load and parse ``eval_results.json`` from *workdir*.

    Args:
        workdir: Directory that contains an ``eval_results.json`` file.

    Returns:
        The deserialized JSON document, exactly as ``json.load`` produces it.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    results_path = os.path.join(workdir, "eval_results.json")
    # JSON text is UTF-8 by specification; pin the encoding so decoding does
    # not depend on the platform's locale default.
    with open(results_path, "r", encoding="utf-8") as f:
        return json.load(f)

def check_failures(results):
    """Collect the task ids whose first recorded sample did not pass.

    Only the first entry (index 0) stored under each task id in
    ``results['eval']`` is inspected. A status counts as a failure whenever
    it is anything other than the exact string ``"pass"``.

    Args:
        results: Mapping with an ``'eval'`` entry that maps each task id to a
            sequence of dicts carrying ``'base_status'`` and ``'plus_status'``.

    Returns:
        A ``(base_failures, plus_failures)`` tuple of task-id lists, each in
        the iteration order of ``results['eval']``.
    """
    base_failures = []
    plus_failures = []
    # Pair every status field with the list that records its failures so one
    # pass handles both; base is checked before plus for each task.
    buckets = (
        ('base_status', base_failures),
        ('plus_status', plus_failures),
    )

    per_task = results['eval']
    for task_id in per_task:
        for status_key, bucket in buckets:
            if per_task[task_id][0][status_key] != "pass":
                bucket.append(task_id)

    return base_failures, plus_failures

def compare_results(dir1: str, dir2: str):
    """Print a side-by-side failure comparison of two evaluation runs.

    Loads ``eval_results.json`` from each directory via ``load_results``,
    derives the failing task ids with ``check_failures``, then prints the
    failure counts for each run followed by the tasks that fail in only one
    of the two runs, first for HumanEval and then for HumanEval+.

    A task is reported as "passes" for a run whenever it is absent from that
    run's failure list, which includes tasks missing from that run's results.

    Args:
        dir1: Directory of the first run; its last path component is the
            label used in the printed report.
        dir2: Directory of the second run, labelled the same way.

    Returns:
        None. The report is written to standard output.
    """
    results1 = load_results(dir1)
    results2 = load_results(dir2)

    failed1_base, failed1_plus = check_failures(results1)
    failed2_base, failed2_plus = check_failures(results2)

    # normpath drops a trailing separator, so "runs/gold_plan/" is labelled
    # "gold_plan" instead of an empty string; basename also honours the
    # platform's native separator.
    name1 = os.path.basename(os.path.normpath(dir1))
    name2 = os.path.basename(os.path.normpath(dir2))

    print(f"Failures in {name1}:")
    print(f"HumanEval: {len(failed1_base)}, HumanEval+: {len(failed1_plus)}")
    print(f"\nFailures in {name2}:")
    print(f"HumanEval: {len(failed2_base)}, HumanEval+: {len(failed2_plus)}")

    comparisons = (
        ("HumanEval", set(failed1_base), set(failed2_base)),
        ("HumanEval+", set(failed1_plus), set(failed2_plus)),
    )
    for benchmark, failed1, failed2 in comparisons:
        # Failing only in run 2 means run 1 passed it, and vice versa.
        print(f"\nTasks where {name1} passes but {name2} fails ({benchmark}):")
        print(failed2 - failed1)

        print(f"\nTasks where {name2} passes but {name1} fails ({benchmark}):")
        print(failed1 - failed2)


# Example usage: compare the eval_results.json files stored in two output
# directories. Both paths are relative, so they resolve against the current
# working directory.
DIR1 = "Phi-3-mini-4k-instruct-output/gold_plan"
DIR2 = "Phi-3-mini-4k-instruct-output/llama_gold"

compare_results(DIR1, DIR2)

Failures in gold_plan:
HumanEval: 18, HumanEval+: 31

Failures in llama_gold:
HumanEval: 20, HumanEval+: 35

Tasks where gold_plan passes but llama_gold fails (HumanEval):
{'HumanEval/10', 'HumanEval/137', 'HumanEval/130', 'HumanEval/126', 'HumanEval/129', 'HumanEval/116', 'HumanEval/77', 'HumanEval/87', 'HumanEval/128', 'HumanEval/119'}

Tasks where llama_gold passes but gold_plan fails (HumanEval):
{'HumanEval/80', 'HumanEval/37', 'HumanEval/100', 'HumanEval/113', 'HumanEval/109', 'HumanEval/38', 'HumanEval/75', 'HumanEval/62'}

Tasks where gold_plan passes but llama_gold fails (HumanEval+):
{'HumanEval/10', 'HumanEval/137', 'HumanEval/130', 'HumanEval/126', 'HumanEval/129', 'HumanEval/25', 'HumanEval/77', 'HumanEval/97', 'HumanEval/87', 'HumanEval/128', 'HumanEval/119', 'HumanEval/86', 'HumanEval/154'}

Tasks where llama_gold passes but gold_plan fails (HumanEval+):
{'HumanEval/80', 'HumanEval/37', 'HumanEval/72', 'HumanEval/100', 'HumanEval/113', 'HumanEval/109', 'HumanEval/38', 'H