In [1]:
import json
import os

def load_results(workdir: str):
    """Load the evaluation results stored in ``workdir``.

    Args:
        workdir: Directory that contains an ``eval_results.json`` file.

    Returns:
        The parsed JSON content of ``eval_results.json``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    results_path = os.path.join(workdir, "eval_results.json")
    # Pin the encoding so decoding does not depend on the platform's locale
    # default (which is not UTF-8 everywhere).
    with open(results_path, "r", encoding="utf-8") as f:
        # json.load parses straight from the file object.
        return json.load(f)

def check_failures(results):
    """Collect the ids of tasks that did not pass each status check.

    Only the first entry stored for each task under ``results['eval']`` is
    inspected.

    Args:
        results: Mapping with an ``'eval'`` key that maps each task id to a
            sequence of entries; the first entry must provide
            ``'base_status'`` and ``'plus_status'``.

    Returns:
        A ``(failed_humaneval, failed_humaneval_plus)`` tuple of lists: the
        task ids whose ``base_status`` is not ``"pass"`` and the task ids
        whose ``plus_status`` is not ``"pass"``, both in the iteration order
        of ``results['eval']``.
    """
    per_task = results['eval']
    # Pair every task id with the first entry recorded for it.
    first_entries = [(task_id, per_task[task_id][0]) for task_id in per_task]

    base_failures = [
        task_id for task_id, entry in first_entries
        if entry['base_status'] != "pass"
    ]
    plus_failures = [
        task_id for task_id, entry in first_entries
        if entry['plus_status'] != "pass"
    ]
    return base_failures, plus_failures

def compare_results(dir1: str, dir2: str):
    """Print a side-by-side failure comparison of two evaluation runs.

    Each directory is read with ``load_results`` and reduced to its failed
    task ids with ``check_failures``. The report prints the failure counts
    of both runs, then for HumanEval and HumanEval+ the task ids that fail
    in one run but not in the other.

    Args:
        dir1: Directory of the first run; its last path component is used
            as the run's label in the report.
        dir2: Directory of the second run, labelled the same way.
    """
    results1 = load_results(dir1)
    results2 = load_results(dir2)

    failed1_base, failed1_plus = check_failures(results1)
    failed2_base, failed2_plus = check_failures(results2)

    # normpath drops a trailing separator before basename takes the last
    # component, so "out/self/" is labelled "self" rather than "" (which a
    # plain split("/")[-1] would give); it also honours the OS separator.
    name1 = os.path.basename(os.path.normpath(dir1))
    name2 = os.path.basename(os.path.normpath(dir2))

    print(f"Failures in {name1}:")
    print(f"HumanEval: {len(failed1_base)}, HumanEval+: {len(failed1_plus)}")
    print(f"\nFailures in {name2}:")
    print(f"HumanEval: {len(failed2_base)}, HumanEval+: {len(failed2_plus)}")

    # A task "passes in A but fails in B" when it is in B's failure list
    # and absent from A's, i.e. the set difference failed_B - failed_A.
    print(f"\nTasks where {name1} passes but {name2} fails (HumanEval):")
    print(set(failed2_base) - set(failed1_base))

    print(f"\nTasks where {name2} passes but {name1} fails (HumanEval):")
    print(set(failed1_base) - set(failed2_base))

    print(f"\nTasks where {name1} passes but {name2} fails (HumanEval+):")
    print(set(failed2_plus) - set(failed1_plus))

    print(f"\nTasks where {name2} passes but {name1} fails (HumanEval+):")
    print(set(failed1_plus) - set(failed2_plus))


# Example usage: compare the failures of two runs.
# Each directory must contain the "eval_results.json" file that
# load_results reads; the last path component ("self",
# "planner_non_gran") becomes the run's label in the printed report.
DIR1 = "Phi-3-mini-4k-instruct-output/self"
DIR2 = "Phi-3-mini-4k-instruct-output/planner_non_gran"

# Prints the report to stdout; compare_results returns nothing.
compare_results(DIR1, DIR2)

Failures in self:
HumanEval: 66, HumanEval+: 72

Failures in planner_non_gran:
HumanEval: 62, HumanEval+: 74

Tasks where self passes but planner_non_gran fails (HumanEval):
{'HumanEval/101', 'HumanEval/38', 'HumanEval/100', 'HumanEval/160', 'HumanEval/40', 'HumanEval/20', 'HumanEval/1', 'HumanEval/139', 'HumanEval/57', 'HumanEval/67', 'HumanEval/6', 'HumanEval/85', 'HumanEval/103', 'HumanEval/146', 'HumanEval/76', 'HumanEval/89', 'HumanEval/11', 'HumanEval/9', 'HumanEval/161', 'HumanEval/25', 'HumanEval/75'}

Tasks where planner_non_gran passes but self fails (HumanEval):
{'HumanEval/73', 'HumanEval/55', 'HumanEval/84', 'HumanEval/54', 'HumanEval/87', 'HumanEval/64', 'HumanEval/154', 'HumanEval/39', 'HumanEval/97', 'HumanEval/111', 'HumanEval/128', 'HumanEval/94', 'HumanEval/24', 'HumanEval/37', 'HumanEval/149', 'HumanEval/10', 'HumanEval/5', 'HumanEval/49', 'HumanEval/63', 'HumanEval/17', 'HumanEval/86', 'HumanEval/156', 'HumanEval/30', 'HumanEval/79', 'HumanEval/96'}

Tasks where se