In [2]:
import json
import os

def load_results(workdir: str):
    """Load the evaluation results stored in ``workdir``.

    Args:
        workdir: Directory that contains an ``eval_results.json`` file.

    Returns:
        The decoded JSON document from ``<workdir>/eval_results.json``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    results_path = os.path.join(workdir, "eval_results.json")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(results_path, "r", encoding="utf-8") as f:
        return json.load(f)

def check_failures(results):
    """Collect the task ids that did not pass the base and plus test suites.

    Only the first entry (index 0) recorded for each task under
    ``results['eval']`` is inspected.

    Args:
        results: Mapping with an ``'eval'`` key that maps each task id to a
            sequence of entries carrying ``'base_status'`` and
            ``'plus_status'``.

    Returns:
        A ``(failed_base, failed_plus)`` tuple of lists holding the task ids
        whose respective status differs from ``"pass"``, in the iteration
        order of ``results['eval']``.
    """
    per_task = results['eval']

    def _not_passing(status_key):
        # A task is a failure whenever its first entry's status is anything
        # other than the literal "pass".
        return [
            task_id
            for task_id in per_task
            if per_task[task_id][0][status_key] != "pass"
        ]

    return _not_passing('base_status'), _not_passing('plus_status')

def compare_results(dir1: str, dir2: str):
    """Print a side-by-side failure comparison of two evaluation runs.

    Loads the results of both directories via ``load_results``, determines
    the failing tasks with ``check_failures`` and prints, for HumanEval and
    HumanEval+, the failure counts and the tasks that fail in only one of
    the two runs.

    Args:
        dir1: Directory of the first run.
        dir2: Directory of the second run.

    Returns:
        None. The report is written to stdout.
    """
    results1 = load_results(dir1)
    results2 = load_results(dir2)

    failed1_base, failed1_plus = check_failures(results1)
    failed2_base, failed2_plus = check_failures(results2)

    # Use the last path component as the run label. normpath drops a trailing
    # separator first, so "runs/none/" is labelled "none" instead of "".
    name1 = os.path.basename(os.path.normpath(dir1))
    name2 = os.path.basename(os.path.normpath(dir2))

    # Convert to sets once; each is used in two difference operations below.
    base1, base2 = set(failed1_base), set(failed2_base)
    plus1, plus2 = set(failed1_plus), set(failed2_plus)

    print(f"Failures in {name1}:")
    print(f"HumanEval: {len(failed1_base)}, HumanEval+: {len(failed1_plus)}")
    print(f"\nFailures in {name2}:")
    print(f"HumanEval: {len(failed2_base)}, HumanEval+: {len(failed2_plus)}")

    # "X passes but Y fails" == tasks in Y's failure set that are not in X's.
    print(f"\nTasks where {name1} passes but {name2} fails (HumanEval):")
    print(base2 - base1)

    print(f"\nTasks where {name2} passes but {name1} fails (HumanEval):")
    print(base1 - base2)

    print(f"\nTasks where {name1} passes but {name2} fails (HumanEval+):")
    print(plus2 - plus1)

    print(f"\nTasks where {name2} passes but {name1} fails (HumanEval+):")
    print(plus1 - plus2)


# Example usage: compare the failures of two run directories. Each directory
# must contain an eval_results.json file (that is what load_results opens).
DIR1 = "Phi-3-mini-4k-instruct-output/none"
DIR2 = "Phi-3-mini-4k-instruct-output/planner"

# Prints the failure counts and the per-suite differences to stdout.
compare_results(DIR1, DIR2)

Failures in none:
HumanEval: 59, HumanEval+: 63

Failures in planner:
HumanEval: 67, HumanEval+: 76

Tasks where none passes but planner fails (HumanEval):
{'HumanEval/18', 'HumanEval/128', 'HumanEval/119', 'HumanEval/78', 'HumanEval/144', 'HumanEval/123', 'HumanEval/67', 'HumanEval/57', 'HumanEval/101', 'HumanEval/113', 'HumanEval/19', 'HumanEval/124', 'HumanEval/40', 'HumanEval/148', 'HumanEval/85', 'HumanEval/20', 'HumanEval/103', 'HumanEval/90', 'HumanEval/114', 'HumanEval/11'}

Tasks where planner passes but none fails (HumanEval):
{'HumanEval/141', 'HumanEval/55', 'HumanEval/94', 'HumanEval/142', 'HumanEval/93', 'HumanEval/127', 'HumanEval/98', 'HumanEval/37', 'HumanEval/96', 'HumanEval/156', 'HumanEval/25', 'HumanEval/153'}

Tasks where none passes but planner fails (HumanEval+):
{'HumanEval/18', 'HumanEval/44', 'HumanEval/128', 'HumanEval/119', 'HumanEval/154', 'HumanEval/144', 'HumanEval/78', 'HumanEval/123', 'HumanEval/67', 'HumanEval/57', 'HumanEval/101', 'HumanEval/113', 'H