In [1]:
import os
import json
import traceback
import sys
from io import StringIO

GENERATED_DIR = "generated"
PROBLEMS_FILE = "problems.json"


def check_correctness(problem, code_file):
    """Execute the code file and run HumanEval+ tests, tracking individual test results."""
    try:
        # Create a namespace for execution
        namespace = {}
        
        # Load and execute user's code
        with open(code_file, "r") as f:
            code = f.read()
        exec(code, namespace)
        
        # Check if function exists
        entry_point = problem['entry_point']
        if entry_point not in namespace:
            return {
                "error": f"Function '{entry_point}' not found in the code",
                "tests_passed": 0,
                "tests_total": 0
            }
        
        candidate = namespace[entry_point]
        
        # Execute the test code to get the check function
        exec(problem['test'], namespace)
        
        if 'check' not in namespace:
            return {
                "error": "Test function 'check' not found",
                "tests_passed": 0,
                "tests_total": 0
            }
        
        # Now we need to count individual test cases
        # We'll modify the check function to run tests individually
        test_code = problem['test']
        
        # Parse the test code to extract inputs
        # The test code has a structure with 'inputs = [...]'
        exec_ns = {}
        exec(test_code, exec_ns)
        
        # Try to get inputs from the check function's local scope
        # We'll need to extract this more carefully
        inputs = []
        if 'inputs' in test_code:
            # Execute just the inputs part
            try:
                for line in test_code.split('\n'):
                    if line.strip().startswith('inputs = '):
                        exec(line, exec_ns)
                        inputs = exec_ns.get('inputs', [])
                        break
            except:
                pass
        
        total_tests = len(inputs) if inputs else 0
        
        # If we can't extract inputs, just run the check function normally
        if total_tests == 0:
            try:
                namespace['check'](candidate)
                # If it passes, we don't know how many tests, so estimate
                return {
                    "passed": True,
                    "error": None,
                    "tests_passed": "unknown",
                    "tests_total": "unknown"
                }
            except AssertionError as e:
                return {
                    "passed": False,
                    "error": f"Test failed: {str(e)}",
                    "tests_passed": "unknown",
                    "tests_total": "unknown"
                }
            except Exception as e:
                return {
                    "passed": False,
                    "error": f"Runtime error: {str(e)}",
                    "tests_passed": 0,
                    "tests_total": 0
                }
        
        # Run each test individually
        passed_tests = 0
        failed_tests = []
        
        for i, test_input in enumerate(inputs):
            try:
                result = candidate(*test_input)
                # We can't verify correctness without expected output
                # So we just check if it runs without error
                passed_tests += 1
            except Exception as e:
                failed_tests.append((i, test_input, str(e)))
        
        # Also run the full check to see if assertions pass
        try:
            namespace['check'](candidate)
            all_passed = True
        except AssertionError as e:
            all_passed = False
            error_msg = str(e)
        except Exception as e:
            all_passed = False
            error_msg = str(e)
        
        if all_passed:
            return {
                "passed": True,
                "error": None,
                "tests_passed": total_tests,
                "tests_total": total_tests
            }
        else:
            return {
                "passed": False,
                "error": error_msg if 'error_msg' in locals() else "Some assertions failed",
                "tests_passed": passed_tests,
                "tests_total": total_tests,
                "failed_test_samples": failed_tests[:3]
            }

    except Exception as e:
        return {
            "error": f"Execution failed: {str(e)}\n{traceback.format_exc()}",
            "tests_passed": 0,
            "tests_total": 0
        }


def main():
    with open(PROBLEMS_FILE) as f:
        problems = json.load(f)

    results = []

    for file in sorted(os.listdir(GENERATED_DIR)):
        if not file.endswith(".py"):
            continue

        file_path = os.path.join(GENERATED_DIR, file)
        parts = file[:-3].split("_")
        if len(parts) < 4:
            print(f"Skipping {file} (invalid name format)")
            continue

        # Parse: humaneval_HumanEval_X_model_strategy
        problem_id = "_".join(parts[:3])  # humaneval_HumanEval_X
        model = parts[3]
        strategy = parts[4] if len(parts) > 4 else "unknown"

        problem = next((p for p in problems if p["id"] == problem_id), None)
        if not problem:
            print(f"⚠️ Problem {problem_id} not found in problems.json")
            continue

        print(f"\n=== Running {file} ===")
        print(f"→ Problem: {problem_id}, Model: {model}, Strategy: {strategy}")

        result = check_correctness(problem, file_path)

        tests_passed = result.get('tests_passed', 0)
        tests_total = result.get('tests_total', 0)
        
        if "error" in result and result["error"]:
            print(f"❌ Failed: {result['error']}")
            if tests_total:
                print(f"   Tests: {tests_passed}/{tests_total}")
        else:
            if result.get("passed"):
                print(f"✅ All tests passed! ({tests_passed}/{tests_total})")
            else:
                print(f"⚠️ Partial pass: {tests_passed}/{tests_total} tests")

        results.append({
            "file": file,
            "problem_id": problem_id,
            "model": model,
            "strategy": strategy,
            "passed": result.get("passed", False),
            "error": result.get("error"),
            "tests_passed": tests_passed,
            "tests_total": tests_total
        })

    # Save summary
    with open("results.json", "w") as f:
        json.dump(results, f, indent=2)

    print("\n" + "="*70)
    print("Results saved to results.json")
    
    # Print summary statistics
    total_solutions = len(results)
    passed_solutions = sum(1 for r in results if r.get('passed', False))
    failed_solutions = total_solutions - passed_solutions
    
    # Calculate test statistics
    total_tests_sum = sum(r.get('tests_total', 0) for r in results if isinstance(r.get('tests_total'), int))
    passed_tests_sum = sum(r.get('tests_passed', 0) for r in results if isinstance(r.get('tests_passed'), int))
    
    if total_solutions > 0:
        print(f"\n{'='*70}")
        print(f"SUMMARY: {passed_solutions}/{total_solutions} solutions passed")
        print(f"{'='*70}")
        print(f"Solutions:")
        print(f"  ✅ Passed:  {passed_solutions:3d} ({100*passed_solutions/total_solutions:5.1f}%)")
        print(f"  ❌ Failed:  {failed_solutions:3d} ({100*failed_solutions/total_solutions:5.1f}%)")
        
        if total_tests_sum > 0:
            print(f"\nIndividual Tests:")
            print(f"  ✅ Passed:  {passed_tests_sum:4d}/{total_tests_sum:4d} ({100*passed_tests_sum/total_tests_sum:5.1f}%)")
            print(f"  ❌ Failed:  {total_tests_sum - passed_tests_sum:4d}/{total_tests_sum:4d} ({100*(total_tests_sum - passed_tests_sum)/total_tests_sum:5.1f}%)")
        
        print(f"{'='*70}")
        
        # Breakdown by model and strategy
        from collections import defaultdict
        by_model = defaultdict(lambda: {'passed': 0, 'total': 0, 'tests_passed': 0, 'tests_total': 0})
        by_strategy = defaultdict(lambda: {'passed': 0, 'total': 0, 'tests_passed': 0, 'tests_total': 0})
        
        for r in results:
            model = r.get('model', 'unknown')
            strategy = r.get('strategy', 'unknown')
            passed = r.get('passed', False)
            tests_p = r.get('tests_passed', 0) if isinstance(r.get('tests_passed'), int) else 0
            tests_t = r.get('tests_total', 0) if isinstance(r.get('tests_total'), int) else 0
            
            by_model[model]['total'] += 1
            by_strategy[strategy]['total'] += 1
            by_model[model]['tests_passed'] += tests_p
            by_model[model]['tests_total'] += tests_t
            by_strategy[strategy]['tests_passed'] += tests_p
            by_strategy[strategy]['tests_total'] += tests_t
            
            if passed:
                by_model[model]['passed'] += 1
                by_strategy[strategy]['passed'] += 1
        
        if len(by_model) > 1:
            print("\nBy Model:")
            for model, stats in sorted(by_model.items()):
                sol_pct = 100 * stats['passed'] / stats['total'] if stats['total'] > 0 else 0
                test_pct = 100 * stats['tests_passed'] / stats['tests_total'] if stats['tests_total'] > 0 else 0
                print(f"  {model:10s}: {stats['passed']:2d}/{stats['total']:2d} solutions ({sol_pct:5.1f}%) | {stats['tests_passed']:4d}/{stats['tests_total']:4d} tests ({test_pct:5.1f}%)")
        
        if len(by_strategy) > 1:
            print("\nBy Strategy:")
            for strategy, stats in sorted(by_strategy.items()):
                sol_pct = 100 * stats['passed'] / stats['total'] if stats['total'] > 0 else 0
                test_pct = 100 * stats['tests_passed'] / stats['tests_total'] if stats['tests_total'] > 0 else 0
                print(f"  {strategy:10s}: {stats['passed']:2d}/{stats['total']:2d} solutions ({sol_pct:5.1f}%) | {stats['tests_passed']:4d}/{stats['tests_total']:4d} tests ({test_pct:5.1f}%)")
        
        # Show which solutions passed
        if passed_solutions > 0:
            print(f"\n✅ Passed solutions:")
            for r in results:
                if r.get('passed', False):
                    tp = r.get('tests_passed', '?')
                    tt = r.get('tests_total', '?')
                    print(f"   - {r['file']} ({tp}/{tt} tests)")
        
        print(f"\n{'='*70}")


if __name__ == "__main__":
    main()


=== Running humaneval_HumanEval_1_Llama_3.2_1B_Instruct_cot.py ===
→ Problem: humaneval_HumanEval_1, Model: Llama, Strategy: 3.2
❌ Failed: Execution failed: invalid syntax. Perhaps you forgot a comma? (<string>, line 52)
Traceback (most recent call last):
  File "/var/folders/z4/0lpbcqpj0vg_4p2l4z8md5cc0000gn/T/ipykernel_36216/711688988.py", line 20, in check_correctness
    exec(code, namespace)
    ~~~~^^^^^^^^^^^^^^^^^
  File "<string>", line 52
    current_group = "".join(paren_string[:paren_string.index(char ill елем список результат.append(current_group) if balance > 0 else '')
                                                             ^^^^^^^^
SyntaxError: invalid syntax. Perhaps you forgot a comma?


=== Running humaneval_HumanEval_1_OLMo_2_1B_cot.py ===
→ Problem: humaneval_HumanEval_1, Model: OLMo, Strategy: 2
❌ Failed: Execution failed: invalid syntax (<string>, line 33)
Traceback (most recent call last):
  File "/var/folders/z4/0lpbcqpj0vg_4p2l4z8md5cc0000gn/T/ipykernel_