In [2]:
from datasets import load_dataset

# Load the "hackercupai/hackercup" dataset with `datasets.load_dataset`.
ds = load_dataset("hackercupai/hackercup")
# Bare expression: the notebook displays the loaded object as the cell output.
ds

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    sample: Dataset({
        features: ['name', 'year', 'round', 'statement', 'input', 'solution', 'code', 'output', 'sample_input', 'sample_output', 'images'],
        num_rows: 10
    })
    full: Dataset({
        features: ['name', 'year', 'round', 'statement', 'input', 'solution', 'code', 'output', 'sample_input', 'sample_output', 'images'],
        num_rows: 284
    })
})

In [6]:
# Show the `name` column of the `sample` split.
ds['sample']['name']

['meta_game',
 'tower_rush',
 'ready_go_part_2',
 'wiki_race',
 'ready_go_part_1',
 'dim_sum_delivery',
 'cheeseburger_corollary_ch1',
 'two_apples_a_day',
 'cheeseburger_corollary_ch2',
 'road_to_nutella']

In [30]:
import contextlib
import io
import re
import sys
import threading
import traceback
from unittest.mock import patch

def check_code_structure(extracted_code):
    """Check that the code contains an ``if __name__ == '__main__':`` guard.

    The guard is recognised with either single or double quotes around
    ``__main__`` and with any amount of whitespace around ``==``.

    NOTE: a second function with this same name is defined later in this
    cell; that later definition is the one in effect at runtime.

    Args:
        extracted_code: Source code to inspect, as a string.

    Returns:
        ``(True, None)`` when the guard is present, otherwise
        ``(False, <error message>)``.
    """
    # The back-reference \1 forces the closing quote to match the opening one.
    if re.search(r"""__name__\s*==\s*(['"])__main__\1""", extracted_code) is None:
        return False, "Missing `if __name__ == '__main__':` block."
    return True, None

class TimeoutException(Exception):
    """Signals that a call did not complete within its allotted time."""

def run_with_timeout(func, args, timeout):
    """Run ``func(*args)`` in a worker thread and wait at most ``timeout`` seconds.

    Args:
        func: Callable to execute.
        args: Iterable of positional arguments passed to ``func``.
        timeout: Maximum time in seconds to wait for the worker thread.

    Returns:
        Whatever ``func(*args)`` returns.

    Raises:
        TimeoutException: If the worker thread is still running after
            ``timeout`` seconds. The thread is not stopped; it keeps running
            in the background.
        Exception: Any ``Exception`` raised by ``func`` is re-raised in the
            calling thread.
    """
    # Single-element lists let the nested worker hand values back to us.
    result = [None]
    exception = [None]

    def worker():
        try:
            result[0] = func(*args)
        except Exception as e:
            exception[0] = e

    # daemon=True: a worker that never finishes (e.g. an infinite loop) must
    # not keep the interpreter alive once the main thread is done.
    thread = threading.Thread(target=worker, daemon=True)
    thread.start()
    thread.join(timeout)
    if thread.is_alive():
        raise TimeoutException(
            "The previous code execution timed out. This may indicate a performance issue, such as an infinite loop or inefficient logic. "
            "The input provided was sufficiently small and valid, so the problem is likely due to a flaw in the code logic rather than the input itself. "
            "Please review the code for potential errors or inefficiencies that could cause it to run indefinitely or take an excessive amount of time."
        )
    # Compare against None explicitly so the check does not depend on the
    # truthiness of the captured exception object.
    if exception[0] is not None:
        raise exception[0]
    return result[0]

def check_code_structure(extracted_code):
    """Check that the code contains an ``if __name__ == '__main__':`` guard.

    The guard is recognised with either single or double quotes around
    ``__main__`` and with any amount of whitespace around ``==``.

    NOTE: this redefines a function of the same name that appears earlier in
    this cell; this later definition is the one in effect at runtime.

    Args:
        extracted_code: Source code to inspect, as a string.

    Returns:
        ``(True, None)`` when the guard is present, otherwise
        ``(False, <error message>)``.
    """
    # The back-reference \1 forces the closing quote to match the opening one.
    if re.search(r"""__name__\s*==\s*(['"])__main__\1""", extracted_code) is None:
        return False, "Missing `if __name__ == '__main__':` block."
    return True, None

def run_extracted_code_with_timeout(extracted_code, test_input, timeout=20):
    """Execute ``extracted_code`` as ``__main__`` against ``test_input``.

    The code is compiled and ``exec``-ed in a worker thread. While it runs,
    ``builtins.input`` and ``sys.stdin`` both read from the same in-memory
    copy of the test input, and ``sys.stdout`` is captured.

    WARNING: ``exec`` runs the code in-process with no sandboxing; only use
    this on code you are willing to run with the notebook's privileges.

    NOTE(review): on timeout the worker thread is not stopped, so the
    stdin/stdout/input patches stay active until that thread finishes.

    Args:
        extracted_code: Python source to run. Must pass
            ``check_code_structure``.
        test_input: Text fed to the program. Each line is stripped and blank
            lines are dropped before use.
        timeout: Seconds to wait before giving up (default 20).

    Returns:
        ``(output, None)`` on success, where ``output`` is the captured
        stdout text, or ``(None, error_message)`` when the structure check
        fails, the code raises, the run times out, or nothing is printed.
    """
    is_valid, error_message = check_code_structure(extracted_code)
    if not is_valid:
        return None, error_message

    source_name = '<string>'  # filename recorded in frames of the evaluated code
    code_lines = extracted_code.split('\n')
    output = io.StringIO()
    error = None
    test_input_lines = [line.strip() for line in test_input.strip().split('\n') if line.strip()]
    # One shared stream backs both input() and sys.stdin so programs that mix
    # the two styles consume the lines in order.
    fake_stdin = io.StringIO(''.join(f"{line}\n" for line in test_input_lines))

    def mock_input(prompt=''):
        # Accept (and discard) the optional prompt so `input("...")` works
        # without writing the prompt into the captured output.
        line = fake_stdin.readline()
        if not line:
            raise ValueError("No input data provided")
        return line.rstrip('\n')

    def describe(exc, line_no):
        # Build the error text; only index into the source when the line
        # number really refers to a line of the evaluated code.
        in_range = isinstance(line_no, int) and 1 <= line_no <= len(code_lines)
        error_line = code_lines[line_no - 1] if in_range else "Unknown"
        shown_line = line_no if line_no is not None else "Unknown"
        return f"Error on line {shown_line}: {error_line.strip()}\nException: {type(exc).__name__}: {str(exc)}"

    def execute_code():
        nonlocal error
        # Compile separately: a compile-time failure has no frame inside the
        # evaluated code, so its line number must come from the exception.
        try:
            code_obj = compile(extracted_code, source_name, 'exec')
        except Exception as exc:
            error = describe(exc, getattr(exc, 'lineno', None))
            return

        try:
            with patch('builtins.input', mock_input), \
                    patch('sys.stdin', fake_stdin), \
                    contextlib.redirect_stdout(output):
                exec(code_obj, {'__name__': '__main__'})
        except Exception as exc:
            frames = traceback.extract_tb(exc.__traceback__)
            # Report the innermost frame that belongs to the evaluated code,
            # not a helper or library frame further down the stack.
            user_lines = [frame.lineno for frame in frames if frame.filename == source_name]
            if user_lines:
                line_no = user_lines[-1]
            elif frames:
                line_no = frames[-1].lineno
            else:
                line_no = None
            error = describe(exc, line_no)

    try:
        run_with_timeout(execute_code, (), timeout)
    except TimeoutException as e:
        return None, f"Error: {str(e)}"

    if error:
        return None, error

    result = output.getvalue()
    if not result.strip():
        return None, "Error: Code did not produce any output."

    return result, None

# Compare the output generated by the code with the expected output
def compare_with_expected_output(generated_output, expected_output):
    """Compare generated output with expected output, line by line.

    Lines are compared after stripping surrounding whitespace. Lines that are
    missing from the generated output, and surplus lines beyond the expected
    ones, are reported as failures instead of being silently ignored.

    Args:
        generated_output: Text produced by the code under test, or ``None``.
        expected_output: Reference text, or ``None``.

    Returns:
        ``(score, failed_cases)`` where ``score`` is the percentage of
        expected lines that matched (0 when there are no expected lines) and
        ``failed_cases`` is a list of human-readable mismatch descriptions.
        Surplus generated lines are listed in ``failed_cases`` but do not
        change ``score``.
    """
    # Check if either output is None
    if generated_output is None:
        return 0, ["Generated output is None"]
    if expected_output is None:
        return 0, ["Expected output is None"]

    generated_output_lines = generated_output.strip().splitlines()
    expected_output_lines = expected_output.strip().splitlines()

    total_cases = len(expected_output_lines)
    matching_cases = 0
    failed_cases = []

    # Walk the longer of the two so that neither side is truncated.
    for index in range(max(len(generated_output_lines), total_cases)):
        case_no = index + 1
        if index >= len(generated_output_lines):
            expected_line = expected_output_lines[index]
            failed_cases.append(f"Test Case #{case_no}: Expected '{expected_line}' but got no output")
        elif index >= total_cases:
            generated_line = generated_output_lines[index]
            failed_cases.append(f"Test Case #{case_no}: Unexpected extra output '{generated_line}'")
        else:
            generated_line = generated_output_lines[index]
            expected_line = expected_output_lines[index]
            if generated_line.strip() == expected_line.strip():
                matching_cases += 1
            else:
                failed_cases.append(f"Test Case #{case_no}: Expected '{expected_line}' but got '{generated_line}'")

    score = (matching_cases / total_cases) * 100 if total_cases > 0 else 0
    return score, failed_cases

# Evaluate generated code on test cases
def evaluate_generated_code_on_test_cases(extracted_code, test_input, test_output):
    """Run the code on ``test_input`` and grade its output against ``test_output``.

    Returns:
        A 4-tuple ``(score, error, generated_output, failed_cases)``.
        ``error`` is ``None`` only when no failure was reported.
    """
    generated_output, error = run_extracted_code_with_timeout(extracted_code, test_input)

    # Nothing usable was printed: report the runner's error, or a generic hint.
    produced_nothing = generated_output is None or generated_output.strip() == ""
    if produced_nothing:
        fallback = "Error: The code ran without any problem. There's no error in parsing the input. But it produces no output. Please check the entire code again."
        return 0, (error if error else fallback), generated_output, []

    # Output exists but the runner still flagged an error.
    if error:
        return 0, error, generated_output, []

    # Grade the captured output line by line.
    score, failed_cases = compare_with_expected_output(generated_output, test_output)
    if not failed_cases:
        return score, error, generated_output, failed_cases

    return score, f"Test cases failed: {failed_cases}", generated_output, failed_cases

In [33]:
# Solution source under test, held as a single string. Its first statement is
# `import a`; the recorded output of this cell shows that import failing with
# ModuleNotFoundError.
extracted_code = "import a\n\ndef get_slope_intercept(a, b):\n    dx, dy = b[0] - a[0], b[1] - a[1]\n    if dx == 0:\n        return (math.inf, a[0])  # Vertical line\n    if dy == 0:\n        return (0, a[1])  # Horizontal line\n    slope = dy / dx\n    intercept = a[1] - slope * a[0]\n    return (slope, intercept)\n\ndef count_ants_on_line(ants, slope, intercept):\n    return sum(1 for ant in ants if abs(ant[1] - (slope * ant[0] + intercept)) < 1e-9)\n\ndef calculate_min_moves_to_align(ants):\n    lines = {}\n    for i in range(len(ants)):\n        for j in range(i + 1, len(ants)):\n            slope, intercept = get_slope_intercept(ants[i], ants[j])\n            if (slope, intercept) not in lines:\n                lines[(slope, intercept)] = []\n            lines[(slope, intercept)].append((ants[i], ants[j]))\n    max_aligned = 0\n    for line in lines.values():\n        unique_ants = set()\n        for pair in line:\n            unique_ants.update(pair)\n        max_aligned = max(max_aligned, len(unique_ants))\n    return len(ants) - max_aligned\n\nif __name__ == '__main__':\n    results = []\n    T = int(input())\n    for _ in range(T):\n        N = int(input())\n        ants = [tuple(map(int, input().split())) for _ in range(N)]\n        min_moves = calculate_min_moves_to_align(ants)\n        results.append(f'Case #{_ + 1}: {min_moves}')\n    print('\\n'.join(results))"

# Test input fed to the program: the first line is the number of cases (3);
# each case gives a count N followed by N coordinate pairs.
inputs = """
3
7
4 8
2 4
7 2
6 10
0 1
3 4
4 7
4
1 1
-1 1
1 -1
-1 -1
4
1 1
2 2
-3 -3
4 4
"""
# Expected output, one "Case #k: value" line per test case.
output = """
Case #1: 3
Case #2: 2
Case #3: 0
"""

# Run the solution and grade it; the resulting tuple is the cell output.
evaluate_generated_code_on_test_cases(extracted_code,inputs,output)

(0,
 "Error on line 1: import a\nException: ModuleNotFoundError: No module named 'a'",
 None,
 [])