In [33]:
import datasets
import re
import json
from tqdm import tqdm
import os
import requests
import random
from random import Random

In [2]:
# Load the "hkust-nlp/CodeIO-PyEdu-Reasoning" dataset and keep only its "train" split.
dataset = datasets.load_dataset(
    "hkust-nlp/CodeIO-PyEdu-Reasoning"
)["train"]

## Extract the relevant parts of the prompt

In [6]:
# Regex that splits a prompt into the three parts kept for later use:
#   group 1 - the task description,
#   group 2 - the input/output specification,
#   group 3 - the reference code snippet (everything after the "Tip: ..." sentence).
# The text between "Given the following" and the tip sentence is matched but not captured.
pattern = re.compile(
    r'(?s)'  # DOTALL so . matches newlines
    r'You are given a question that requires some input and output variables as follows:\s*(.*?)'
    r'\s*The input and output requirements are as follows:\s*(.*?)'
    r'\s*Given the following.*?Tip: Here is a reference code snippet for this question\. '
    r'You can refer to this code to guide your reasoning but not copy spans of code directly\.\s*(.*)'
)

# Keys of every (description, spec, code) triple already written. Only hash-derived
# keys are stored, so the full texts do not have to be kept in memory.
seen = set()
duplicate = 0

# open() does not create missing parent directories, so make sure "data/" exists.
os.makedirs("data", exist_ok=True)

# Plain "w": the file is only written here (it is truncated on every run).
with open("data/codeio-pyedu-extracted.jsonl", "w") as f:
    for i, item in tqdm(enumerate(dataset), total=len(dataset)):
        match = pattern.search(item["prompt"])
        if match is None:
            # tqdm.write keeps the message from being interleaved with the progress bar.
            tqdm.write(f"No match found for item {i}")
            continue

        # Extract relevant info
        task_description, input_output_spec, code_sample = (
            part.strip() for part in match.groups()
        )

        # Skip entries whose three parts were all seen before
        hash_entry = f"{hash(task_description)}-{hash(input_output_spec)}-{hash(code_sample)}"
        if hash_entry in seen:
            duplicate += 1
            continue
        seen.add(hash_entry)

        # Save to disk as one JSON object per line (JSONL)
        record = {
            "task_description": task_description,
            "input_output_spec": input_output_spec,
            "code_sample": code_sample,
        }
        f.write(json.dumps(record) + "\n")

print(f"There were {duplicate} out of {len(dataset)} duplicate entries")

100%|██████████| 1630607/1630607 [01:20<00:00, 20302.13it/s]

There were 1489543 out of 1630607 duplicate entries





## Create input generators for each problem separately

In [26]:
# System message for the chat-completion request. It instructs the model to reply
# with a `generate_input(rng: Random) -> dict` function wrapped in
# <function>...</function> tags; the reply is later parsed by searching for
# exactly those tags.
SYSTEM_PROMPT = """You are a helpful assistant that generates valid Python functions that act as input generators for a given code snippet.

You have access to `random.Random`, therefore you SHOULD NOT import it again. You should use this random number generator to make the input generation process stochastic on each call.

When the user asks you to generate an input for a code snippet, you should strictly respond in the following format:
<function>
def generate_input(rng: Random) -> dict:
    # Your code here
    pass
</function>

The output of the function should be a dictionary where the keys are the variable names and the values are the generated values.

It must contain all the variables that listed in the user's input specification, or more precisely in the `main_solution` function signature. 
"""

# User message template. The placeholders {task_description}, {input_output_spec}
# and {code_sample} are filled via str.format(**entry), so they must match the
# keys of the records stored in the extracted JSONL file.
USER_PROMPT = """Following are a task description, input/output specification, and relevant code snippet for a Python programming task.

<task_description>
{task_description}
</task_description>

<input_output_spec>
{input_output_spec}
</input_output_spec>

<code_sample>
{code_sample}
</code_sample>

Your task is to write a Python function `generate_input(rng: Random) -> dict` that generates valid inputs for the given code snippet, based on the provided information.
"""

# Ask the model for an input generator for the first extracted entry.
# After this cell: `entry` is the parsed JSONL record, `full_response` the raw
# model reply and `input_generator` the source code found between the
# <function>...</function> tags of that reply.
with open("data/codeio-pyedu-extracted.jsonl", "r") as f:
    for i in range(1):
        entry = json.loads(f.readline())
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                # os.environ[...] raises a KeyError naming the variable when the key
                # is missing, instead of silently sending "Bearer None".
                "Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}",
            },
            # `json=` serialises the payload and sets the JSON Content-Type header.
            json={
                "model": "deepseek/deepseek-chat",
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": USER_PROMPT.format(**entry)},
                ],
            },
            # requests never times out by default; bound the wait so the cell cannot hang forever.
            timeout=120,
        )
        # Surface HTTP errors (bad key, rate limit, ...) here rather than as a
        # confusing KeyError on "choices" below.
        response.raise_for_status()
        full_response = response.json()["choices"][0]["message"]["content"]

        function_match = re.search(r"<function>(.*?)</function>", full_response, re.DOTALL)
        if function_match is None:
            raise ValueError(
                "Model response does not contain a <function>...</function> block:\n"
                f"{full_response}"
            )
        input_generator = function_match.group(1).strip()

In [41]:
# Display the three stored fields of the entry, one per section, with a dashed
# rule between consecutive sections.
print(
    entry["task_description"],
    "----------------",
    entry["input_output_spec"],
    "----------------",
    entry["code_sample"],
    sep="\n",
)

In the context of Conway's Game of Life, a cellular automaton devised by John Horton Conway, consider a board with `m` by `n` cells, where each cell can be either live (1) or dead (0). The state of each cell evolves based on its neighbors according to specific rules. Given the current state of the board, what will be the state of the board after one iteration of the game?
----------------
Input:
  `board` (List[List[int]]): A 2D list representing the state of the board. Each element in the list is either `0` (dead cell) or `1` (live cell).

Output:
  `return` (List[List[int]]): A 2D list representing the next state of the board after applying the rules of Conway's Game of Life. Each element in the list is either `0` (dead cell) or `1` (live cell).
----------------
# import necessary packages
from collections import Counter

# all class and function definitions in the code file, if any
class Solution(object):
    def gameOfLifeInfinite(self, live):
        ctr = Counter((I, J)
         

In [27]:
# Show the generator source code that was extracted from the model's
# <function>...</function> reply, so it can be inspected before it is executed.
print(input_generator)

def generate_input(rng: Random) -> dict:
    # Generate random dimensions for the board
    m = rng.randint(1, 10)  # Number of rows
    n = rng.randint(1, 10)  # Number of columns
    
    # Generate the board with random 0s and 1s
    board = [[rng.choice([0, 1]) for _ in range(n)] for _ in range(m)]
    
    return {'board': board}


In [39]:
# SECURITY: `input_generator` is source code written by an LLM, and exec() runs it
# with the full privileges of this kernel. Review the printed code first and only
# execute it in a disposable / sandboxed environment.
#
# The code is executed in ONE dedicated namespace (used as both globals and locals):
#   * helpers or imports the generated code defines at top level stay visible to
#     `generate_input` when it is called later (with separate globals/locals
#     dicts they would be stored in the locals dict and not be found);
#   * nothing the generated code defines leaks into the notebook's own globals.
# `Random` is pre-populated because the system prompt tells the model it is already
# available; it is needed for the `rng: Random` annotation in the signature.
local_dict = {"Random": Random, "random": random}
exec(input_generator, local_dict)

generate_input_func = local_dict.get("generate_input")
if not callable(generate_input_func):
    raise ValueError("Generated code does not define a callable `generate_input`")

# Fixed seed so the sampled inputs are the same on every run of the notebook.
rng = random.Random(42)

# Draw a few samples to sanity-check the generator.
for i in range(5):
    random_input = generate_input_func(rng)
    print(f"[{i}]: {random_input}")

[0]: {'board': [[1, 0], [1, 1], [1, 0], [1, 0], [0, 0], [1, 1], [0, 0], [1, 0], [1, 1]]}
[1]: {'board': [[1, 1, 1, 0], [0, 1, 0, 1], [0, 1, 1, 0], [0, 0, 0, 0]]}
[2]: {'board': [[0]]}
[3]: {'board': [[0, 0, 1, 1, 1, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 0, 1, 0, 1, 0], [0, 1, 1, 0, 1, 0, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1, 0, 1, 0, 1], [0, 0, 1, 1, 1, 1, 0, 0, 0, 1]]}
[4]: {'board': [[1, 0, 1, 1, 1], [0, 0, 0, 0, 0], [1, 0, 1, 1, 1], [1, 0, 0, 1, 0], [0, 0, 0, 1, 1], [1, 1, 1, 0, 0]]}
