In [33]:
import datasets
import re
import json
from tqdm import tqdm
import os
import requests
import random
from random import Random

In [2]:
# Load the "hkust-nlp/CodeIO-PyEdu-Reasoning" dataset via `datasets.load_dataset`
# and keep only its "train" split; later cells read each item's "prompt" field.
dataset = datasets.load_dataset("hkust-nlp/CodeIO-PyEdu-Reasoning")['train']

## Extract the relevant parts of the prompt

In [6]:
# Regex that splits one prompt into its three reusable parts:
#   group 1 - task description
#   group 2 - input/output specification
#   group 3 - reference code snippet (everything after the "Tip: ..." sentence)
# The text between "Given the following" and the "Tip:" sentence is matched but
# not captured.
pattern = re.compile(
    r'(?s)'  # DOTALL so . matches newlines
    r'You are given a question that requires some input and output variables as follows:\s*(.*?)'
    r'\s*The input and output requirements are as follows:\s*(.*?)'
    r'\s*Given the following.*?Tip: Here is a reference code snippet for this question\. '
    r'You can refer to this code to guide your reasoning but not copy spans of code directly\.\s*(.*)'
)

# Imported locally under an alias on purpose: the sampler cell further down runs
# `import tqdm`, which rebinds the global name `tqdm` from the progress-bar
# callable to the module. With the alias this cell can be re-run at any time.
from tqdm import tqdm as progress_bar

output_path = "data/codeio-pyedu-extracted.jsonl"
# Create the output directory so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Keys of entries already written; used to drop exact duplicates.
seen = set()
duplicate = 0

# Write-only access is enough here; the encoding is pinned so the file does not
# depend on the platform default.
with open(output_path, "w", encoding="utf-8") as f:
    for i, item in progress_bar(enumerate(dataset), total=len(dataset)):
        match = pattern.search(item["prompt"])
        if match:
            # Extract relevant info
            task_description = match.group(1).strip()
            input_output_spec = match.group(2).strip()
            code_sample = match.group(3).strip()

            # Check if the (description, spec, code) triple is unique. Only the
            # hashes are stored to keep memory small; `hash()` of a str is stable
            # within a single interpreter run, which is all that is needed here.
            hash_entry = (hash(task_description), hash(input_output_spec), hash(code_sample))
            if hash_entry in seen:
                duplicate += 1
                continue
            seen.add(hash_entry)

            # Save to disk as one JSON object per line
            json.dump({
                "task_description": task_description,
                "input_output_spec": input_output_spec,
                "code_sample": code_sample
            }, f)
            f.write("\n")
        else:
            print(f"No match found for item {i}")

print(f"There were {duplicate} out of {len(dataset)} duplicate entries")

100%|██████████| 1630607/1630607 [01:20<00:00, 20302.13it/s]

There were 1489543 out of 1630607 duplicate entries





## Subsample the data

In [None]:
import abc
from typing import Union

import numpy as np
import torch
import tqdm


class IdentitySampler:
    """Sampler that performs no subsampling at all."""

    def run(
        self, features: Union[torch.Tensor, np.ndarray]
    ) -> Union[torch.Tensor, np.ndarray]:
        """Return ``features`` unchanged (the very same object that was passed in)."""
        return features


class BaseSampler(abc.ABC):
    """Abstract base for samplers that keep a fixed fraction of the input rows.

    Subclasses implement ``run``. The two helper methods remember whether the
    caller passed a numpy array or a torch tensor (and on which device) so the
    result can be handed back in the same form.
    """

    def __init__(self, percentage: float):
        """Store the sampling fraction.

        Args:
            percentage: fraction of rows to keep, strictly between 0 and 1.

        Raises:
            ValueError: if ``percentage`` is not inside the open interval (0, 1).
        """
        inside_open_unit_interval = 0 < percentage < 1
        if not inside_open_unit_interval:
            raise ValueError("Percentage value not in (0, 1).")
        self.percentage = percentage

    @abc.abstractmethod
    def run(
        self, features: Union[torch.Tensor, np.ndarray]
    ) -> Union[torch.Tensor, np.ndarray]:
        """Subsample ``features``; must be provided by every concrete sampler."""

    def _store_type(self, features: Union[torch.Tensor, np.ndarray]) -> None:
        """Record whether ``features`` is a numpy array and, if not, its device."""
        is_numpy = isinstance(features, np.ndarray)
        self.features_is_numpy = is_numpy
        if is_numpy:
            # numpy arrays carry no device, so there is nothing more to remember
            return
        self.features_device = features.device

    def _restore_type(self, features: torch.Tensor) -> Union[torch.Tensor, np.ndarray]:
        """Convert a tensor back to the form recorded by ``_store_type``."""
        if not self.features_is_numpy:
            return features.to(self.features_device)
        return features.cpu().numpy()


class GreedyCoresetSampler(BaseSampler):
    def __init__(
        self,
        percentage: float,
        device: torch.device,
        dimension_to_project_features_to=128,
    ):
        """Greedy Coreset sampling base class.

        Args:
            percentage: fraction of rows to keep; validated by ``BaseSampler``.
            device: device the random projection (and the projected features)
                are moved to.
            dimension_to_project_features_to: width of the random linear
                projection that is used only for choosing indices.
        """
        super().__init__(percentage)

        self.device = device
        self.dimension_to_project_features_to = dimension_to_project_features_to

    def _reduce_features(self, features):
        """Project ``features`` ([N x D]) to the configured width.

        Returns the input untouched when D already equals the target width.
        Otherwise a freshly initialised, bias-free ``torch.nn.Linear`` is applied;
        its weights are random, so the projection differs between calls unless
        the torch RNG is seeded.
        """
        if features.shape[1] == self.dimension_to_project_features_to:
            return features
        mapper = torch.nn.Linear(
            features.shape[1], self.dimension_to_project_features_to, bias=False
        )
        _ = mapper.to(self.device)
        # Match the mapper's dtype as well as its device: numpy arrays are float64
        # by default while the Linear weights are float32, and a mixed-dtype
        # matmul raises a RuntimeError.
        features = features.to(device=self.device, dtype=mapper.weight.dtype)
        # The projection is never trained, so do not record an autograd graph
        # (it would otherwise be carried through the whole distance matrix).
        with torch.no_grad():
            return mapper(features)

    def run(
        self, features: Union[torch.Tensor, np.ndarray]
    ) -> Union[torch.Tensor, np.ndarray]:
        """Subsamples features using Greedy Coreset.

        Args:
            features: [N x D]

        Returns:
            The selected rows of the *original* (unprojected) features, as a
            numpy array if a numpy array was passed in, otherwise as a tensor on
            the input's device.
        """
        # BaseSampler rejects percentage == 1 at construction time, so this guard
        # only triggers if the attribute was changed afterwards.
        if self.percentage == 1:
            return features
        self._store_type(features)
        if isinstance(features, np.ndarray):
            features = torch.from_numpy(features)
        reduced_features = self._reduce_features(features)
        sample_indices = self._compute_greedy_coreset_indices(reduced_features)
        features = features[sample_indices]
        return self._restore_type(features)

    @staticmethod
    def _compute_batchwise_differences(
        matrix_a: torch.Tensor, matrix_b: torch.Tensor
    ) -> torch.Tensor:
        """Computes batchwise Euclidean distances using PyTorch.

        Uses ||a||^2 - 2 a.b + ||b||^2 for every pair of rows and returns a
        [N_a x N_b] matrix. The squared distances are clamped at zero before the
        square root so that negative round-off does not produce NaNs.
        """
        a_times_a = matrix_a.unsqueeze(1).bmm(matrix_a.unsqueeze(2)).reshape(-1, 1)
        b_times_b = matrix_b.unsqueeze(1).bmm(matrix_b.unsqueeze(2)).reshape(1, -1)
        a_times_b = matrix_a.mm(matrix_b.T)

        return (-2 * a_times_b + a_times_a + b_times_b).clamp(0, None).sqrt()

    def _compute_greedy_coreset_indices(self, features: torch.Tensor) -> np.ndarray:
        """Runs iterative greedy coreset selection.

        Builds the full [N x N] distance matrix, then repeatedly picks the row
        with the largest current anchor distance and lowers every row's anchor
        distance to its distance to the newly picked row where that is smaller.

        Args:
            features: [NxD] input feature bank to sample.

        Returns:
            Array with ``int(N * percentage)`` selected row indices, in pick order.
        """
        distance_matrix = self._compute_batchwise_differences(features, features)
        # Starting score per row: the norm of that row of the distance matrix.
        coreset_anchor_distances = torch.norm(distance_matrix, dim=1)

        coreset_indices = []
        num_coreset_samples = int(len(features) * self.percentage)

        for _ in range(num_coreset_samples):
            select_idx = torch.argmax(coreset_anchor_distances).item()
            coreset_indices.append(select_idx)

            coreset_select_distance = distance_matrix[
                :, select_idx : select_idx + 1  # noqa E203
            ]
            coreset_anchor_distances = torch.cat(
                [coreset_anchor_distances.unsqueeze(-1), coreset_select_distance], dim=1
            )
            coreset_anchor_distances = torch.min(coreset_anchor_distances, dim=1).values

        return np.array(coreset_indices)


class ApproximateGreedyCoresetSampler(GreedyCoresetSampler):
    def __init__(
        self,
        percentage: float,
        device: torch.device,
        number_of_starting_points: int = 10,
        dimension_to_project_features_to: int = 128,
    ):
        """Approximate Greedy Coreset sampling base class.

        Args:
            percentage: fraction of rows to keep.
            device: forwarded to ``GreedyCoresetSampler``.
            number_of_starting_points: how many randomly drawn rows seed the
                initial distance estimate (capped at the number of rows).
            dimension_to_project_features_to: forwarded to ``GreedyCoresetSampler``.
        """
        self.number_of_starting_points = number_of_starting_points
        super().__init__(percentage, device, dimension_to_project_features_to)

    def _compute_greedy_coreset_indices(self, features: torch.Tensor) -> np.ndarray:
        """Runs approximate iterative greedy coreset selection.

        Instead of the full N x N distance matrix, only the distances to a few
        random starting rows are computed up front, and one extra N x 1 column
        is computed per selected row. This needs far less memory, at the cost
        of longer sampling times.

        Args:
            features: [NxD] input feature bank to sample.

        Returns:
            Array with ``int(N * percentage)`` selected row indices, in pick order.
        """
        total_rows = len(features)

        # Draw the seed rows; never ask for more seeds than there are rows.
        seed_count = np.clip(self.number_of_starting_points, None, total_rows)
        seed_rows = np.random.choice(total_rows, seed_count, replace=False).tolist()

        # Initial score per row: mean distance to the seed rows, kept as [N x 1].
        seed_distances = self._compute_batchwise_differences(
            features, features[seed_rows]
        )
        anchor_distances = torch.mean(seed_distances, axis=-1).reshape(-1, 1)

        sample_budget = int(total_rows * self.percentage)
        picked_rows = []

        with torch.no_grad():
            for _ in tqdm.tqdm(range(sample_budget), desc="Subsampling..."):
                farthest_row = torch.argmax(anchor_distances).item()
                picked_rows.append(farthest_row)

                distance_to_pick = self._compute_batchwise_differences(
                    features, features[farthest_row : farthest_row + 1]  # noqa: E203
                )
                # Keep, per row, the smaller of the old score and the distance
                # to the row that was just picked.
                stacked = torch.cat([anchor_distances, distance_to_pick], dim=-1)
                anchor_distances = torch.min(stacked, dim=1).values.reshape(-1, 1)

        return np.array(picked_rows)


class RandomSampler(BaseSampler):
    """Sampler that keeps a uniformly random subset of the rows."""

    def __init__(self, percentage: float):
        """Store the fraction of rows to keep (validated by ``BaseSampler``)."""
        super().__init__(percentage)

    def run(
        self, features: Union[torch.Tensor, np.ndarray]
    ) -> Union[torch.Tensor, np.ndarray]:
        """Randomly samples input feature collection.

        Draws ``int(N * percentage)`` distinct row indices with numpy's global
        random state and returns those rows, in the order they were drawn.

        Args:
            features: [N x D]
        """
        total_rows = len(features)
        keep_count = int(total_rows * self.percentage)
        chosen_rows = np.array(
            np.random.choice(total_rows, keep_count, replace=False)
        )
        return features[chosen_rows]

## Create input generators for each problem separately

In [None]:
# System message for the chat request. It asks the model to answer with a
# `generate_input(rng: Random) -> dict` function wrapped in <function>...</function>
# tags; the request code below extracts the function with a regex on those tags.
SYSTEM_PROMPT = """You are a helpful assistant that generates valid Python functions that act as input generators for a given code snippet.

You have access to `random.Random`, therefore you SHOULD NOT import it again. You should use this random number generator to make the input generation process stochastic on each call.

When the user asks you to generate an input for a code snippet, you should strictly respond in the following format:
<function>
def generate_input(rng: Random) -> dict:
    # Your code here
    pass
</function>

The output of the function should be a dictionary where the keys are the variable names and the values are the generated values.

It must contain all the variables that listed in the user's input specification, or more precisely in the `main_solution` function signature. 
"""

# User message template. It is filled with `str.format(**entry)`, so each entry
# must provide the keys `task_description`, `input_output_spec` and `code_sample`
# (the keys written to the extracted JSONL file).
USER_PROMPT = """Following are a task description, input/output specification, and relevant code snippet for a Python programming task.

<task_description>
{task_description}
</task_description>

<input_output_spec>
{input_output_spec}
</input_output_spec>

<code_sample>
{code_sample}
</code_sample>

Your task is to write a Python function `generate_input(rng: Random) -> dict` that generates valid inputs for the given code snippet, based on the provided information.
"""

# Read the key once and fail fast: without this check a missing variable would
# be sent as the literal header "Bearer None" and only surface later as an
# unrelated error while parsing the response.
api_key = os.getenv("OPENROUTER_API_KEY")
if not api_key:
    raise RuntimeError("Environment variable OPENROUTER_API_KEY is not set.")

# Ask the model for an input generator for the first extracted entry
# (`range(1)`; raise the bound to process more lines).
with open("data/codeio-pyedu-extracted.jsonl", "r", encoding="utf-8") as f:
    for i in range(1):
        line = f.readline()
        if not line:
            # readline() returns "" at end of file: fewer entries than iterations.
            break
        entry = json.loads(line)
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            data = json.dumps({
                "model": "deepseek/deepseek-chat",
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": USER_PROMPT.format(**entry)}
                ]
            }),
            # Without a timeout a stalled connection would block the cell forever.
            timeout=120,
        )
        # Surface HTTP errors (bad key, rate limit, ...) instead of failing on a
        # missing "choices" key further down.
        response.raise_for_status()
        full_response = response.json()["choices"][0]["message"]["content"]

        # The system prompt asks for the function inside <function>...</function>.
        function_match = re.search(r"<function>(.*?)</function>", full_response, re.DOTALL)
        if function_match is None:
            raise ValueError(
                f"Model response for entry {i} contains no <function>...</function> block: "
                f"{full_response[:200]!r}"
            )
        input_generator = function_match.group(1).strip()

        # NOTE(security): the disabled snippet below exec()s code written by the
        # model, i.e. untrusted code. Only enable it in a sandboxed environment.
        # local_dict = {}
        # exec(input_generator, globals(), local_dict)
        # generate_input_func = local_dict['generate_input']
        # rng = random.Random()

        # for i in range(5):
        #     random_input = generate_input_func(rng)
        #     print(f"[{i}]: {random_input}")

[0]: {'board': [[1, 0], [1, 1], [1, 0], [1, 0], [0, 0], [1, 1], [0, 0], [1, 0], [1, 1]]}
[1]: {'board': [[1, 1, 1, 0], [0, 1, 0, 1], [0, 1, 1, 0], [0, 0, 0, 0]]}
[2]: {'board': [[0]]}
[3]: {'board': [[0, 0, 1, 1, 1, 1, 1, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 1, 0, 1, 0, 1, 0], [0, 1, 1, 0, 1, 0, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1, 0, 1, 0, 1], [0, 0, 1, 1, 1, 1, 0, 0, 0, 1]]}
[4]: {'board': [[1, 0, 1, 1, 1], [0, 0, 0, 0, 0], [1, 0, 1, 1, 1], [1, 0, 0, 1, 0], [0, 0, 0, 1, 1], [1, 1, 1, 0, 0]]}
