In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load the 'train' split of the source dataset by its Hugging Face Hub repo id
dataset = load_dataset("verifiers-for-code/sampled_10k_from_27k", split='train')

# Load the Llama 3 Instruct tokenizer; in this notebook it is used only for its
# chat template (rendering to text with tokenize=False), not for tokenizing.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

# System prompt used as the "system" turn of every rendered conversation.
# It asks for an analysis in <thinking> tags followed by a code-free plan in <plan> tags.
# Note: the triple-quoted literal begins and ends with a newline character.
SYS = """
You are given the start of a function for a Python program. Your job is to produce a detailed plan. First, analyze and think about the function, then produce a plan. Do not generate any code. The function and docstring will be provided, so they do not need to be defined or initialized again within your plan.

Respond in the following format:

<thinking>
Your thought process and analysis of the function goes here. This should include considerations about the function's purpose, inputs, outputs, and any potential challenges or considerations.
</thinking>

<plan>
Your detailed plan for implementing the function goes here. This should outline the steps to implement the function without including actual code.
</plan>

Ensure your response follows this exact format, with the analysis enclosed in <thinking> tags and the plan enclosed in <plan> tags. The content within each tag should be a continuous paragraph without line breaks.
"""

def apply_template(example):
    """Render one dataset row as a single chat-formatted string.

    Builds a three-turn conversation (system prompt, user input, assistant
    plan) and passes it through the tokenizer's chat template without
    tokenizing.

    Args:
        example: A dataset row; its 'input' and 'gpt-4o-mini-plans' fields
            are read as the user and assistant messages respectively.

    Returns:
        A dict with a single key, 'text_llama', holding the rendered string.
        When used with `Dataset.map`, this becomes a new column.
    """
    conversation = [
        {"role": "system", "content": SYS},
        {"role": "user", "content": example['input']},
        {"role": "assistant", "content": example['gpt-4o-mini-plans']},
    ]
    # tokenize=False: return the formatted text rather than token ids.
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    return {"text_llama": rendered}

Downloading readme: 100%|██████████| 475/475 [00:00<00:00, 1.71MB/s]
Downloading data: 100%|██████████| 53.4M/53.4M [00:07<00:00, 7.09MB/s]
Generating train split: 100%|██████████| 10000/10000 [00:00<00:00, 22712.46 examples/s]


In [3]:
# Apply the chat template to every row; the returned dict adds a 'text_llama' column.
processed_dataset = dataset.map(apply_template)

# Disabled: drop every column except the rendered text.
# NOTE(review): the filter below compares against 'text', but apply_template writes
# 'text_llama', so as written it would remove the rendered column as well.
# Use `col != 'text_llama'` if this is ever re-enabled.
# processed_dataset = processed_dataset.remove_columns([col for col in processed_dataset.column_names if col != 'text'])


Map: 100%|██████████| 10000/10000 [00:01<00:00, 6036.29 examples/s]


In [4]:
# Spot-check the first processed row (displays all of its fields).
processed_dataset[0]

{'input': 'class Stack:\n    def __init__(self):\n        self.stack = []\n\n    def push(self, element):\n        self.stack.append(element)\n\n    def pop(self):\n        if self.is_empty():\n            raise IndexError("Stack is empty")\n        return self.stack.pop()\n\n    def is_empty(self):\n        return len(self.stack) == 0\n\n    def print_stack_in_reverse(self):\n        """\n        Prints the elements of the stack in reverse order.\n        \n        >>> stack = Stack()\n        >>> stack.push(1)\n        >>> stack.push(2)\n        >>> stack.push(3)\n        >>> stack.print_stack_in_reverse()\n        3\n        2\n        1\n        >>> stack = Stack()\n        >>> stack.print_stack_in_reverse()\n        Stack is empty\n        """',
 'code': 'Sure! Here\'s an example implementation of a function called `print_stack_in_reverse` that prints the elements of a stack in reverse order:\n\n```python\nclass Stack:\n    """\n    A class representing a stack data structure.\n  

In [5]:
# Upload the processed dataset to the Hub.
# NOTE: the target repo id is the same one the data was loaded from, so this
# commits the dataset (now including 'text_llama') back onto the source repo.
processed_dataset.push_to_hub("verifiers-for-code/sampled_10k_from_27k")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 19.12ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:07<00:00,  7.48s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/verifiers-for-code/sampled_10k_from_27k/commit/22f6cba15362bc567b4d6230dac5d2a6d421e470', commit_message='Upload dataset', commit_description='', oid='22f6cba15362bc567b4d6230dac5d2a6d421e470', pr_url=None, pr_revision=None, pr_num=None)