In [2]:
import json
from datasets import load_dataset

In [3]:
def format_content(analysis, plan):
    """Render an analysis and a plan as tagged text.

    The analysis is placed between ``<thinking>`` tags and the plan between
    ``<plan>`` tags. Every plan step is written on its own line.

    Args:
        analysis: Value interpolated into the ``<thinking>`` section.
        plan: Either a list of steps (one line per element) or any other
            value, which is written as a single line.

    Returns:
        The formatted string, ending with ``</plan>`` (no trailing newline).
    """
    # Anything that is not a list is handled as a single-step plan.
    steps = plan if isinstance(plan, list) else [plan]

    parts = [f"<thinking>\n{analysis}\n</thinking>\n<plan>\n"]
    parts.extend(f"{step}\n" for step in steps)
    parts.append("</plan>")
    return "".join(parts)

In [4]:
# Load the "verifiers-for-code/fitlered-1k-from-546k" dataset; later cells read its 'train' split.
dataset = load_dataset("verifiers-for-code/fitlered-1k-from-546k")

In [5]:
# Read the batch output file. Despite the .jsonl extension it is parsed as a
# single JSON document whose records live under the top-level "jsons" key.
jsonl_data = []
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open("../data/processed_batch_output_1k_from_546k.jsonl", "r", encoding="utf-8") as f:
    try:
        data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON file: {str(e)}")
        data = None

if data is not None:
    # Validate the shape before using it: a non-object document or a non-list
    # "jsons" value would otherwise surface later as an unrelated
    # AttributeError/TypeError instead of a clear message here.
    records = data.get("jsons", []) if isinstance(data, dict) else None
    if isinstance(records, list):
        jsonl_data = records
    else:
        print("Error parsing JSON file: expected a JSON object with a list under the 'jsons' key")

# Warn when the number of parsed records differs from the size of the train split
lengths_match = len(jsonl_data) == len(dataset['train'])
if not lengths_match:
    print(f"Warning: JSONL file ({len(jsonl_data)} items) and dataset ({len(dataset['train'])} items) have different lengths")


In [6]:
# Build one formatted "<thinking>…</thinking><plan>…</plan>" string per record,
# substituting placeholder text when a record lacks 'analysis' or 'plan'.
gpt4_mini_plans = [
    format_content(
        record.get('analysis', "No analysis available"),
        record.get('plan', "No plan available"),
    )
    for record in jsonl_data
]

In [7]:
# `dataset` is a DatasetDict on a fresh run, but this cell rebinds it to the
# train split. Accept both so the cell can be re-run without restarting the kernel.
train_split = dataset['train'] if isinstance(dataset, dict) else dataset

# The result is pushed back to the same Hub repo the data was loaded from, so a
# previous run may already have stored this column. add_column rejects duplicate
# column names, so drop the stale copy first.
if 'gpt-4o-plans' in train_split.column_names:
    train_split = train_split.remove_columns('gpt-4o-plans')

min_length = min(len(train_split), len(gpt4_mini_plans))
if min_length == 0:
    # Nothing to align (e.g. the plans file failed to parse). Stop here rather
    # than produce an empty dataset that a later cell would upload.
    raise ValueError("No rows to merge: the dataset or the list of formatted plans is empty")

# Rows and plans are paired purely by position; when the lengths differ, both
# sides are truncated to the shorter one.
dataset = train_split.select(range(min_length)).add_column('gpt-4o-plans', gpt4_mini_plans[:min_length])

In [8]:
# Display the dataset summary (feature names and row count) to check the added column.
dataset

Dataset({
    features: ['input', 'solution', 'plan', 'gpt-4o-plans'],
    num_rows: 1000
})

In [9]:
# Spot-check a single row (index 100) of the augmented dataset.
dataset[100]

{'input': 'def find_greater_than(numbers, threshold):\n\n    """\n    Given a list of numbers and a threshold, find all the numbers in the list that are greater than the threshold.\n    \n    Args:\n    - numbers: A list of numbers (e.g., [1, 5, 10, 2])\n    - threshold: An integer representing the threshold value\n    \n    Returns:\n    - A list of numbers greater than the threshold\n    \n    Example:\n    >>> find_greater_than([1, 5, 10, 2], 3)\n    [5, 10]\n    """',
 'solution': '\n    \n    # Initialize an empty list to store the numbers greater than the threshold\n    greater_than_threshold = []\n    \n    # Iterate over each number in the list\n    for number in numbers:\n        \n        # Check if the number is greater than the threshold\n        if number > threshold:\n            \n            # If it is, add the number to the list\n            greater_than_threshold.append(number)\n    \n    # Return the list of numbers greater than the threshold\n    return greater_than

In [10]:
# Upload the augmented dataset to the Hub.
# NOTE(review): the target repo id is the same one passed to load_dataset earlier in
# this notebook, so this presumably replaces the source data in place — confirm that is intended.
dataset.push_to_hub("verifiers-for-code/fitlered-1k-from-546k")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 69.67ba/s]


Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.73it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/verifiers-for-code/fitlered-1k-from-546k/commit/f8fd73440e3900fb5600261ef16d9a574de967f4', commit_message='Upload dataset', commit_description='', oid='f8fd73440e3900fb5600261ef16d9a574de967f4', pr_url=None, pr_revision=None, pr_num=None)