In [1]:
from datasets import load_dataset
import re

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Dataset repository id handed to load_dataset / push_to_hub.
DATASET_NAME = "verifiers-for-code/humaneval_plan_generation"
# Column whose raw text is searched for a <plan> section.
COL_TO_CLEAN = "phi3-planner-plans"
# Output column: the source column name prefixed with "cleaned-".
COL_CLEANED_NAME = f"cleaned-{COL_TO_CLEAN}"

In [13]:
# Load the "test" split of the dataset identified by DATASET_NAME.
dataset = load_dataset(DATASET_NAME, split="test")

In [14]:
def extract_plan(text):
    """Return the lines of the first ``<plan>`` section found in ``text``.

    The section starts right after the first ``<plan>`` tag and ends at the
    next ``</plan>`` tag. When no closing tag follows, everything after the
    opening tag is used. Leading and trailing whitespace is stripped from the
    section before it is split on newlines.

    Args:
        text: String to search.

    Returns:
        A list of line strings, or an empty list (after printing a notice)
        when ``text`` contains no ``<plan>`` tag.
    """
    opening = re.search(r'<plan>', text)
    if opening is None:
        print("No plan found in text")
        return []

    # Only the text that follows the opening tag is searched for the closing tag.
    remainder = text[opening.end():]
    closing = re.search(r'</plan>', remainder)
    body = remainder if closing is None else remainder[:closing.start()]
    return body.strip().split('\n')

# Apply the extraction to the dataset
# Each row gains a COL_CLEANED_NAME column holding the list of plan lines
# that extract_plan returns for the row's COL_TO_CLEAN text.
dataset = dataset.map(lambda x: {COL_CLEANED_NAME: extract_plan(x[COL_TO_CLEAN])})

# Print a sample to verify
print(dataset[0][COL_CLEANED_NAME])

Map: 100%|██████████| 164/164 [00:00<00:00, 3056.71 examples/s]

['Action Plan:', '1. Understand the problem:', '    - The function takes a list of numbers and a threshold as input', '    - The goal is to check if any two numbers in the list are closer to each other than the given threshold', '', '2. Initialize a variable to store the result (default to False)', '', '3. Iterate through the list of numbers:', '    - For each number, iterate through the rest of the list (excluding the current number)', '    - Calculate the absolute difference between the current number and the other number', '    - Check if the difference is less than or equal to the threshold', '    - If true, set the result to True and break the loop', '', '4. Return the result:', '    - If the result is True, it means there are two numbers closer to each other than the threshold', '    - If the result is False, it means no two numbers are closer to each other than the threshold', '', 'Note: Use appropriate data structures and methods for iteration and comparison.', 'Be careful with




In [15]:
# Show the "prompt" field of the first row.
print(dataset[0]["prompt"])

from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



In [16]:
import re
from datasets import load_dataset

# Dataset repository id handed to load_dataset / push_to_hub.
DATASET_NAME = "verifiers-for-code/humaneval_plan_generation"
# Column whose raw text is searched for a <plan> section.
COL_TO_CLEAN = "self_planning_Phi-3-mini-4k-instruct"
# Output column: the source column name prefixed with "cleaned-".
COL_CLEANED_NAME = f"cleaned-{COL_TO_CLEAN}"

# Load the "test" split of the dataset identified by DATASET_NAME.
# NOTE(review): this rebinds `dataset`, replacing the in-memory object that an
# earlier cell extended with a cleaned column; that column is kept only if the
# loaded copy already contains it — confirm before pushing.
dataset = load_dataset(DATASET_NAME, split="test")

def extract_plan(text):
    """Split out the first ``<plan>`` section of ``text`` as a list of lines.

    Text before the first ``<plan>`` tag is discarded, as is everything from
    the next ``</plan>`` tag onwards. A missing closing tag means the section
    extends to the end of ``text``. The section is stripped of surrounding
    whitespace and then split on newlines.

    Args:
        text: String to search.

    Returns:
        A list of line strings, or an empty list (after printing a notice)
        when ``text`` contains no ``<plan>`` tag.
    """
    # NOTE(review): an identically named function is defined in an earlier
    # cell; running this cell rebinds the name to this definition.
    pieces = re.split(r'<plan>', text, maxsplit=1)
    if len(pieces) < 2:
        # No opening tag: re.split returned the input as a single piece.
        print("No plan found in text")
        return []

    # Keep only what precedes the first closing tag (or all of it if absent).
    body = re.split(r'</plan>', pieces[1], maxsplit=1)[0]
    return body.strip().split('\n')

def insert_plan_into_docstring(prompt, plan):
    """Append the plan lines to the end of the first docstring in ``prompt``.

    The plan is placed after the existing docstring text, separated from it by
    a blank line. Every non-blank plan line is indented with the leading
    whitespace of the line that holds the closing delimiter; blank plan lines
    are emitted empty. The closing delimiter always ends up on its own line
    after the plan.

    Args:
        prompt: Source text expected to contain a triple-quoted docstring
            (triple double quotes or triple single quotes).
        plan: Iterable of plan lines (strings without trailing newlines).

    Returns:
        The prompt with the plan inserted. The prompt is returned unchanged
        when the plan has no non-blank line or when no complete triple-quoted
        string is found.
    """
    plan_lines = list(plan)
    # An empty plan (e.g. when plan extraction found nothing) must not add
    # stray blank lines to the docstring.
    if not any(line.strip() for line in plan_lines):
        return prompt

    # Use whichever triple-quote style opens first, so docstrings written
    # with ''' are handled as well as those written with """.
    docstring_start, delimiter = -1, None
    for candidate in ('"""', "'''"):
        position = prompt.find(candidate)
        if position != -1 and (delimiter is None or position < docstring_start):
            docstring_start, delimiter = position, candidate

    # If there's no docstring or it's malformed, return the original prompt
    if delimiter is None:
        return prompt
    docstring_end = prompt.find(delimiter, docstring_start + 3)
    if docstring_end == -1:
        return prompt

    # Indentation is only the leading whitespace of the closing line; the
    # line may also carry docstring text (e.g. a one-line docstring).
    line_start = prompt.rfind('\n', 0, docstring_end) + 1
    before_closing = prompt[line_start:docstring_end]
    indent = before_closing[:len(before_closing) - len(before_closing.lstrip())]

    plan_block = "\n".join(
        indent + line if line.strip() else "" for line in plan_lines
    )

    if before_closing.strip():
        # Closing delimiter shares its line with docstring text: break the
        # line, add the plan, and move the delimiter to a line of its own.
        return (
            prompt[:docstring_end]
            + "\n\n" + plan_block + "\n"
            + indent + prompt[docstring_end:]
        )

    # Closing delimiter already sits on its own line: insert the plan just
    # above that line so the delimiter keeps its original indentation.
    return prompt[:line_start] + "\n" + plan_block + "\n" + prompt[line_start:]

# Apply the extraction and insertion to the dataset
def process_item(item):
    """Build the cleaned column value for one dataset row.

    The plan lines are extracted from the row's COL_TO_CLEAN text and inserted
    into the docstring of the row's ``prompt``.

    Args:
        item: Row mapping that provides the COL_TO_CLEAN and ``prompt`` fields.

    Returns:
        A one-entry dict mapping COL_CLEANED_NAME to the augmented prompt.
    """
    plan_lines = extract_plan(item[COL_TO_CLEAN])
    return {COL_CLEANED_NAME: insert_plan_into_docstring(item['prompt'], plan_lines)}

# Run process_item on every row; its returned dict adds the COL_CLEANED_NAME column.
dataset = dataset.map(process_item)

# Print a sample to verify
print(dataset[56][COL_CLEANED_NAME])

Map: 100%|██████████| 164/164 [00:00<00:00, 3060.88 examples/s]



def correct_bracketing(brackets: str):
    """ brackets is a string of "<" and ">".
    return True if every opening bracket has a corresponding closing bracket.

    >>> correct_bracketing("<")
    False
    >>> correct_bracketing("<>")
    True
    >>> correct_bracketing("<<><>>")
    True
    >>> correct_bracketing("><<>")
    False
    

    Action Plan:
    1. Initialize an empty stack or list to keep track of opening brackets.
    2. Iterate through each character in the input string:
        a. If a character is an opening bracket ("<"), add it to the stack.
        b. If a character is a closing bracket (">"), check if it corresponds to the last opening bracket.
        c. If the stack is empty or the corresponding opening bracket doesn't match, return False.
    3. After the iteration, if the stack is empty, all brackets were properly matched and return True.
    4. If the stack is not empty, there are unpaired opening brackets and return False.
    5. Edge cases to consider




In [17]:
# List the columns to check that the cleaned column is present.
dataset.column_names

['task_id',
 'prompt',
 'canonical_solution',
 'test',
 'entry_point',
 'sonnet-3.5_gold_plans',
 'cleaned_sonnet-3.5_gold_plans',
 'generated_phi3_baseline',
 'generated_phi3_plan_generation',
 'phi3-planner-plans',
 'cleaned-phi3-planner-plans',
 'self_planning_Phi-3-mini-4k-instruct',
 'cleaned-self_planning_Phi-3-mini-4k-instruct']

In [18]:
# Upload the processed dataset to the DATASET_NAME repo as its "test" split.
# NOTE(review): presumably this replaces the split the data was loaded from —
# confirm that overwriting it is intended.
dataset.push_to_hub(DATASET_NAME, split="test")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 68.64ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  5.70it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/verifiers-for-code/humaneval_plan_generation/commit/8848b9fff5048448edbe159674e4f521db30b1da', commit_message='Upload dataset', commit_description='', oid='8848b9fff5048448edbe159674e4f521db30b1da', pr_url=None, pr_revision=None, pr_num=None)