In [21]:
import re
from datasets import load_dataset

# Hub repository the split is loaded from.
DATASET_NAME = "verifiers-for-code/humaneval_plan_generation"

# Column whose text is searched for a <plan>...</plan> section.
COL_TO_CLEAN = "phi3-planner-granular"

# Column that receives the prompt with the extracted plan inserted.
COL_CLEANED_NAME = f"cleaned-{COL_TO_CLEAN}"

dataset = load_dataset(DATASET_NAME, split="test")

def extract_plan(text):
    """Return the lines found between ``<plan>`` and ``</plan>`` in ``text``.

    The first ``<plan>`` tag marks the start of the plan. The plan runs up to
    the first ``</plan>`` that follows it, or to the end of ``text`` when no
    closing tag is present. Surrounding whitespace is stripped before the
    plan is split on newlines.

    Args:
        text: String that may contain a ``<plan>`` section.

    Returns:
        A list of plan lines, or an empty list (after printing a notice) when
        ``text`` has no ``<plan>`` tag.
    """
    _, opening_tag, after_open = text.partition('<plan>')
    if not opening_tag:
        print("No plan found in text")
        return []

    # partition() yields the whole remainder when '</plan>' is missing, which
    # gives the "everything after <plan>" fallback for free.
    body, _, _ = after_open.partition('</plan>')
    return body.strip().split('\n')

def insert_plan_into_docstring(prompt, plan):
    """Append ``plan`` to the end of the first docstring found in ``prompt``.

    Args:
        prompt: Source text expected to contain a triple-quoted docstring,
            delimited by either three double quotes or three single quotes.
        plan: List of plan lines (strings) to place just before the
            docstring's closing delimiter.

    Returns:
        ``prompt`` with a blank line and the indented plan lines inserted
        immediately before the closing delimiter. ``prompt`` is returned
        unchanged when it has no complete triple-quoted docstring or when
        ``plan`` is empty or contains only blank lines.

    Note:
        Only the first triple-quoted string in ``prompt`` is considered, so a
        prompt that defines several documented functions gets the plan in the
        first one.
    """
    # Nothing meaningful to insert: avoid appending stray blank lines.
    if not plan or not any(line.strip() for line in plan):
        return prompt

    # The docstring opens at whichever triple-quote style appears first.
    docstring_start, quote = -1, None
    for candidate in ('"""', "'''"):
        position = prompt.find(candidate)
        if position != -1 and (docstring_start == -1 or position < docstring_start):
            docstring_start, quote = position, candidate

    # If there's no docstring or it's malformed, return the original prompt
    if quote is None:
        return prompt
    docstring_end = prompt.find(quote, docstring_start + 3)
    if docstring_end == -1:
        return prompt

    # Indentation is the leading whitespace of the line holding the closing
    # delimiter. Taking only the whitespace matters when that line also holds
    # text (e.g. a one-line docstring): the text must not be copied in front
    # of every plan line.
    line_start = prompt.rfind('\n', 0, docstring_end) + 1
    closing_line_prefix = prompt[line_start:docstring_end]
    indent_width = len(closing_line_prefix) - len(closing_line_prefix.lstrip())
    indent = closing_line_prefix[:indent_width]

    # Prepare the plan text with proper indentation, without "Plan:" header
    plan_text = "\n\n" + "\n".join(indent + line for line in plan)

    return prompt[:docstring_end] + plan_text + prompt[docstring_end:]

# Per-row transform: extract the plan and splice it into the prompt's docstring
def process_item(item):
    """Add the ``COL_CLEANED_NAME`` column to a single dataset row.

    Reads the text stored under ``COL_TO_CLEAN``, extracts its plan lines with
    ``extract_plan``, and stores the result of inserting them into
    ``item['prompt']`` under ``COL_CLEANED_NAME``.

    Args:
        item: One dataset row as a mapping of column name to value.

    Returns:
        The same ``item`` object, now carrying the new column.
    """
    plan_lines = extract_plan(item[COL_TO_CLEAN])
    prompt_with_plan = insert_plan_into_docstring(item['prompt'], plan_lines)
    item[COL_CLEANED_NAME] = prompt_with_plan
    return item

# Run the per-row transform over every example in the split.
dataset = dataset.map(process_item)

# Spot-check one processed row to verify the plan landed inside the docstring.
sample_row = dataset[56]
print(sample_row[COL_CLEANED_NAME])

Map: 100%|██████████| 164/164 [00:00<00:00, 2682.07 examples/s]



def correct_bracketing(brackets: str):
    """ brackets is a string of "<" and ">".
    return True if every opening bracket has a corresponding closing bracket.

    >>> correct_bracketing("<")
    False
    >>> correct_bracketing("<>")
    True
    >>> correct_bracketing("<<><>>")
    True
    >>> correct_bracketing("><<>")
    False
    

    1. Initialize an empty stack to store opening brackets
       - This stack will keep track of the opening brackets encountered
    
    2. Iterate through each character in the input string:
       a. If the character is "<":
          - Push it onto the stack
       b. If the character is ">":
          - Check if the stack is empty
             - If empty, return False (no matching opening bracket)
          - Pop the top element from the stack (closing bracket)
    
    3. After iterating through the entire string:
       - Check if the stack is empty
          - If empty, return True (all opening brackets have matching closing brackets)
 




In [22]:
# Sanity check: list the columns to confirm the new cleaned column is present.
dataset.column_names

['task_id',
 'prompt',
 'canonical_solution',
 'test',
 'entry_point',
 'sonnet-3.5_gold_plans',
 'cleaned_sonnet-3.5_gold_plans',
 'generated_phi3_baseline',
 'generated_phi3_plan_generation',
 'phi3-planner-plans',
 'cleaned-phi3-planner-plans',
 'self_planning_Phi-3-mini-4k-instruct',
 'cleaned-self_planning_Phi-3-mini-4k-instruct',
 'phi3-planner-granular',
 'cleaned-phi3-planner-granular']

In [23]:
# Publish the augmented "test" split to DATASET_NAME, i.e. the same Hub
# repository the split was loaded from.
# NOTE(review): presumably requires Hub write credentials in the environment — confirm.
dataset.push_to_hub(DATASET_NAME, split="test")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 85.55ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/verifiers-for-code/humaneval_plan_generation/commit/357d582d94700c3ee6d3a7aff12cfb1851dd5142', commit_message='Upload dataset', commit_description='', oid='357d582d94700c3ee6d3a7aff12cfb1851dd5142', pr_url=None, pr_revision=None, pr_num=None)