In [1]:
from datasets import load_dataset, IterableDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
from brx import BRX, sftoq, uif
from dotenv import load_dotenv
import os
from openai import OpenAI
from pydantic import BaseModel, Field, TypeAdapter
from typing import List

# Client setup. Both credentials are read straight from the process environment;
# a missing variable raises KeyError here, at cell execution time.
# The .env loader below is disabled, so the variables must already be exported.
# load_dotenv("../../.env")
brx_client = BRX(os.environ["BRX_ACCESS_TOKEN"])
# Alternative (disabled): route the OpenAI-compatible client to Together's endpoint.
# openai_client = OpenAI(api_key=os.environ["TOGETHER_API_KEY"], base_url="https://api.together.xyz")
openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# One step of a generated edit plan: which file to touch, where in it, what to
# change, and which numbered search-replace blocks the step corresponds to.
# The field names mirror the keys the system prompt in call_openai asks the
# model to emit for every step.
class PlanStep(BaseModel):
    filepath: str = Field(description="States the file where the changes should be made.")
    location: str = Field(description="Describes where in the file the code changes should be made. Very short code snippets can be used as reference.")
    code_change: str = Field(description="Write the new code that should be inserted and describe how it should be inserted in the code.")
    # Block numbers as labelled in the prompt ("[Search Replace Block N]"), i.e. 1-based.
    corresponding_sr_blocks: List[int] = Field(description="Which search-replace edit blocks does this instruction apply to?")

# Wrapper model holding the whole plan under a single "steps" key, matching the
# top-level JSON object shape requested in the call_openai system prompt.
# NOTE(review): within the visible cells this model is only referenced from a
# commented-out response_format line; the live code parses the reply with json.loads.
class PlanStepList(BaseModel):
    steps: List[PlanStep] = Field(description="A list of PlanSteps.")

def call_openai(file_contents, search_replaces, commit_message):
    """Ask the chat model to reconstruct the plan behind a set of edits.

    Sends a fixed system prompt plus a user message assembled from the three
    arguments, requests a JSON-object reply, parses it, prints it and returns it.

    Args:
        file_contents: Text describing the original file, embedded verbatim
            in the user message.
        search_replaces: Text listing the search-replace blocks, embedded
            verbatim in the user message.
        commit_message: The original commit message, embedded verbatim.

    Returns:
        Whatever ``json.loads`` produces from the first choice's message
        content. The prompt asks for an object with a ``"steps"`` list, but
        the reply is not validated here.

    Raises:
        Anything raised by the OpenAI client call or by ``json.loads``.
    """
    # NOTE(review): this is a regular (non-raw) string, so the "\n" escapes in
    # the example below turn into real line breaks inside the JSON string
    # values that the model is shown.
    system_prompt = """Given a file, a set of search-replace changes made by the developer, and the original commit message, write a plan that would have been followed by the developer to create the patch. Each step of the plan should be independently implementable. Clearly state variable, function, class, and type definitions wherever relevant.
    Format your response using JSON. Output a list of PlanStep objects under the attribute "steps".

    Each PlanStep object has the following properties:

    filepath: A string that specifies the path to the file where changes need to be made.
    location: A string that describes where in the file the changes should be made. In addition to a regular English description, you can add a very short code snippet used as a reference.
    code_change: A string that contains the new code to be inserted and a description of how it should be inserted into the code.
    corresponding_sr_blocks: A list of integers, indicating which search-replace edit blocks this instruction applies to.
    All these properties are required for a PlanStep object to be valid.

    Here's an example list of PlanStep objects, represented in JSON format:
    {
        "steps": [
            {
                "filepath": "/path/to/file1.py",
                "location": "def function_name:",
                "code_change": "Add a print statement to debug.\nprint('Debugging...')",
                "corresponding_sr_blocks": [1, 2]
            },
            {
                "filepath": "/path/to/file2.py",
                "location": "class MyClass:",
                "code_change": "Add a new method to log errors.\n\ndef log_error(self, error):\n    print(f'Error: {error}')",
                "corresponding_sr_blocks": [3]
            },
            {
                "filepath": "/path/to/file3.py",
                "location": "if condition:",
                "code_change": "Replace the condition with a more specific check.\nif specific_condition:",
                "corresponding_sr_blocks": [4, 5]
            }
        ]
    }

    It must be a list.
    """
    user_prompt = f"# File contents: \n {file_contents} \n \n ----- \n \n# Changes to the files: \n{search_replaces}\n\n -----\n# Original commit message:\n{commit_message}"

    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # JSON mode only: no schema is passed, so the shape of the reply rests
    # entirely on the instructions in the system prompt.
    completion = openai_client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=chat_messages,
        response_format={"type": "json_object"},
    )

    reply_text = completion.choices[0].message.content
    plan = json.loads(reply_text)
    print("Generated breakdown: ", plan)
    return plan

def call_brx(file_contents, original_commit_message, applied_patch):
    """Run the "Synthetic Dataset Generator for Patches" BRX with the given inputs.

    Serialises the hard-coded BRX schema, lets ``sftoq`` turn it into a query
    plus a list of input fields, fills every input field whose ``"name"``
    matches one of the three arguments, rebuilds the query with ``uif`` and
    executes it through ``brx_client``.

    Args:
        file_contents: Value for the ``file_contents`` input field.
        original_commit_message: Value for the ``original_commit_message`` field.
        applied_patch: Value for the ``applied_patch`` field.

    Returns:
        The JSON-decoded ``["brxRes"]["output"]`` entry of the execution result.
    """
    brx_id = "bc7c33e6-8f01-4a34-8846-e084eaae4e84"
    brx_name = "Synthetic Dataset Generator for Patches"

    # Every input is declared as a string field carrying a placeholder value.
    schema_field_entries = [
        [field_name, {"fieldValueDataType": "string", "fieldValue": "testval"}]
        for field_name in ("file_contents", "original_commit_message", "applied_patch")
    ]
    entry_schema = {
        "schemaFields": {"_isMap": True, "data": schema_field_entries},
        "brxName": brx_name,
        "brxId": brx_id,
    }
    schema = json.dumps({
        "description": "Given an original file, a starting commit message, and a patch applied, write a one-sentence description of changes made and a numbered step-by-step plan for generating those changes.",
        "brxName": brx_name,
        "brxId": brx_id,
        "dependantBrxIds": {},
        "processType": 8,
        "schemas": {
            "mainBrxId": brx_id,
            "schemas": {
                "_isMap": True,
                "data": [["main_brx_entry_schema", entry_schema]],
            },
        },
    })

    query_rebuild = sftoq(schema)
    output_object = query_rebuild["brx_query"]
    input_fields = query_rebuild["input_fields"]

    input_dict = {
        "file_contents": file_contents,
        "original_commit_message": original_commit_message,
        "applied_patch": applied_patch,
    }

    # Copy each supplied value onto the input field that carries the same name.
    for position, field in enumerate(input_fields):
        for key, value in input_dict.items():
            if field["name"] == key:
                input_fields[position]["value"] = value

    updated_query = uif(input_fields, output_object)
    execution = brx_client.execute(updated_query["brx_query"])
    raw_output = execution["brxRes"]["output"]
    return json.loads(raw_output)

In [3]:
def format_search_replace(block):
    """Render one search-replace block as a fenced SEARCH/REPLACE snippet.

    Args:
        block: Object exposing ``filepath``, ``search_block`` and
            ``replace_block`` attributes.

    Returns:
        A string wrapped in triple backticks: the file path on the first
        line, the search text between the ``<<<<<<< SEARCH`` and ``=======``
        markers, and the replacement text before ``>>>>>>> REPLACE``.
    """
    template = (
        "```\n"
        "{path}\n"
        "<<<<<<< SEARCH\n"
        "{search}\n"
        "=======\n"
        "{replace}\n"
        ">>>>>>> REPLACE\n"
        "```"
    )
    return template.format(
        path=block.filepath,
        search=block.search_block,
        replace=block.replace_block,
    )

In [4]:
# Smoke test: one real request with placeholder inputs to confirm the client,
# credentials and JSON parsing work before the full generation run below.
# NOTE(review): this issues a live API call every time the cell is executed.
test_openai = call_openai("Test", "test", "test")

Generated breakdown:  {'steps': [{'filepath': '/path/to/the/file', 'location': 'At the beginning of the file.', 'code_change': "Replace the word 'Test' with 'test'.\ntest", 'corresponding_sr_blocks': [1]}]}


In [16]:
from datasets import Dataset
import difflib
from superdocs_python.utils.diff_utils import parse_diff
import json

# Language names passed one by one to load_language, which forwards each as the
# configuration name when loading "bigcode/commitpackft".
languages = ["python", "jsx", "kotlin", "vue", "xml", "javascript", "java", "c#", "ruby", "c", "c++", "typescript", "dart"]

def load_language(language_name, selected_length=15):
    """Generate plan-step rows for a sample of commits in one language.

    Loads the ``train`` split of ``bigcode/commitpackft`` for
    ``language_name``, shuffles it with a fixed seed and keeps at most
    ``selected_length`` commits. For every commit the old/new contents are
    diffed, the diff is parsed into search-replace blocks, and ``call_openai``
    is asked for the plan behind those blocks. Each returned plan step becomes
    one output row.

    Args:
        language_name: Configuration name of the commitpackft subset to load.
        selected_length: Maximum number of commits to sample. If the split is
            smaller than this, the whole split is used.

    Returns:
        A ``datasets.Dataset`` built from the generated rows. Every row has
        the keys ``filepath``, ``file``, ``location``, ``code_change`` and
        ``changes`` (the formatted search-replace blocks the step refers to).

    Notes:
        Commits whose processing raises are reported with a printed message
        and skipped, so one bad sample does not abort the run. A plan step
        that references a block number outside ``1..len(blocks)`` is also
        reported and skipped, without discarding the other steps of the
        same commit.
    """
    print(f"Loading language: {language_name}")
    dataset = load_dataset("bigcode/commitpackft", language_name, split="train")
    # select() rejects out-of-range indices, so never request more rows than exist.
    sample_size = min(selected_length, len(dataset))
    dataset = dataset.shuffle(seed=42).select(range(sample_size))
    dataset = dataset.to_iterable_dataset()

    def generate_rows():
        for row in dataset:
            try:
                udiff = "\n".join(difflib.unified_diff(
                    row['old_contents'].splitlines(),
                    row['new_contents'].splitlines(),
                    fromfile=row['old_file'],
                    tofile=row['new_file'],
                    n=3,
                ))
                search_replace_blocks = parse_diff(udiff)

                # Blocks are labelled with their 1-based position in the parsed
                # list. Blocks that are empty on both sides are left out of the
                # prompt but keep their number, so labels stay valid indices.
                change_entries = []
                for block_index, block in enumerate(search_replace_blocks):
                    if not block.search_block.strip() and not block.replace_block.strip():
                        continue
                    change_entries.append(
                        f"[Search Replace Block {block_index + 1}]\n{format_search_replace(block)}\n"
                    )
                changes_list = "".join(change_entries)

                generated_plan = call_openai(
                    f"File contents: \n[{row['old_file']}]\n```\n{row['old_contents']}\n```",
                    changes_list,
                    row['subject'],
                )["steps"]
                # The prompt asks for a list, but tolerate a single step object.
                if not isinstance(generated_plan, list):
                    generated_plan = [generated_plan]

                block_count = len(search_replace_blocks)
                for step in generated_plan:
                    block_numbers = step["corresponding_sr_blocks"]
                    # Block numbers are 1-based. Without this check a 0 or a
                    # negative number would wrap around (index -1 is the last
                    # block) and silently attach the wrong block to the step.
                    references_valid = all(
                        isinstance(number, int) and 1 <= number <= block_count
                        for number in block_numbers
                    )
                    if not references_valid:
                        print("Skipping plan step with invalid search-replace block references: ", block_numbers)
                        continue

                    yield {
                        "filepath": step["filepath"],
                        "file": row['old_file'],
                        "location": step["location"],
                        "code_change": step["code_change"],
                        "changes": "\n".join(
                            format_search_replace(search_replace_blocks[number - 1])
                            for number in block_numbers
                        ),
                    }
            except Exception as e:
                # Best effort: report the failure and move on to the next commit.
                print("There was an error loading: ", str(e))

    generated_rows = list(generate_rows())
    generated_dataset = Dataset.from_list(generated_rows)
    print(json.dumps(generated_rows, indent=4))
    return generated_dataset
    

In [17]:
# Build one generated dataset per configured language, in the order listed.
generated_datasets = list(map(load_language, languages))
print(generated_datasets)


Loading language: python
Generated breakdown:  {'steps': [{'filepath': 'blaze/__init__.py', 'location': 'End of file or after the import statements.', 'code_change': 'Add a function to run the Blaze test suite using nose tests. This function should discover and run tests in any \'tests\' subdirectory within the Blaze module, handle verbosity levels, and optionally output results to an xunit file. It should look like this:\n\ndef test(verbosity=1, xunitfile=None, exit=False):\n    """\n    Runs the full Blaze test suite, outputting\n    the results of the tests to sys.stdout.\n\n    This uses nose tests to discover which tests to\n    run, and runs tests in any \'tests\' subdirectory\n    within the Blaze module.\n\n    Parameters\n    ----------\n        Value 0 prints very little, 1 prints a little bit,\n        and 2 prints the test names while testing.\n    xunitfile : string, optional\n        If provided, writes the test results to an xunit\n        style xml file. This is useful 

In [18]:
from datasets import concatenate_datasets
# Merge the per-language datasets into a single dataset and persist it to a
# local directory relative to the notebook's working directory.
ds = concatenate_datasets(generated_datasets)
ds.save_to_disk("./plan_to_sr_v2")

Saving the dataset (1/1 shards): 100%|██████████| 304/304 [00:00<00:00, 32264.69 examples/s]


In [21]:
# Publish the merged dataset to the Hugging Face Hub repository named below.
# NOTE(review): presumably requires an authenticated Hub session with write
# access to this repo — confirm before re-running.
ds.push_to_hub("vdaita/commitpack-ft-sr")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 153.49ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.47s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/vdaita/commitpack-ft-sr/commit/4e0c3ae7954bdb32e3327b668b3e0144ad1df14b', commit_message='Upload dataset', commit_description='', oid='4e0c3ae7954bdb32e3327b668b3e0144ad1df14b', pr_url=None, pr_revision=None, pr_num=None)