In [5]:
from datasets import load_dataset, IterableDataset

In [6]:
import json
from brx import BRX, sftoq, uif
from dotenv import load_dotenv
import os
from openai import OpenAI
from pydantic import BaseModel, Field, TypeAdapter
from typing import List

# load_dotenv("../../.env")
brx_client = BRX(os.environ["BRX_ACCESS_TOKEN"])
openai_client = OpenAI(api_key=os.environ["TOGETHER_API_KEY"], base_url="https://api.together.xyz")
# openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


class PlanStep(BaseModel):
    location: str = Field(description="Describes where in the file the code changes should be made. Very short code snippets can be used as reference.")
    instruction: str = Field(description="Write the new code that should be inserted and describe how it should be inserted in the code.")

def call_openai(file_contents, search_replaces, commit_message):
    system_prompt = """Given a file, a single search-replace change made by the developer and the original commit message, write an instruction that would have been followed by the developer to perform the search-replacement. Clearly state variable, function, class, and type definitions wherever relevant.
    Format your response using JSON. Output a list of PlanStep objects under the attribute "steps".

    Each PlanStep object has the following properties:

    filepath: A string that specifies the path to the file where changes need to be made.
    location: A string that describes where in the file the changes should be made. In addition to a regular English description, you can add a very short code snippet used as a reference.

    All these properties are required for a PlanStep object to be valid.

    Here's an example PlanStep object, represented in JSON format:
    {
        "location": "At the function definition of function name: \n ```def function_nam()e:```",
        "instruction": "Add a print statement to debug at the top of the function_name function. \nprint('Debugging...') ",
    }

    Here's another example PlanStep object:
    {
        "location": "Across the whole main function",
        "instruction": "Overwrite the function with this: 
```python
def main():
    print("Example")
```",
    }
    """
    user_prompt = f"# File contents: \n {file_contents} \n \n ----- \n \n# Changes to the files: \n{search_replaces}\n\n -----\n# Original commit message:\n{commit_message}"
    # list_type_adapter = TypeAdapter(List[PlanStep])
    
    response = openai_client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        # model="togethercomputer/CodeLlama-34b-Instruct",
        # model="gpt-4-turbo-preview",
        response_format={"type": "json_object", "schema": PlanStep.model_json_schema()}
        # response_format={"type": "json_object", "schema": PlanStepList.model_json_schema()}
        # response_format={"type": "json_object"}
    )
    generated_breakdown = json.loads(response.choices[0].message.content)
    print("Generated breakdown: ", generated_breakdown)
    return generated_breakdown

def call_brx(file_contents, original_commit_message, applied_patch):
    schema = json.dumps({"description":"Given an original file, a starting commit message, and a patch applied, write a one-sentence description of changes made and a numbered step-by-step plan for generating those changes.","brxName":"Synthetic Dataset Generator for Patches","brxId":"bc7c33e6-8f01-4a34-8846-e084eaae4e84","dependantBrxIds":{},"processType":8,"schemas":{"mainBrxId":"bc7c33e6-8f01-4a34-8846-e084eaae4e84","schemas":{"_isMap":True,"data":[["main_brx_entry_schema",{"schemaFields":{"_isMap":True,"data":[["file_contents",{"fieldValueDataType":"string","fieldValue":"testval"}],["original_commit_message",{"fieldValueDataType":"string","fieldValue":"testval"}],["applied_patch",{"fieldValueDataType":"string","fieldValue":"testval"}]]},"brxName":"Synthetic Dataset Generator for Patches","brxId":"bc7c33e6-8f01-4a34-8846-e084eaae4e84"}]]}}})
    query_rebuild = sftoq(schema)
    output_object = query_rebuild["brx_query"]
    input_fields = query_rebuild["input_fields"]

    input_dict = {
        "file_contents": file_contents,
        "original_commit_message": original_commit_message,
        "applied_patch": applied_patch
    }

    for index, field in enumerate(input_fields):
        for dict_key in input_dict.keys():
            if field["name"] == dict_key:
                input_fields[index]["value"] = input_dict[dict_key]
        
    updated_query = uif(input_fields, output_object)
    result = json.loads(brx_client.execute(updated_query["brx_query"])["brxRes"]["output"])
    return result

In [7]:
def format_search_replace(block):
    return f"""```
{block.filepath}
<<<<<<< SEARCH
{block.search_block}
=======
{block.replace_block}
>>>>>>> REPLACE
```"""

In [8]:
test_openai = call_openai("Test", "test", "test")

Generated breakdown:  {'location': "At the word 'Test' in the file", 'code_change': "Replace 'Test' with 'test'. This is a case-sensitive change, so make sure to replace 'Test' with 'test' and not 'TEST'. The word 'Test' is the first word in the file, so it should be easy to find and replace it accordingly."}


In [9]:
from datasets import Dataset
import difflib
from superdocs_python.utils.diff_utils import parse_diff
import json

languages = ["python", "jsx", "kotlin", "vue", "xml", "javascript", "java", "c#", "ruby", "c", "c++", "typescript", "dart"]

def load_language(language_name, selected_length=15):
    print(f"Loading language: {language_name}")
    dataset = load_dataset("bigcode/commitpackft", language_name, split="train")
    dataset = dataset.shuffle(seed=42).select(range(selected_length))
    dataset = dataset.to_iterable_dataset()

    def generate_rows():
        for row in dataset:
            try:
                udiff = "\n".join(difflib.unified_diff(row['old_contents'].splitlines(), row['new_contents'].splitlines(), fromfile=row['old_file'], tofile=row['new_file'], n=3))
                search_replace_blocks = parse_diff(udiff)

                for block in search_replace_blocks:
                    fmt_block = format_search_replace(block)
                    generated_inst = call_openai(f"File contents: \n[{row['old_file']}]\n```\n{row['old_contents']}\n```", fmt_block, row['subject'])
                    yield {
                        "filepath": row['old_file'],
                        "file_contents": row['old_contents'],
                        "location": generated_inst['location'],
                        "code_change_inst": generated_inst['instruction'],
                        "code_change": fmt_block
                    }
            except Exception as e:
                print("There as an error loading: ", str(e))
    generated_rows = list(generate_rows())
    generated_dataset = Dataset.from_list(generated_rows)
    print(json.dumps(generated_rows, indent=4))
    return generated_dataset
    # generated_datasets.append(Dataset.from_generator(generate_rows))
    

In [10]:
generated_datasets = [load_language(language_name) for language_name in languages]
print(generated_datasets)


Loading language: python
There as an error loading:  'instruction'
Generated breakdown:  {'location': 'At the glob.glob function call inside the setting_module fixture definition in the ideascube/tests/test_settings.py file', 'code_change': "Modify the glob.glob function call to exclude directories by adding the condition `if not f.endswith('/__init__.py')`. The modified fixture definition should look like this:\n```python\n@pytest.fixture(params=sorted([\n    f for f in glob.glob('ideascube/conf/*.py')\n    if not f.endswith('/__init__.py')\n]))\ndef setting_module(request):\n    basename = os.path.basename(request.param)\n```"}
There as an error loading:  'instruction'
Generated breakdown:  {'location': 'At the function definition of function prime_factors', 'code_change': 'Replace the function definition and its contents with the following:\n```python\ndef prime_factors(number):\n    num = 2\n    factors = []\n    while num <= number:\n        if (number % num) == 0:\n            nu

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading data: 100%|██████████| 21.6M/21.6M [00:00<00:00, 55.2MB/s]
Generating train split: 8506 examples [00:00, 37803.78 examples/s]


Generated breakdown:  {'location': 'In the main function definition in the file hr.c', 'code_change': 'Change the function signature from `int main (int argc, char * * argv)` to `int main (int argc, char * argv [])` and the assignment of COLS to `int COLS = ( w.ws_col <= 0 ? 80 : w.ws_col);`'}
There as an error loading:  'instruction'
Generated breakdown:  {'location': 'At the definition of kPHConnectionManagerApplication and kPHConnectionManagerDefaultRoomName constants in the PerchRTC/PHCredentials.h file', 'code_change': 'Modify the values of the constants as follows:\nstatic NSString *kPHConnectionManagerApplication = @"default";\nstatic NSString *kPHConnectionManagerDefaultRoomName = @"default";'}
There as an error loading:  'instruction'
Generated breakdown:  {'location': 'In the function prototype of List_Search function', 'code_change': 'Add a missing parenthesis around f as parameter.  \nChange this: int f(void*, void*);\nTo this: int (f)(void*, void*);'}
There as an error loa

In [11]:
from datasets import concatenate_datasets
ds = concatenate_datasets(generated_datasets)
ds.save_to_disk("./plan_to_sr_v2")

Saving the dataset (0/1 shards): 0 examples [00:00, ? examples/s]


SchemaInferenceError: Please pass `features` or at least one example when writing data

In [None]:
ds.push_to_hub("vdaita/commitpack-ft-sr")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 153.49ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.47s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/vdaita/commitpack-ft-sr/commit/4e0c3ae7954bdb32e3327b668b3e0144ad1df14b', commit_message='Upload dataset', commit_description='', oid='4e0c3ae7954bdb32e3327b668b3e0144ad1df14b', pr_url=None, pr_revision=None, pr_num=None)