In [1]:
from datasets import load_dataset

ds = load_dataset("nuprl/EditPackFT", split="train")
ds = ds.select(range(5000))


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import difflib
import os
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv(".env")
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

def proc_row(row):
    row["patch"] = "\n".join(difflib.unified_diff(row["old_contents"].splitlines(), row["new_contents"].splitlines(), n=3))
    if os.path.exists(f"./cache/{row['commit']}.txt"):
        f = open(f"./cache/{row['commit']}.txt", "r")
        row["inst"] = f.read()
        f.close()
        return row
    
    # Write the commit
    prompt = f"""Given the following file, the corresponding patch made, and the commit message, write a detailed instruction given to a developer in order to produce the patch.
# File:
{row['old_contents']}

# Commit message:
{row['message']}

# Patch:
{row['patch']}

# Describe the changes made in the code in a concise format (two sentences), formatted as an instruction (imperative tense) to an intelligent and independent software developer. Don't mention line numbers or write out long chunks of code (more than 3 lines). Include 1-2 line chunks of code in your instruction if required."""
    
    detailed_inst = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        model="gpt-3.5-turbo"
    ).choices[0].message.content

    f = open(f"./cache/{row['commit']}.txt", "w+")
    f.write(detailed_inst)
    f.close()

    row["inst"] = detailed_inst 
    return row

if not(os.path.exists("./cache")):
    os.mkdir("./cache")

In [3]:
new_row = proc_row(ds[0])
print(new_row["patch"])
print(new_row["inst"])

--- 

+++ 

@@ -9,6 +9,9 @@

 class Broker(object):
     def __init__(self, config):
         self.connection = BrokerConnection(**config)
+        with producers[self.connection].acquire(block=False) as producer:
+            for queue in task_queues:
+                maybe_declare(queue, producer.channel)
 
     def delay(self, func, *args, **kwargs):
         payload = {
@@ -18,8 +21,6 @@

         }
 
         with producers[self.connection].acquire(block=False) as producer:
-            for queue in task_queues:
-                maybe_declare(queue, producer.channel)
             producer.publish(payload,
                 exchange=task_exchange,
                 serializer="pickle",
1. Adjust the `__init__` method in the `Broker` class to include a loop that declares queues using `maybe_declare`.
2. Move the existing loop that declares queues in the `delay` method above the `producer.publish` call.


In [7]:
ds = ds.map(proc_row, num_proc=10)

Map (num_proc=10): 100%|██████████| 5000/5000 [04:46<00:00, 17.46 examples/s]   
  table = cls._concat_blocks(blocks, axis=0)


In [8]:
ds.push_to_hub("vdaita/editpackft_inst")

Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 32.93ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:12<00:00, 12.35s/it]
