In [1]:
import pandas as pd
from pandas import Series
import json
import git
import os
import validators

# Evaluating the LLM-Agent on SWE-Benchmark

We can use two datasets for prediction: `swe-bench.json`, which has 2200 entries, and `swe-bench-dev-dataset.json`, which has 225 entries. Both come from [SWE-Bench](https://github.com/princeton-nlp/SWE-bench/tree/main).

In [2]:
# Load the SWE-bench dev split and print a summary of its columns.
dataset_path = "SWEBench/swe-bench-dev-dataset.json"
df = pd.read_json(dataset_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   repo                      225 non-null    object             
 1   instance_id               225 non-null    object             
 2   base_commit               225 non-null    object             
 3   patch                     225 non-null    object             
 4   test_patch                225 non-null    object             
 5   problem_statement         225 non-null    object             
 6   hints_text                225 non-null    object             
 7   created_at                225 non-null    datetime64[ns, UTC]
 8   version                   225 non-null    float64            
 9   FAIL_TO_PASS              225 non-null    object             
 10  PASS_TO_PASS              225 non-null    object             
 11  environment_setup_c

In [3]:
# Look at the first task instance to see what a single row contains.
first_instance = df.iloc[0]
first_instance

repo                                                        sqlfluff/sqlfluff
instance_id                                           sqlfluff__sqlfluff-4764
base_commit                          a820c139ccbe6d1865d73c4a459945cd69899f8f
patch                       diff --git a/src/sqlfluff/cli/commands.py b/sr...
test_patch                  diff --git a/test/cli/commands_test.py b/test/...
problem_statement           Enable quiet mode/no-verbose in CLI for use in...
hints_text                                                                   
created_at                                          2023-04-16 14:24:42+00:00
version                                                                   1.4
FAIL_TO_PASS                [test/cli/commands_test.py::test__cli__fix_mul...
PASS_TO_PASS                [test/cli/commands_test.py::test__cli__command...
environment_setup_commit             d19de0ecd16d298f9e3bfb91da122734c40c01e5
Name: 0, dtype: object

After we used our LLM on the dataset to generate solutions to the problems, our output needs to be in the following format:
```
{
    "instance_id": "<Unique task instance ID>",
    "model_patch": "<.patch file content string>",
    "model_name_or_path": "<Model name here (i.e. SWE-Llama-13b)>",
}
```
Multiple predictions are collected in a list: `[<prediction 1>, <prediction 2>,... <prediction n>]`.

**Example:**
```
{
    "instance_id": "django__django-15127",
    "model_name_or_path": "test",
    "model_patch": "--- a/django/contrib/messages/storage/base.py\n+++ b/django/contrib/messages/storage/base.py\n@@ -52,6 +52,7 @@\n                 if self._loaded_data is None:\n                     self._loaded_data = self.load()\n                 level, message, extra_tags = self._loaded_data\n+                extra_tags.update(self.get_level_tags())\n                 return {\n                     'message': message,\n                     'level': level,\n"
  },
``` 

# Generating our Predictions

## Defining the AgentWrapper

We first define an `AgentWrapper` whose job is to:
- Clone the repo and reset it to the correct commit.
- Call our internal agent, which makes the changes.
- Stage the changes.
- Compute the git diff, which it returns.

In [4]:
class AgentStub():
    """Minimal stand-in for the real agent, used to exercise the wrapper.

    Instead of solving the task it writes a fixed marker file into the
    repository directory, which gives the caller a known change to diff.
    """

    def __init__(self):
        pass

    def predict(self, row_input, repo_dir):
        """Create ``test_file.md`` inside ``repo_dir``.

        Args:
            row_input: Dataset row describing the task; ignored by the stub.
            repo_dir: Directory of the checked-out repository to modify.

        Returns:
            An empty string; the stub produces no textual result.
        """
        new_file = os.path.join(repo_dir, 'test_file.md')

        # The context manager closes the handle even if the write fails.
        with open(new_file, 'w') as fp:
            fp.write('This is a test file, which test if the git diff gets caluclated correctly.')

        return ""

class AgentWrapper():
    """Runs an agent against a checked-out repository and returns its changes as a diff.

    For one dataset row the wrapper clones the repository (only if it is not
    already present), resets it to the row's base commit, lets the wrapped
    agent edit the working tree, stages the result and returns the staged diff.
    """

    def __init__(self, agent, working_directory="repos", name="stub"):
        """Store the agent and make sure the working directory exists.

        Args:
            agent: Object with a ``predict(row_input, repo_dir)`` method that
                modifies files inside ``repo_dir``.
            working_directory: Directory under which repositories are cloned;
                created if it does not exist.
            name: Label exposed as ``self.name``. Defaults to ``"stub"``, the
                value that was previously hard-coded.
        """
        self.name = name
        self.working_directory = working_directory
        self.agent = agent

        # exist_ok avoids the check-then-create gap of isdir() + makedirs().
        os.makedirs(working_directory, exist_ok=True)

    def predict(self, row_input: Series):
        """Let the agent work on one task and return the resulting patch.

        Args:
            row_input: Dataset row; ``repo`` and ``base_commit`` are read here
                and the whole row is passed on to the agent.

        Returns:
            The output of ``git diff --cached`` after staging the agent's changes.
        """
        repo_dir = self._clone_repo(row_input["repo"], row_input["base_commit"])

        # The agent communicates through the working tree; its return value is not used.
        self.agent.predict(row_input, repo_dir)

        repo = git.Repo(repo_dir)
        # --all stages additions, modifications and deletions in the whole tree.
        repo.git.add("--all")
        return repo.git.diff("--cached")

    def _clone_repo(self, repo_name: str, base_commit: str):
        """Ensure a clean checkout of ``repo_name`` at ``base_commit``.

        Args:
            repo_name: GitHub repository in ``<owner>/<project>`` form.
            base_commit: Commit the working tree is reset to.

        Returns:
            Path of the local repository directory.

        Raises:
            ValueError: If ``repo_name`` is not of the form ``<owner>/<project>``.
            Exception: If the resulting GitHub URL does not validate.
        """
        owner, slash, project = repo_name.partition('/')
        if not (owner and slash and project):
            # Fail with a clear message instead of an IndexError from indexing the split.
            raise ValueError("The Repo name is not of the form '<owner>/<project>': " + repo_name)

        repo_url = "https://github.com/" + repo_name
        repo_dir = os.path.join(self.working_directory, project)

        if not validators.url(repo_url):
            raise Exception("The Repo url is not valid: " + repo_url)

        # Decide on the presence of .git rather than of the directory itself:
        # a directory left behind by a failed clone must not be mistaken for
        # a usable repository on the next run.
        if not os.path.isdir(os.path.join(repo_dir, ".git")):
            os.makedirs(repo_dir, exist_ok=True)

            # clones the repo on which llm will work
            git.Repo.clone_from(repo_url, repo_dir)

        # we need to make sure we have the correct commit stage
        repo = git.Repo(repo_dir)
        repo.git.reset('--hard', base_commit)
        # reset --hard keeps untracked files; remove them so leftovers from an
        # interrupted earlier run cannot end up in this run's diff.
        repo.git.clean('-fd')

        return repo_dir

## Testing AgentWrapper

Test that cloning a repo and checking out the correct git commit both work.

In [5]:
# Base commit of the first task instance, for comparison with the checkout below.
df["base_commit"].iloc[0]

'a820c139ccbe6d1865d73c4a459945cd69899f8f'

In [6]:
# Run the stub agent on the first task instance and show the patch it produces.
stub = AgentStub()
agent = AgentWrapper(agent=stub, working_directory="repos")

print(agent.name)
print("----------------")
first_patch = agent.predict(df.iloc[0])
print(first_patch)

stub
----------------
diff --git a/test_file.md b/test_file.md
new file mode 100644
index 000000000..76c846906
--- /dev/null
+++ b/test_file.md
@@ -0,0 +1 @@
+This is a test file, which test if the git diff gets caluclated correctly.
\ No newline at end of file


## Generating all Predictions

When running this on a server, something may crash or an uncaught error may be thrown, so it is important to write the results to disk after each entry in the dataset.


In [7]:
# This implementation uses checkpoints: if the program is interrupted,
# it can start again where it left off.
#
# The checkpoint file holds the index of the NEXT row to process. A value of 0
# therefore always means "nothing has been written yet", and a value of
# len(df) means the run is complete.

import tempfile
import json

stub = AgentStub()
agent = AgentWrapper(stub, "repos")

checkpoint_file = 'checkpoint.txt'
predictions_file = 'predictions.json'
resume_index = 0

# Check if checkpoint file exists and read the next index to process
try:
    with open(checkpoint_file, 'r') as f:
        resume_index = int(f.read().strip())
except FileNotFoundError:
    pass
except ValueError as e:
    # Falling back to 0 here would throw away all saved predictions, so stop
    # and let the user repair or delete the checkpoint instead.
    raise RuntimeError(f"Error reading checkpoint file: {e}") from e

total_rows = len(df)

if resume_index < total_rows:
    # A fresh start replaces any stale predictions file; a resumed run appends.
    file_mode = 'w' if resume_index == 0 else 'a'

    # Open a file to save predictions
    with open(predictions_file, file_mode) as json_file:
        if resume_index == 0:
            json_file.write('[\n')  # Start of JSON array

        # Generating our solution, starting at the first unprocessed row
        for index in range(resume_index, total_rows):
            row = df.iloc[index]

            # The prediction is computed before anything is written, so an
            # exception raised by the agent leaves the file untouched.
            predictions = {
                "instance_id": row["instance_id"],
                "model_patch": agent.predict(row),
                "model_name_or_path": agent.name
            }
            # Convert the dictionary to a JSON formatted string and write to file
            json_data = json.dumps(predictions, indent=4)

            # The last entry also closes the array, so the file is complete
            # before the final checkpoint is recorded.
            is_last = index == total_rows - 1
            json_file.write(json_data + ('\n]' if is_last else ',\n'))

            # Force the entry to disk before the checkpoint claims it is saved.
            json_file.flush()
            os.fsync(json_file.fileno())

            with open(checkpoint_file, 'w') as f:
                f.write(str(index + 1))