In [23]:
!pip install -q datasets openai

      40
[33mDEPRECATION: Loading egg at /opt/homebrew/lib/python3.11/site-packages/pymagnitude-0.1.140-py3.11-macosx-14.0-arm64.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mCollecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached multiprocess-0.70.16-py31

In [96]:
from pathlib import Path
import json
import random

eval_dir = Path("evaluation/lite")
ds = load_dataset("princeton-nlp/SWE-bench_Lite")

In [None]:
from dataclasses import dataclass
from typing import *
# datasets.logging.set_verbosity(datasets.logging.CRITICAL)


@dataclass
class IssueSubmission:
    instance_id: str
    problem_statement: str
    model_patch: str
    gold_patch: str
    resolved_state: str
    submission_dir: str


instance_to_state = {}

def get_instances(max_items: Optional[int] = None, random_shuffle: bool = True) -> List[IssueSubmission]:
    instances = []

    all_dirs = list(eval_dir.iterdir())
    if random_shuffle:
        random.shuffle(all_dirs)

    for current_dir in all_dirs:
        print(f"Using SWEBench submission at {current_dir}")

        print("Loading results.json...")
        with open(current_dir / "results/results.json", "r") as file:
            swebench_results = json.load(file)
        print("Finished loading results.json")
    
        # Amend reverse map
        for state, instance_ids in swebench_results.items():
            for iid in instance_ids:
                instance_to_state[iid] = state

        print("Loading submitted preds...")
        with open(current_dir / "all_preds.jsonl", "r") as file:
            submitted_preds = [json.loads(line) for line in file]
        print(f"Loadeded submitted preds, length={len(submitted_preds)}")

        for submitted_pred in submitted_preds:
            instance_id, model_patch = submitted_pred["instance_id"], submitted_pred["model_patch"]
            resolved_state = instance_to_state[instance_id]

            issue_row = ds["test"].filter(lambda row: row["instance_id"] == instance_id, disable_tqdm=True)[0]
            gold_patch, problem_statement = issue_row["patch"], issue_row["problem_statement"]

            # print("Adding instance of id ", instance_id)
            instances.append(IssueSubmission(instance_id, problem_statement, model_patch, gold_patch, resolved_state, current_dir))

            if max_items is not None and len(instances) > max_items:
                print(f"Reached limit of {max_items} instances. Returning...")
                return instances
        print(f"Reached end of current dir {current_dir}. Instances: {len(instances)}")

    return instances

In [None]:
import openai
import os
import sys
import json

# Todo: replace this with corcel impl
openai.api_key = ""

def eval(instance_id, problem_statement, model_patch, gold_patch, resolved_state, submission_dir):
    LLM_PROMPT = """
You are being asked to identify important differences between 2 solutions to a given problem. The problem is as follows:
```
{problem_statement}
```
The 1st solution is:
```
{model_patch}
```
The second solution is:
```
{gold_patch}
```
First, list all the differences between these patches of code that you can think of. 
Then, let's assume that the second patch is a correct solution to the problem. 
Based on the similarity to this patch and the differences you outlined, would you expect the first patch to also be a correct solution? 
Reason about why or why not. 
Then, output a 1 if you think the first patch will address the problem, and a 0 if you think it will not. 
Do not use markdown, and do not say anything after the response digit.
"""
    prompt = LLM_PROMPT.format(model_patch=model_patch, gold_patch=gold_patch, problem_statement=problem_statement)
    # print(f"Issue state: {resolved_state}")
    # print(f"\n LLM Prompt:\n{prompt}")
    response = openai.ChatCompletion.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}],
    )
    comparison = response['choices'][0]['message']['content']
    # print(f"\nOutput:\n{comparison}")
    return comparison[-1]

instances = get_instances(max_items=100, random_shuffle=False)
outputs = []
for instance in instances:
    try:
        print(f"Instance ID: {instance.instance_id}")
        eval_output = eval(**vars(instance))
        print(f"Issue {instance.instance_id} with state {instance.resolved_state}, model output was {eval_output}")
        if eval_output.isdigit() and int(eval_output) in [0, 1]:
            outputs.append([instance.instance_id, int(eval_output), instance])
        else:
            print(f"Skipping eval of issue {instance.instance_id} because model output ended with '{eval_output}' instead of 0 or 1")
    except BaseError as e:
        print(f"Ran into error: {e}")
        continue

Using SWEBench submission at evaluation/lite/20240612_MASAI_gpt4o
Loading results.json...
Finished loading results.json
Loading submitted preds...
Loadeded submitted preds, length=287
Reached limit of 100 instances. Returning...
Instance ID: matplotlib__matplotlib-25311
Issue matplotlib__matplotlib-25311 with state applied, model output was 0
Instance ID: django__django-11179
Issue django__django-11179 with state resolved, model output was 0
Instance ID: sympy__sympy-22840
Issue sympy__sympy-22840 with state applied, model output was 0
Instance ID: django__django-11964
Issue django__django-11964 with state applied, model output was 0
Instance ID: django__django-12453
Issue django__django-12453 with state resolved, model output was 0
Instance ID: django__django-13315
Issue django__django-13315 with state applied, model output was 0
Instance ID: scikit-learn__scikit-learn-11281
Issue scikit-learn__scikit-learn-11281 with state applied, model output was 0
Instance ID: sympy__sympy-20639
I

In [192]:
# output
zeros = [instance_to_state[instance_id] for instance_id, v, issue_submission in outputs if v == 0]
ones = [instance_to_state[instance_id] for instance_id, v, issue_submission in outputs if v == 1]

from collections import Counter

print("zeros", Counter(zeros))
print("ones", Counter(ones))

zeros Counter({'applied': 23, 'no_generation': 8, 'resolved': 7})
ones Counter({'resolved': 27, 'applied': 22, 'no_generation': 3})


In [195]:
# output
zeros = [instance_to_state[instance_id] for instance_id, v, issue_submission in outputs if v == 0]
ones = [instance_to_state[instance_id] for instance_id, v, issue_submission in outputs if v == 1]

from collections import Counter

print("zeros", Counter(zeros))
print("ones", Counter(ones))

zeros Counter({'applied': 54, 'resolved': 4})
ones Counter({'resolved': 27, 'applied': 16})


In [None]:
# output
zeros = [instance_to_state[instance_id] for instance_id, v, issue_submission in outputs if v == 0]
ones = [instance_to_state[instance_id] for instance_id, v, issue_submission in outputs if v == 1]

from collections import Counter

print("zeros", Counter(zeros))
print("ones", Counter(ones))