Skip to content

Commit

Permalink
Fix SWE-Bench evaluation due to setuptools version (OpenDevin#1995)
Browse files Browse the repository at this point in the history
* correctly setup plugins for swebench eval

* bump swe-bench version and add logging

* Revert "correctly setup plugins for swebench eval"

This reverts commit 2bd1055.

* bump version
  • Loading branch information
xingyaoww authored and super-dainiu committed May 23, 2024
1 parent 9d8d050 commit 862f96e
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 28 deletions.
4 changes: 2 additions & 2 deletions evaluation/swe_bench/BUILD_TESTBED_AND_ENV.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,6 @@ Run the following command to do the above two steps. The results will be saved t

```bash
pushd evaluation/swe_bench
docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.0 -f ./scripts/docker/Dockerfile.full.v1.0 .
docker push ghcr.io/opendevin/eval-swe-bench:full-v1.0
docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.1 -f ./scripts/docker/Dockerfile.full.v1.1 .
docker push ghcr.io/opendevin/eval-swe-bench:full-v1.1
```
2 changes: 1 addition & 1 deletion evaluation/swe_bench/EVAL_PATCH.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ Before evaluating generated patches, you need to set up the Docker environment.
```shell
docker run -it \
-v DIR_TO_YOUR_PATCH_FILES_ON_HOST:/swe_bench_output \
ghcr.io/opendevin/eval-swe-bench:full-v1.0 /bin/bash
ghcr.io/opendevin/eval-swe-bench:full-v1.1 /bin/bash
```

### Evaluate Model Generated Patches
Expand Down
4 changes: 2 additions & 2 deletions evaluation/swe_bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ In [OpenDevin-SWE-Bench fork](https://github.com/OpenDevin/OD-SWE-bench.git) (mo
**We pack everything you need for SWE-Bench evaluation into one gigantic Docker image.** To use it:

```bash
docker pull ghcr.io/opendevin/eval-swe-bench:full-v1.0
docker pull ghcr.io/opendevin/eval-swe-bench:full-v1.1
```

The Docker image contains several important directories:
Expand Down Expand Up @@ -68,7 +68,7 @@ temperature = 0.0

## Test if your environment works

Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench:full-v1.0`
Make sure your Docker daemon is running and that you have pulled the `eval-swe-bench:full-v1.1`
Docker image. Then run this Python script:

```bash
Expand Down
2 changes: 2 additions & 0 deletions evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,8 @@ def process_instance(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')

if not skip_workspace_mount:
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
/eval_workspace/ /swe_util/

# pushd evaluation/swe_bench
# docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.0 -f ./scripts/docker/Dockerfile.full.v1.0 .
# docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.1 -f ./scripts/docker/Dockerfile.full.v1.1 .
2 changes: 1 addition & 1 deletion evaluation/swe_bench/scripts/eval_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ docker run --rm \
-e OD_SWE_BENCH=/swe_util/OD-SWE-bench \
-e EVAL_DATA_DIR=/swe_util/eval_data \
-w /swe_util \
ghcr.io/opendevin/eval-swe-bench:full-v1.0 \
ghcr.io/opendevin/eval-swe-bench:full-v1.1 \
bash -c "./get_agent_report.sh --output-file /swe_bench_output/$FILE_NAME \
--agent-name CodeActAgent \
--dataset swe-bench-test-lite \
Expand Down
35 changes: 14 additions & 21 deletions evaluation/swe_bench/swe_env_box.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import sys
import uuid

from datasets import load_dataset

from opendevin.core.config import config
from opendevin.core.logger import opendevin_logger as logger
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.plugins import JupyterRequirement, SWEAgentCommandsRequirement

SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.0'
SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.1'


class SWEBenchSSHBox(DockerSSHBox):
Expand Down Expand Up @@ -123,20 +125,15 @@ def get_diff_patch(self):


if __name__ == '__main__':
EXAMPLE_INSTANCE = {
'repo': 'django/django',
'instance_id': 'django__django-11099',
'base_commit': 'd26b2424437dabeeca94d7900b37d2df4410da0c',
'patch': "diff --git a/django/contrib/auth/validators.py b/django/contrib/auth/validators.py\n--- a/django/contrib/auth/validators.py\n+++ b/django/contrib/auth/validators.py\n@@ -7,7 +7,7 @@\n \n @deconstructible\n class ASCIIUsernameValidator(validators.RegexValidator):\n- regex = r'^[\\w.@+-]+$'\n+ regex = r'^[\\w.@+-]+\\Z'\n message = _(\n 'Enter a valid username. This value may contain only English letters, '\n 'numbers, and @/./+/-/_ characters.'\n@@ -17,7 +17,7 @@ class ASCIIUsernameValidator(validators.RegexValidator):\n \n @deconstructible\n class UnicodeUsernameValidator(validators.RegexValidator):\n- regex = r'^[\\w.@+-]+$'\n+ regex = r'^[\\w.@+-]+\\Z'\n message = _(\n 'Enter a valid username. This value may contain only letters, '\n 'numbers, and @/./+/-/_ characters.'\n",
'test_patch': "diff --git a/tests/auth_tests/test_validators.py b/tests/auth_tests/test_validators.py\n--- a/tests/auth_tests/test_validators.py\n+++ b/tests/auth_tests/test_validators.py\n@@ -237,7 +237,7 @@ def test_unicode_validator(self):\n invalid_usernames = [\n \"o'connell\", \"عبد ال\",\n \"zerowidth\\u200Bspace\", \"nonbreaking\\u00A0space\",\n- \"en\\u2013dash\",\n+ \"en\\u2013dash\", 'trailingnewline\\u000A',\n ]\n v = validators.UnicodeUsernameValidator()\n for valid in valid_usernames:\n@@ -250,7 +250,7 @@ def test_unicode_validator(self):\n \n def test_ascii_validator(self):\n valid_usernames = ['glenn', 'GLEnN', 'jean-marc']\n- invalid_usernames = [\"o'connell\", 'Éric', 'jean marc', \"أحمد\"]\n+ invalid_usernames = [\"o'connell\", 'Éric', 'jean marc', \"أحمد\", 'trailingnewline\\n']\n v = validators.ASCIIUsernameValidator()\n for valid in valid_usernames:\n with self.subTest(valid=valid):\n",
'problem_statement': "UsernameValidator allows trailing newline in usernames\nDescription\n\t\nASCIIUsernameValidator and UnicodeUsernameValidator use the regex \nr'^[\\w.@+-]+$'\nThe intent is to only allow alphanumeric characters as well as ., @, +, and -. However, a little known quirk of Python regexes is that $ will also match a trailing newline. Therefore, the user name validators will accept usernames which end with a newline. You can avoid this behavior by instead using \\A and \\Z to terminate regexes. For example, the validator regex could be changed to\nr'\\A[\\w.@+-]+\\Z'\nin order to reject usernames that end with a newline.\nI am not sure how to officially post a patch, but the required change is trivial - using the regex above in the two validators in contrib.auth.validators.\n",
'hints_text': '',
'created_at': '2019-03-20T03:46:18Z',
'version': '3.0',
'FAIL_TO_PASS': '["test_ascii_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_unicode_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_help_text (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)"]',
'PASS_TO_PASS': '["test_help_text (auth_tests.test_validators.MinimumLengthValidatorTest)", "test_validate (auth_tests.test_validators.MinimumLengthValidatorTest)", "test_help_text (auth_tests.test_validators.NumericPasswordValidatorTest)", "test_validate (auth_tests.test_validators.NumericPasswordValidatorTest)", "test_validate (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)", "test_validate_property (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)", "test_empty_password_validator_help_text_html (auth_tests.test_validators.PasswordValidationTest)", "test_get_default_password_validators (auth_tests.test_validators.PasswordValidationTest)", "test_get_password_validators_custom (auth_tests.test_validators.PasswordValidationTest)", "test_password_changed (auth_tests.test_validators.PasswordValidationTest)", "test_password_changed_with_custom_validator (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_text_html (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_text_html_escaping (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_texts (auth_tests.test_validators.PasswordValidationTest)", "test_validate_password (auth_tests.test_validators.PasswordValidationTest)", "test_help_text (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate_custom_list (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate_django_supplied_file (auth_tests.test_validators.CommonPasswordValidatorTest)"]',
'environment_setup_commit': '419a78300f7cd27611196e1e464d50fd0385ff27',
}
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
swe_bench_tests = dataset['test'].to_pandas()

# INSTANCE_ID = 'django__django-11099'
INSTANCE_ID = 'astropy__astropy-12907'
swe_bench_tests = swe_bench_tests[swe_bench_tests['instance_id'] == INSTANCE_ID]
EXAMPLE_INSTANCE = swe_bench_tests.iloc[0].to_dict()

sandbox = SWEBenchSSHBox.get_box_for_instance(instance=EXAMPLE_INSTANCE)

Expand All @@ -154,9 +151,7 @@ def get_diff_patch(self):
logger.info(f'git apply $SWE_TASK_DIR/test.patch: {output}')

# TEST
exit_code, output = sandbox.execute(
'./tests/runtests.py --verbosity 2 auth_tests.test_validators'
)
exit_code, output = sandbox.execute('$TEST_CMD')
assert exit_code == 1, 'Expected exit code 1 (since this is a FAIL_TO_PASS)'
logger.info(f'$TEST_CMD:\n{output}')

Expand All @@ -166,9 +161,7 @@ def get_diff_patch(self):
logger.info(f'git apply $SWE_TASK_DIR/gold.patch: {output}')

# TEST
exit_code, output = sandbox.execute(
'./tests/runtests.py --verbosity 2 auth_tests.test_validators'
)
exit_code, output = sandbox.execute('$TEST_CMD')
assert exit_code == 0, 'Expected exit code 0 (since we applied the gold patch)'
logger.info(f'$TEST_CMD:\n{output}')

Expand Down

0 comments on commit 862f96e

Please sign in to comment.