Fix SWE-Bench evaluation due to setuptools version (OpenDevin#1995)

* correctly setup plugins for swebench eval
* bump swe-bench version and add logging
* Revert "correctly setup plugins for swebench eval"
  This reverts commit 2bd1055.
* bump version
super-dainiu · May 23, 2024 · 862f96e · 862f96e
1 parent 9d8d050
commit 862f96e
Show file tree

Hide file tree

Showing 7 changed files with 23 additions and 28 deletions.
diff --git a/evaluation/swe_bench/BUILD_TESTBED_AND_ENV.md b/evaluation/swe_bench/BUILD_TESTBED_AND_ENV.md
@@ -34,6 +34,6 @@ Run the following command to do the above two steps. The results will be saved t
 
 ```bash
 pushd evaluation/swe_bench
-docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.0 -f ./scripts/docker/Dockerfile.full.v1.0 .
-docker push ghcr.io/opendevin/eval-swe-bench:full-v1.0
+docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.1 -f ./scripts/docker/Dockerfile.full.v1.1 .
+docker push ghcr.io/opendevin/eval-swe-bench:full-v1.1
 ```
diff --git a/evaluation/swe_bench/EVAL_PATCH.md b/evaluation/swe_bench/EVAL_PATCH.md
@@ -117,7 +117,7 @@ Before evaluating generated patches, you need to set up the Docker environment.
 ```shell
 docker run -it \
 -v DIR_TO_YOUR_PATCH_FILES_ON_HOST:/swe_bench_output \
-ghcr.io/opendevin/eval-swe-bench:full-v1.0 /bin/bash
+ghcr.io/opendevin/eval-swe-bench:full-v1.1 /bin/bash
 ```
 
 ### Evaluate Model Generated Patches

diff --git a/evaluation/swe_bench/README.md b/evaluation/swe_bench/README.md
@@ -15,7 +15,7 @@ In [OpenDevin-SWE-Bench fork](https://github.com/OpenDevin/OD-SWE-bench.git) (mo
 **We pack everything you need for SWE-Bench evaluation into one, gigantic, docker image.** To use it:
 
 ```bash
-docker pull ghcr.io/opendevin/eval-swe-bench:full-v1.0
+docker pull ghcr.io/opendevin/eval-swe-bench:full-v1.1
 ```
 
 The Docker image contains several important directories:
@@ -68,7 +68,7 @@ temperature = 0.0
 
 ## Test if your environment works
 
-Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench:full-v1.0`
+Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench:full-v1.1`
 docker image. Then run this python script:
 
 ```bash

diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
@@ -222,6 +222,8 @@ def process_instance(
             logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
         )
         logger.addHandler(file_handler)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
 
     if not skip_workspace_mount:
         logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')

diff --git a/evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.0 b/evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.1
@@ -10,4 +10,4 @@ RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
     /eval_workspace/ /swe_util/
 
 # pushd evaluation/SWE-bench
-# docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.0 -f ./scripts/docker/Dockerfile.full.v1.0 .
+# docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.1 -f ./scripts/docker/Dockerfile.full.v1.1 .
diff --git a/evaluation/swe_bench/scripts/eval_infer.sh b/evaluation/swe_bench/scripts/eval_infer.sh
@@ -26,7 +26,7 @@ docker run --rm \
     -e OD_SWE_BENCH=/swe_util/OD-SWE-bench \
     -e EVAL_DATA_DIR=/swe_util/eval_data \
     -w /swe_util \
-    ghcr.io/opendevin/eval-swe-bench:full-v1.0 \
+    ghcr.io/opendevin/eval-swe-bench:full-v1.1 \
     bash -c "./get_agent_report.sh --output-file /swe_bench_output/$FILE_NAME \
     --agent-name CodeActAgent \
     --dataset swe-bench-test-lite \

diff --git a/evaluation/swe_bench/swe_env_box.py b/evaluation/swe_bench/swe_env_box.py
@@ -1,12 +1,14 @@
 import sys
 import uuid
 
+from datasets import load_dataset
+
 from opendevin.core.config import config
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 from opendevin.runtime.plugins import JupyterRequirement, SWEAgentCommandsRequirement
 
-SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.0'
+SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.1'
 
 
 class SWEBenchSSHBox(DockerSSHBox):
@@ -123,20 +125,15 @@ def get_diff_patch(self):
 
 
 if __name__ == '__main__':
-    EXAMPLE_INSTANCE = {
-        'repo': 'django/django',
-        'instance_id': 'django__django-11099',
-        'base_commit': 'd26b2424437dabeeca94d7900b37d2df4410da0c',
-        'patch': "diff --git a/django/contrib/auth/validators.py b/django/contrib/auth/validators.py\n--- a/django/contrib/auth/validators.py\n+++ b/django/contrib/auth/validators.py\n@@ -7,7 +7,7 @@\n \n @deconstructible\n class ASCIIUsernameValidator(validators.RegexValidator):\n-    regex = r'^[\\w.@+-]+$'\n+    regex = r'^[\\w.@+-]+\\Z'\n     message = _(\n         'Enter a valid username. This value may contain only English letters, '\n         'numbers, and @/./+/-/_ characters.'\n@@ -17,7 +17,7 @@ class ASCIIUsernameValidator(validators.RegexValidator):\n \n @deconstructible\n class UnicodeUsernameValidator(validators.RegexValidator):\n-    regex = r'^[\\w.@+-]+$'\n+    regex = r'^[\\w.@+-]+\\Z'\n     message = _(\n         'Enter a valid username. This value may contain only letters, '\n         'numbers, and @/./+/-/_ characters.'\n",
-        'test_patch': "diff --git a/tests/auth_tests/test_validators.py b/tests/auth_tests/test_validators.py\n--- a/tests/auth_tests/test_validators.py\n+++ b/tests/auth_tests/test_validators.py\n@@ -237,7 +237,7 @@ def test_unicode_validator(self):\n         invalid_usernames = [\n             \"o'connell\", \"عبد ال\",\n             \"zerowidth\\u200Bspace\", \"nonbreaking\\u00A0space\",\n-            \"en\\u2013dash\",\n+            \"en\\u2013dash\", 'trailingnewline\\u000A',\n         ]\n         v = validators.UnicodeUsernameValidator()\n         for valid in valid_usernames:\n@@ -250,7 +250,7 @@ def test_unicode_validator(self):\n \n     def test_ascii_validator(self):\n         valid_usernames = ['glenn', 'GLEnN', 'jean-marc']\n-        invalid_usernames = [\"o'connell\", 'Éric', 'jean marc', \"أحمد\"]\n+        invalid_usernames = [\"o'connell\", 'Éric', 'jean marc', \"أحمد\", 'trailingnewline\\n']\n         v = validators.ASCIIUsernameValidator()\n         for valid in valid_usernames:\n             with self.subTest(valid=valid):\n",
-        'problem_statement': "UsernameValidator allows trailing newline in usernames\nDescription\n\t\nASCIIUsernameValidator and UnicodeUsernameValidator use the regex \nr'^[\\w.@+-]+$'\nThe intent is to only allow alphanumeric characters as well as ., @, +, and -. However, a little known quirk of Python regexes is that $ will also match a trailing newline. Therefore, the user name validators will accept usernames which end with a newline. You can avoid this behavior by instead using \\A and \\Z to terminate regexes. For example, the validator regex could be changed to\nr'\\A[\\w.@+-]+\\Z'\nin order to reject usernames that end with a newline.\nI am not sure how to officially post a patch, but the required change is trivial - using the regex above in the two validators in contrib.auth.validators.\n",
-        'hints_text': '',
-        'created_at': '2019-03-20T03:46:18Z',
-        'version': '3.0',
-        'FAIL_TO_PASS': '["test_ascii_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_unicode_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_help_text (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)"]',
-        'PASS_TO_PASS': '["test_help_text (auth_tests.test_validators.MinimumLengthValidatorTest)", "test_validate (auth_tests.test_validators.MinimumLengthValidatorTest)", "test_help_text (auth_tests.test_validators.NumericPasswordValidatorTest)", "test_validate (auth_tests.test_validators.NumericPasswordValidatorTest)", "test_validate (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)", "test_validate_property (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)", "test_empty_password_validator_help_text_html (auth_tests.test_validators.PasswordValidationTest)", "test_get_default_password_validators (auth_tests.test_validators.PasswordValidationTest)", "test_get_password_validators_custom (auth_tests.test_validators.PasswordValidationTest)", "test_password_changed (auth_tests.test_validators.PasswordValidationTest)", "test_password_changed_with_custom_validator (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_text_html (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_text_html_escaping (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_texts (auth_tests.test_validators.PasswordValidationTest)", "test_validate_password (auth_tests.test_validators.PasswordValidationTest)", "test_help_text (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate_custom_list (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate_django_supplied_file (auth_tests.test_validators.CommonPasswordValidatorTest)"]',
-        'environment_setup_commit': '419a78300f7cd27611196e1e464d50fd0385ff27',
-    }
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenDevin's repo
+    dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
+    swe_bench_tests = dataset['test'].to_pandas()
+
+    # INSTANCE_ID = 'django__django-11099'
+    INSTANCE_ID = 'astropy__astropy-12907'
+    swe_bench_tests = swe_bench_tests[swe_bench_tests['instance_id'] == INSTANCE_ID]
+    EXAMPLE_INSTANCE = swe_bench_tests.iloc[0].to_dict()
 
     sandbox = SWEBenchSSHBox.get_box_for_instance(instance=EXAMPLE_INSTANCE)
 
@@ -154,9 +151,7 @@ def get_diff_patch(self):
     logger.info(f'git apply $SWE_TASK_DIR/test.patch: {output}')
 
     # TEST
-    exit_code, output = sandbox.execute(
-        './tests/runtests.py --verbosity 2 auth_tests.test_validators'
-    )
+    exit_code, output = sandbox.execute('$TEST_CMD')
     assert exit_code == 1, 'Expected exit code 1 (since this is a FAIL_TO_PASS)'
     logger.info(f'$TEST_CMD:\n{output}')
 
@@ -166,9 +161,7 @@ def get_diff_patch(self):
     logger.info(f'git apply $SWE_TASK_DIR/gold.patch: {output}')
 
     # TEST
-    exit_code, output = sandbox.execute(
-        './tests/runtests.py --verbosity 2 auth_tests.test_validators'
-    )
+    exit_code, output = sandbox.execute('$TEST_CMD')
     assert exit_code == 0, 'Expected exit code 0 (since we applied the gold patch)'
     logger.info(f'$TEST_CMD:\n{output}')