Implement agentskills for OpenDevin to helpfully improve edit AND including more useful tools/skills (All-Hands-AI#1941)

* add draft for skills * Implement and test agentskills functions: open_file, goto_line, scroll_down, scroll_up, create_file, search_dir, search_file, find_file * Remove new_sample.txt file * add some work from opendevin w/ fixes * Add unit tests for agentskills module * fix some issues and updated tests * add more tests for open * tweak and handle goto_line * add tests for some edge cases * add tests for scrolling * add tests for edit * add tests for search_dir * update tests to use pytest * use pytest --forked to avoid file op unit tests to interfere with each other via global var * update doc based on swe agent tool * update and add tests for find_file and search_file * move agent_skills to plugins * add agentskills as plugin and docs * add agentskill to ssh box and fix sandbox integration * remove extra returns in doc * add agentskills to initial tool for jupyter * support re-init jupyter kernel (for agentskills) after restart * fix print window's issue with indentation and add testcases * add prompt for codeact with the newest edit primitives * modify the way line number is presented (remove leading space) * change prompt to the newest display format * support tracking of costs via metrics * Update opendevin/runtime/plugins/agent_skills/README.md * Update opendevin/runtime/plugins/agent_skills/README.md * implement and add tests for py linting * remove extra text arg for incompatible subprocess ver * remove sample.txt * update test_edits integration tests * fix all integration * Update opendevin/runtime/plugins/agent_skills/README.md * Update opendevin/runtime/plugins/agent_skills/README.md * Update opendevin/runtime/plugins/agent_skills/README.md * Update agenthub/codeact_agent/prompt.py Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk> * Update agenthub/codeact_agent/prompt.py Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk> * Update agenthub/codeact_agent/prompt.py Co-authored-by: Boxuan Li 
<liboxuan@connect.hku.hk> * Update opendevin/runtime/plugins/agent_skills/agentskills.py Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk> * correctly setup plugins for swebench eval * bump swe-bench version and add logging * correctly setup plugins for swebench eval * bump swe-bench version and add logging * Revert "correctly setup plugins for swebench eval" This reverts commit 2bd1055. * bump version * remove _AGENT_SKILLS_DOCS * move flake8 to test dep * update poetry.lock * remove extra arg * reduce max iter for eval * update poetry * fix integration tests --------- Co-authored-by: OpenDevin <opendevin@opendevin.ai> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com> Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
super-dainiu · May 23, 2024 · 31f2c4a · 31f2c4a
1 parent 61d7e9a
commit 31f2c4a
Show file tree

Hide file tree

Showing 39 changed files with 2,740 additions and 1,264 deletions.
diff --git a/.github/workflows/run-unit-tests.yml b/.github/workflows/run-unit-tests.yml
@@ -59,7 +59,7 @@ jobs:
         run: make build
 
       - name: Run Tests
-        run: poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
+        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
 
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
@@ -93,7 +93,7 @@ jobs:
         run: make build
 
       - name: Run Tests
-        run: poetry run pytest --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
+        run: poetry run pytest --forked --cov=agenthub --cov=opendevin --cov-report=xml ./tests/unit -k "not test_sandbox"
 
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4

diff --git a/agenthub/codeact_agent/codeact_agent.py b/agenthub/codeact_agent/codeact_agent.py
@@ -24,9 +24,9 @@
 )
 from opendevin.llm.llm import LLM
 from opendevin.runtime.plugins import (
+    AgentSkillsRequirement,
     JupyterRequirement,
     PluginRequirement,
-    SWEAgentCommandsRequirement,
 )
 
 ENABLE_GITHUB = True
@@ -106,7 +106,7 @@ def truncate_observation(observation: str, max_chars: int = 10_000) -> str:
 
 
 class CodeActAgent(Agent):
-    VERSION = '1.4'
+    VERSION = '1.5'
     """
     The Code Act Agent is a minimalist agent.
     The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -144,9 +144,13 @@ class CodeActAgent(Agent):
     """
 
     sandbox_plugins: list[PluginRequirement] = [
+        # NOTE: AgentSkillsRequirement needs to go before JupyterRequirement, since
+        # AgentSkillsRequirement provides a lot of Python functions
+        # and it needs to be initialized before Jupyter for Jupyter to use those functions.
+        AgentSkillsRequirement(),
         JupyterRequirement(),
-        SWEAgentCommandsRequirement(),
     ]
+    jupyter_kernel_init_code: str = 'from agentskills import *'
 
     system_message: str = (
         f'{SYSTEM_PREFIX}\n{GITHUB_MESSAGE}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
@@ -248,7 +252,11 @@ def step(self, state: State) -> Action:
             # a code block was found
             code_group = python_code.group(1).strip()
             thought = action_str.replace(python_code.group(0), '').strip()
-            return IPythonRunCellAction(code=code_group, thought=thought)
+            return IPythonRunCellAction(
+                code=code_group,
+                thought=thought,
+                kernel_init_code=self.jupyter_kernel_init_code,
+            )
         elif browse_command := re.search(
             r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL
         ):

diff --git a/agenthub/codeact_agent/prompt.py b/agenthub/codeact_agent/prompt.py
@@ -1,30 +1,11 @@
-from opendevin.runtime.plugins import SWEAgentCommandsRequirement
+from opendevin.runtime.plugins import AgentSkillsRequirement
 
-_SWEAGENT_BASH_DOCS = '\n'.join(
-    filter(
-        lambda x: not x.startswith('submit'),
-        SWEAgentCommandsRequirement.documentation.split('\n'),
-    )
-)
-# _SWEAGENT_BASH_DOCS content below:
-"""
-open <path> [<line_number>] - opens the file at the given path in the editor. If line_number is provided, the window will be move to include that line
-goto <line_number> - moves the window to show <line_number>
-scroll_down - moves the window down {WINDOW} lines
-scroll_up - moves the window down {WINDOW} lines
-create <filename> - creates and opens a new file with the given name
-search_dir <search_term> [<dir>] - searches for search_term in all files in dir. If dir is not provided, searches in the current directory
-search_file <search_term> [<file>] - searches for search_term in file. If file is not provided, searches in the current open file
-find_file <file_name> [<dir>] - finds all files with the given name in dir. If dir is not provided, searches in the current directory
-edit <start_line>:<end_line> <<EOF
-<replacement_text>
-EOF - replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is delineated using heredoc syntax. All of the <replacement text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again. Remember, the file must be open before editing.
-"""
+_AGENT_SKILLS_DOCS = AgentSkillsRequirement.documentation
 
 COMMAND_DOCS = (
-    '\nApart from the standard bash commands, you can also use the following special commands in <execute_bash> environment:\n'
-    f'{_SWEAGENT_BASH_DOCS}'
-    "Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run."
+    '\nApart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:\n'
+    f'{_AGENT_SKILLS_DOCS}'
+    "Please note that THE `edit_file` FUNCTION REQUIRES PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run."
 )
 
 SYSTEM_PREFIX = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
@@ -39,13 +20,13 @@
 The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
 The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them."""
 
-GITHUB_MESSAGE = """To do any activities on GitHub, you should use the token in the $GITHUB_TOKEN environment variable.
-For instance, to push a local branch `my_branch` to the github repo `owner/repo`, you can use the following four commands:
+GITHUB_MESSAGE = """To do any activities on GitHub, the assistant should use the token in the $GITHUB_TOKEN environment variable.
+For instance, to push a local branch `my_branch` to the github repo `owner/repo`, the assistant can use the following four commands:
 <execute_bash> git push https://$GITHUB_TOKEN@github.com/owner/repo.git my_branch </execute_bash>
-If you require access to GitHub but $GITHUB_TOKEN is not set, ask the user to set it for you."""
+If the assistant require access to GitHub but $GITHUB_TOKEN is not set, ask the user to set it."""
 
 SYSTEM_SUFFIX = """The assistant's response should be concise.
-You should include ONLY ONE <execute_ipython> or <execute_bash> or <execute_browse> in every one of your responses, unless you are finished with the task or need more input or action from the user in order to proceed.
+The assistant should include ONLY ONE <execute_ipython> or <execute_bash> or <execute_browse> in every one of the responses, unless the assistant is finished with the task or need more input or action from the user in order to proceed.
 IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> or <execute_browse> instead of providing it.
 """
 
@@ -55,10 +36,21 @@
 USER: Can you create a list of numbers from 1 to 10, and create a web page to display them at port 5000?
 
 ASSISTANT:
-Sure! Let me write the Python code for starting a web server and save it to a file `app.py`:
+Sure! Let me create a file first:
 <execute_ipython>
-CODE='''
-from flask import Flask
+create_file('app.py')
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+1|
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+<execute_ipython>
+EDITED_CODE=\"\"\"from flask import Flask
 app = Flask(__name__)
 
 @app.route('/')
@@ -67,15 +59,23 @@ def index():
     return str(numbers)
 
 if __name__ == '__main__':
-    app.run(port=5000)
-'''
-with open('app.py', 'w') as f:
-    f.write(CODE)
+    app.run(port=5000)\"\"\"
+edit_file(start=1, end=1, content=EDITED_CODE)
 </execute_ipython>
 
 USER:
-Observation:
-[Code executed successfully with no output]
+OBSERVATION:
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
 
 ASSISTANT:
 I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
@@ -84,7 +84,7 @@ def index():
 </execute_bash>
 
 USER:
-Observation:
+OBSERVATION:
 [1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
 
 ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
@@ -93,7 +93,7 @@ def index():
 </execute_bash>
 
 USER:
-Observation:
+OBSERVATION:
 Traceback (most recent call last):
   File "/workspace/app.py", line 2, in <module>
     from flask import Flask
@@ -106,7 +106,7 @@ def index():
 </execute_bash>
 
 USER:
-Observation:
+OBSERVATION:
 Defaulting to user installation because normal site-packages is not writeable
 Collecting flask
   Using cached flask-3.0.3-py3-none-any.whl (101 kB)
@@ -130,7 +130,7 @@ def index():
 </execute_bash>
 
 USER:
-Observation:
+OBSERVATION:
 [1] 124
 
 ASSISTANT:
@@ -140,7 +140,7 @@ def index():
 </execute_bash>
 
 USER:
-Observation:
+OBSERVATION:
 * Serving Flask app 'app'
  * Debug mode: off
 WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
@@ -159,7 +159,7 @@ def index():
 </execute_browse>
 
 USER:
-Observation:
+OBSERVATION:
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 
 ASSISTANT:
@@ -174,52 +174,48 @@ def index():
 </execute_bash>
 
 USER:
-Observation:
+OBSERVATION:
 [1]+  Terminated              python3 app.py > server.log 2>&1
 
 ASSISTANT:
 The server has been stopped. Let me open the Python file and modify it.
-<execute_bash>
-open app.py
-</execute_bash>
+<execute_ipython>
+open_file('app.py')
+</execute_ipython>
 
 USER:
-[File: /workspace/app.py (11 lines total)]
-1:
-2:from flask import Flask
-3:app = Flask(__name__)
-4:
-5:@app.route('/')
-6:def index():
-7:    numbers = list(range(1, 11))
-8:    return str(numbers)
-9:
-10:if __name__ == '__main__':
-11:    app.run(port=5000)
+[File: /workspace/app.py (10 lines total)]
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
 
 ASSISTANT:
 I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
-<execute_bash>
-edit 8:8 <<EOF
-    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
-EOF
-</execute_bash>
+<execute_ipython>
+edit_file(start=7, end=7, content="    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'")
+</execute_ipython>
 
 USER:
 Observation:
-[File: /workspace/app.py (11 lines total)]
-1:
-2:from flask import Flask
-3:app = Flask(__name__)
-4:
-5:@app.route('/')
-6:def index():
-7:    numbers = list(range(1, 11))
-8:    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
-9:
-10:if __name__ == '__main__':
-11:    app.run(port=5000)
-File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.
+[File: /workspace/app.py (10 lines total after edit)]
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+[File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
 
 ASSISTANT:
 The file has been updated. Let me run the Python file again with the new changes:

diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
@@ -14,6 +14,7 @@
 from datasets import load_dataset
 from tqdm import tqdm
 
+import agenthub
 from evaluation.swe_bench.swe_env_box import SWEBenchSSHBox
 from opendevin.controller.state.state import State
 from opendevin.core.config import args, config, get_llm_config_arg
@@ -185,11 +186,11 @@ def get_test_result(instance, sandbox, workspace_dir_name):
 
 
 def process_instance(
-    instance,
-    agent_class,
-    metadata,
-    skip_workspace_mount,
-    eval_output_dir,
+    instance: dict,
+    agent_class: str,
+    metadata: dict,
+    skip_workspace_mount: bool,
+    eval_output_dir: str,
     reset_logger: bool = True,
 ):
     workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
@@ -236,6 +237,7 @@ def process_instance(
         workspace_dir_name,
         skip_workspace_mount=skip_workspace_mount,
         workspace_mount_path=workspace_mount_path,
+        sandbox_plugins=agenthub.Agent.get_cls(agent_class).sandbox_plugins,
     )
 
     # Prepare instruction
@@ -280,6 +282,8 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')
 
+    metrics = state.metrics.get() if state.metrics else None
+
     # Save the output
     output = {
         'instance_id': instance.instance_id,
@@ -290,6 +294,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }

diff --git a/evaluation/swe_bench/scripts/run_infer.sh b/evaluation/swe_bench/scripts/run_infer.sh
@@ -19,7 +19,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"
 COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
   --agent-cls $AGENT \
   --llm-config $MODEL_CONFIG \
-  --max-iterations 50 \
+  --max-iterations 30 \
   --max-chars 10000000 \
   --eval-num-workers 8 \
   --eval-note $AGENT_VERSION"