In [1]:
from __future__ import annotations

import sys
import asyncio
import nest_asyncio
from utils.common import load_plaintext

# debug level
from metagpt.logs import logger
logger.remove()
logger.add(sys.stderr, level="DEBUG")

# make asyncio.run() works in notebook
nest_asyncio.apply()

2024-03-26 22:46:10.993 | INFO     | metagpt.const:get_metagpt_package_root:29 - Package root set to /root/workspace/StreamChatPlayground/notebooks


## LLM 与 tools 准备

In [43]:
# 智普AI
# ~/.metagpt/config2.yaml
from metagpt.config2 import config
from metagpt.provider.zhipuai_api import ZhiPuAILLM
# from metagpt.utils.cost_manager import CostManager

llm = ZhiPuAILLM(config.llm)
llm.use_system_prompt = False # Disable default system message
# llm.cost_manager = CostManager()

In [2]:
# vllm 本地
import yaml
from metagpt.configs.llm_config import LLMConfig
from metagpt.provider.openai_api import OpenAILLM
# from metagpt.utils.cost_manager import CostManager

llm_configs = yaml.safe_load(load_plaintext("../", "vllm_local.yaml"))
llm_config = LLMConfig.model_validate(llm_configs['llm'])
llm = OpenAILLM(llm_config)
llm.use_system_prompt = False # Disable default system message
# llm.cost_manager = CostManager()

In [3]:
from functools import lru_cache

@lru_cache
def llm_aask(msg):
    return asyncio.run(llm.aask(msg=msg))

llm_aask("你好，介绍下你自己")

2024-03-26 22:47:29.561 | DEBUG    | metagpt.provider.base_llm:aask:149 - [{'role': 'user', 'content': '你好，介绍下你自己'}]


我是一个大型语言模型，由 Google AI 开发，专门提供信息和知识的获取、语言理解和生成。我拥有大量的训练数据，允许我进行各种各样的任务，包括：

* **信息检索:** 我可以提供各种主题上的信息，从基础知识到最新新闻。
* **语言理解:** 我可以理解和解释各种语言和风格的文本。
* **语言生成:** 我可以生成各种形式的文本，包括文章、代码、脚本和诗词。
* **对话:** 我可以与您进行对话，并提供信息、娱乐和帮助。

我还在不断学习，并不断改进我的能力，以提供更准确、更全面和更强大的服务。


'我是一个大型语言模型，由 Google AI 开发，专门提供信息和知识的获取、语言理解和生成。我拥有大量的训练数据，允许我进行各种各样的任务，包括：\n\n* **信息检索:** 我可以提供各种主题上的信息，从基础知识到最新新闻。\n* **语言理解:** 我可以理解和解释各种语言和风格的文本。\n* **语言生成:** 我可以生成各种形式的文本，包括文章、代码、脚本和诗词。\n* **对话:** 我可以与您进行对话，并提供信息、娱乐和帮助。\n\n我还在不断学习，并不断改进我的能力，以提供更准确、更全面和更强大的服务。'

In [34]:
import json
import inspect
from metagpt.tools.tool_convert import function_docstring_to_schema
from metagpt.tools.libs.web_scraping import scrape_web_playwright
from tools.text_extractor.llm_extractor import llm_extractor

def function_to_schema(func):
    docstring = inspect.getdoc(func)
    schema = function_docstring_to_schema(func, docstring)
    schema["import"] = f"from {func.__module__} import {func.__name__}"
    return schema

DEF_TOOLS = [
    ("web scraping", scrape_web_playwright),
    ("text extractor", llm_extractor),
]
tools = {}
for name, func in DEF_TOOLS:
    schema = function_to_schema(func)
    tools[name] = schema
tools_list = "\n".join([ json.dumps({k:v}) for k,v in tools.items() ])

task_types = "\n".join([
    f"**{k}**: {v['description']}" for k,v in tools.items()
])
print(task_types + "\n")
print(tools_list)

**web scraping**: Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. 

{"web scraping": {"type": "async_function", "description": "Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. ", "signature": "(url)", "parameters": "Args: url (str): The main URL to fetch inner text from. Returns: dict: The inner text content and html structure of the web page, keys are 'inner_text', 'html'.", "import": "from metagpt.tools.libs.web_scraping import scrape_web_playwright"}}


## Plan

In [41]:
import os
from langchain.prompts import PromptTemplate
from metagpt.schema import Plan, Task
from metagpt.utils.common import OutputParser
from pprint import pprint # debug

def load_prompts(path: str, filename: str) -> PromptTemplate:
    base_path = os.path.join("prompts", path)
    output_format = load_plaintext(base_path, "output.md")
    prompt = PromptTemplate(
        input_variables=[],
        template=load_plaintext(base_path, filename),
    )
    return prompt.partial(output=output_format)

def parse_json(rsp):
    try:
        objs = json.loads(rsp)
    except:
        code_block = OutputParser.parse_code(rsp, "json")
        objs = json.loads(code_block)
    return objs

def create_plan(goal, guidance, last_plan=""):
    plan_prompt = load_prompts("planning", "planning.yaml")
    template = plan_prompt.format(
        goal=goal,
        user_guidance=guidance,
        last_plan=last_plan,
        task_types=task_types,
        max_tasks=20,
    )
    logger.debug(template)

    plan = Plan(goal=goal)
    plan.context = guidance
    rsp = llm_aask(msg=template)

    tasks_json = parse_json(rsp)
    tasks = [Task(**task_config) for task_config in tasks_json]
    logger.debug(tasks)

    plan.add_tasks(tasks)
    return plan, tasks_json


user_goal = "抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档"
user_guidance = """大概的流程：
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md"""

plan, raw_json = create_plan(goal=user_goal, guidance=user_guidance)
pprint(plan.tasks)

2024-03-26 23:08:44.452 | DEBUG    | __main__:create_plan:33 - Respond to the human as helpfully and accurately as possible.

# User goal
抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档

大概的流程：
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md



# Available Task Types
**web scraping**: Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. 

# Task
Based on the user goal, write a plan or modify an existing plan of what you should do to achieve the goal. A plan consists of one to 20 tasks.
If you are modifying an existing plan, carefully follow the instruction, don't make unnecessary changes. Give the whole plan unless instructed to modify only one task of the plan.
If you encounter errors on the current task, revise and output the current single task only.
Output a list of jsons following the format:
```json
[
    {{
        "task_id": str = "unique identifier for a ta

[Task(task_id='1', dependent_task_ids=[], instruction='Open the website pitchhub.36kr.com/financing-flash in a browser.', task_type='Web navigation', code='', result='', is_success=False, is_finished=False),
 Task(task_id='2', dependent_task_ids=['1'], instruction="Scroll down the page until you reach the '快讯' section.", task_type='Scroll and find content', code='', result='', is_success=False, is_finished=False),
 Task(task_id='3', dependent_task_ids=['2'], instruction="Extract the text content of the '快讯' section.", task_type='Text extraction', code='', result='', is_success=False, is_finished=False),
 Task(task_id='4', dependent_task_ids=['3'], instruction='Remove any unnecessary text or formatting from the extracted text.', task_type='Text manipulation', code='', result='', is_success=False, is_finished=False),
 Task(task_id='5', dependent_task_ids=['4'], instruction='Organize the extracted text into a markdown table.', task_type='Data organization', code='', result='', is_success=

### Plan Review

In [36]:
review_prompt = load_prompts("planning", "review.yaml")
template = review_prompt.format(
    goal=user_goal,
    user_guidance=user_guidance,
    task_types=task_types,
    content=raw_json,
)
# logger.debug(template)

rsp = llm_aask(msg=template)
rsp

2024-03-26 23:07:41.477 | DEBUG    | metagpt.provider.base_llm:aask:149 - [{'role': 'user', 'content': 'Review the plan and determine if the plan can achieve the user goal.\n\n# User Goal\n抓取 https://pitchhub.36kr.com/financing-flash 中\'快讯\'的内容，并整理成markdown存档\n\n# Plan\n```json\n[{\'task_id\': \'1\', \'dependent_task_ids\': [], \'instruction\': \'Open the website pitchhub.36kr.com/financing-flash in a browser.\', \'task_type\': \'Web navigation\'}, {\'task_id\': \'2\', \'dependent_task_ids\': [\'1\'], \'instruction\': "Scroll down the page until you reach the \'快讯\' section.", \'task_type\': \'Scroll and find content\'}, {\'task_id\': \'3\', \'dependent_task_ids\': [\'2\'], \'instruction\': "Extract the text content of the \'快讯\' section.", \'task_type\': \'Text extraction\'}, {\'task_id\': \'4\', \'dependent_task_ids\': [\'3\'], \'instruction\': \'Remove any unnecessary text or formatting from the extracted text.\', \'task_type\': \'Text manipulation\'}, {\'task_id\': \'5\', \'depende

## Modifiy Suggestions:

* **Task 2:** Instead of scrolling down the page until the '快讯' section, consider checking for a specific element that uniquely identifies the section and use that element to locate it. This will be more precise and consistent.
* **Task 4:** Instead of removing unnecessary text or formatting manually, consider using text processing techniques like regular expressions to automate the process. This will be more efficient and less prone to errors.
* **Task 5:** Instead of organizing the extracted text into a table manually, consider using a Python library like pandas to automate the process. This will be more efficient and ensure consistency.


"## Modifiy Suggestions:\n\n* **Task 2:** Instead of scrolling down the page until the '快讯' section, consider checking for a specific element that uniquely identifies the section and use that element to locate it. This will be more precise and consistent.\n* **Task 4:** Instead of removing unnecessary text or formatting manually, consider using text processing techniques like regular expressions to automate the process. This will be more efficient and less prone to errors.\n* **Task 5:** Instead of organizing the extracted text into a table manually, consider using a Python library like pandas to automate the process. This will be more efficient and ensure consistency."

In [37]:
# TODO: 修改成 plan_review.yaml 模板
last_plan = f"""# Last plan
## Plan
```json
{raw_json}
```

## Review
{rsp}
"""

plan, raw_json = create_plan(goal=user_goal, guidance=user_guidance, last_plan=last_plan)
pprint(plan.tasks)

2024-03-26 23:07:47.001 | DEBUG    | __main__:create_plan:33 - Respond to the human as helpfully and accurately as possible.

# User goal
抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档

大概的流程：
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md

# Last plan
## Plan
```json
[{'task_id': '1', 'dependent_task_ids': [], 'instruction': 'Open the website pitchhub.36kr.com/financing-flash in a browser.', 'task_type': 'Web navigation'}, {'task_id': '2', 'dependent_task_ids': ['1'], 'instruction': "Scroll down the page until you reach the '快讯' section.", 'task_type': 'Scroll and find content'}, {'task_id': '3', 'dependent_task_ids': ['2'], 'instruction': "Extract the text content of the '快讯' section.", 'task_type': 'Text extraction'}, {'task_id': '4', 'dependent_task_ids': ['3'], 'instruction': 'Remove any unnecessary text or formatting from the extracted text.', 'task_type': 'Text manipulation'}, {'task_id': '5', 'dependent_tas

## Revised Plan

```json
[
    {
        "task_id": "1",
        "dependent_task_ids": [],
        "instruction": "Open the website pitchhub.36kr.com/financing-flash in a browser.",
        "task_type": "Web navigation"
    },
    {
        "task_id": "2",
        "dependent_task_ids": ["1"],
        "instruction": "Find the '快讯' section on the page. It is usually located below the '新闻' section.",
        "task_type": "Find content"
    },
    {
        "task_id": "3",
        "dependent_task_ids": ["2"],
        "instruction": "Extract the text content of the '快讯' section.",
        "task_type": "Text extraction"
    },
    {
        "task_id": "4",
        "dependent_task_ids": ["3"],
        "instruction": "Remove any unnecessary text or formatting from the extracted text.",
        "task_type": "Text manipulation"
    },
    {
        "task_id": "5",
        "dependent_task_ids": ["4"],
        "instruction": "Organize the extracted text into a markdown table.",
        "task_type"

2024-03-26 23:08:00.357 | DEBUG    | __main__:create_plan:41 - [Task(task_id='1', dependent_task_ids=[], instruction='Open the website pitchhub.36kr.com/financing-flash in a browser.', task_type='Web navigation', code='', result='', is_success=False, is_finished=False), Task(task_id='2', dependent_task_ids=['1'], instruction="Find the '快讯' section on the page. It is usually located below the '新闻' section.", task_type='Find content', code='', result='', is_success=False, is_finished=False), Task(task_id='3', dependent_task_ids=['2'], instruction="Extract the text content of the '快讯' section.", task_type='Text extraction', code='', result='', is_success=False, is_finished=False), Task(task_id='4', dependent_task_ids=['3'], instruction='Remove any unnecessary text or formatting from the extracted text.', task_type='Text manipulation', code='', result='', is_success=False, is_finished=False), Task(task_id='5', dependent_task_ids=['4'], instruction='Organize the extracted text into a markdo


[Task(task_id='1', dependent_task_ids=[], instruction='Open the website pitchhub.36kr.com/financing-flash in a browser.', task_type='Web navigation', code='', result='', is_success=False, is_finished=False),
 Task(task_id='2', dependent_task_ids=['1'], instruction="Find the '快讯' section on the page. It is usually located below the '新闻' section.", task_type='Find content', code='', result='', is_success=False, is_finished=False),
 Task(task_id='3', dependent_task_ids=['2'], instruction="Extract the text content of the '快讯' section.", task_type='Text extraction', code='', result='', is_success=False, is_finished=False),
 Task(task_id='4', dependent_task_ids=['3'], instruction='Remove any unnecessary text or formatting from the extracted text.', task_type='Text manipulation', code='', result='', is_success=False, is_finished=False),
 Task(task_id='5', dependent_task_ids=['4'], instruction='Organize the extracted text into a markdown table.', task_type='Data organization', code='', result

## Tasks execute

In [38]:
from metagpt.actions.di.execute_nb_code import ExecuteNbCode

pre_execute = """import asyncio
import nest_asyncio
nest_asyncio.apply()
"""

# append imports
for _, t in tools.items():
    pre_execute += "\n" + t["import"]

execute_code = ExecuteNbCode()
await execute_code.run(pre_execute)

('', True)

In [39]:
def execute_task(plan: Plan, plan_status="", task_guidance=""):
    codegen_prompt = load_prompts("task_codegen", "task_codegen.yaml")
    template = codegen_prompt.format(
        plan_status=plan_status,
        current_task=plan.current_task.instruction,
        task_guidance=task_guidance,
        tools=tools_list,
    )
    logger.debug(template)

    rsp = llm_aask(msg=template)
    logger.debug(rsp)

    code_block = OutputParser.parse_code(rsp, "python")
    execute_code._display(code_block, language="python")
    return code_block

code = execute_task(plan)
code

2024-03-26 23:08:15.272 | DEBUG    | __main__:execute_task:9 - As an AI Engineer, you need to help user to achieve their goal step by step in a continuous Jupyter notebook.

## Current Task
Open the website pitchhub.36kr.com/financing-flash in a browser.

## Task Guidance
Write complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
Specifically, 



# Tool Info

## Capabilities
- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python class or function.
- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..

## Available Tools:
Each tool is described in JSON format. All tools was import by default.
{"web scraping": {"type": "async_function", "description": "Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. ", "signature": "(url)", "parameters": "Args: url (str): The

```python
# Open website in a browser
import webbrowser

url = "pitchhub.36kr.com/financing-flash"
webbrowser.open(url)
```

2024-03-26 23:08:16.646 | DEBUG    | __main__:execute_task:12 - ```python
# Open website in a browser
import webbrowser

url = "pitchhub.36kr.com/financing-flash"
webbrowser.open(url)
```





'# Open website in a browser\nimport webbrowser\n\nurl = "pitchhub.36kr.com/financing-flash"\nwebbrowser.open(url)\n'

In [40]:
result, success = await execute_code.run(code)
success, result

(True, 'True')

In [28]:
status = f"""### Error
```
{result}
```

### Previous code
```python
{code}
```
"""

code = execute_task(plan, plan_status=status)
code

2024-03-26 23:04:40.264 | DEBUG    | __main__:execute_task:9 - As an AI Engineer, you need to help user to achieve their goal step by step in a continuous Jupyter notebook.

## Current Task
Navigate to pitchhub.36kr.com/financing-flash

## Task Guidance
Write complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
Specifically, 

### Error
```
  extracted_text = llm_extractor(html_content, "Extract key information from the company description", "markdown")
  extracted_text = llm_extractor(html_content, "Extract key information from the company description", "markdown")

```

### Previous code
```python
# Navigate to pitchhub.36kr.com/financing-flash

# Import libraries
import pandas as pd
import requests

# Scrape pitchhub website
url = "pitchhub.36kr.com/financing-flash"
html_content = scrape_web_playwright(url)

# Extract text from the scraped content
extracted_text = llm_extractor(html_content, "Extr

```python
# Navigate to pitchhub.36kr.com/financing-flash

# Import libraries
import pandas as pd
import requests

# Scrape pitchhub website
url = "pitchhub.36kr.com/financing-flash"
html_content = await scrape_web_playwright(url)

# Extract text from the scraped content
extracted_text = await llm_extractor(html_content, "Extract key information from the company description", "markdown")

# Print the extracted text
print(extracted_text)
```

This code incorporates the `await` keyword for asynchronous functions, as recommended in the

2024-03-26 23:04:44.508 | DEBUG    | __main__:execute_task:12 - ```python
# Navigate to pitchhub.36kr.com/financing-flash

# Import libraries
import pandas as pd
import requests

# Scrape pitchhub website
url = "pitchhub.36kr.com/financing-flash"
html_content = await scrape_web_playwright(url)

# Extract text from the scraped content
extracted_text = await llm_extractor(html_content, "Extract key information from the company description", "markdown")

# Print the extracted text
print(extracted_text)
```

This code incorporates the `await` keyword for asynchronous functions, as recommended in the error message.


 error message.


'# Navigate to pitchhub.36kr.com/financing-flash\n\n# Import libraries\nimport pandas as pd\nimport requests\n\n# Scrape pitchhub website\nurl = "pitchhub.36kr.com/financing-flash"\nhtml_content = await scrape_web_playwright(url)\n\n# Extract text from the scraped content\nextracted_text = await llm_extractor(html_content, "Extract key information from the company description", "markdown")\n\n# Print the extracted text\nprint(extracted_text)\n'

In [33]:
execute_code._display(result, language="markdown")

Output()