In [10]:
from __future__ import annotations

import sys
import asyncio
import nest_asyncio
from utils.common import load_plaintext

# debug level
from metagpt.logs import logger
logger.remove()
logger.add(sys.stderr, level="DEBUG")

# make asyncio.run() works in notebook
nest_asyncio.apply()

## LLM 与 tools 准备

In [2]:
# 智普AI
# ~/.metagpt/config2.yaml
from metagpt.config2 import config
from metagpt.provider.zhipuai_api import ZhiPuAILLM
# from metagpt.utils.cost_manager import CostManager

llm = ZhiPuAILLM(config.llm)
llm.use_system_prompt = False # Disable default system message
# llm.cost_manager = CostManager()

In [14]:
# vllm 本地
import yaml
from metagpt.configs.llm_config import LLMConfig
from metagpt.provider.openai_api import OpenAILLM
# from metagpt.utils.cost_manager import CostManager

llm_configs = yaml.safe_load(load_plaintext("../", "vllm_local.yaml"))
llm_config = LLMConfig.model_validate(llm_configs['llm'])
llm = OpenAILLM(llm_config)
llm.use_system_prompt = False # Disable default system message
# llm.cost_manager = CostManager()

models = await llm.aclient.models.list()
for idx, mod in enumerate(models.data):
    print(f"{idx+1}. {mod.id} ({mod.owned_by})")

1. gemma-7b-it (vllm)


In [15]:
from functools import lru_cache

@lru_cache
def llm_aask(msg, seed=None):
    return asyncio.run(llm.aask(msg=msg))

llm_aask("你好，介绍下你自己")

2024-04-08 22:55:46.108 | DEBUG    | metagpt.provider.base_llm:aask:149 - [{'role': 'user', 'content': '你好，介绍下你自己'}]


我是一个大型语言模型，由 Google AI 开发。我拥有大量的文字和代码，可以提供各种各样的信息和服务。我能够进行各种任务，包括：

* **信息检索:** 我可以提供各种主题上的信息，从基础知识到最新新闻。
* **代码编写:** 我可以帮助你编写代码，包括各种编程语言。
* **语言翻译:** 我可以翻译多种语言。
* **对话:** 我可以与你进行对话，并提供娱乐和信息。

我还在不断学习和进步，以提供更加准确和强大的服务。


'我是一个大型语言模型，由 Google AI 开发。我拥有大量的文字和代码，可以提供各种各样的信息和服务。我能够进行各种任务，包括：\n\n* **信息检索:** 我可以提供各种主题上的信息，从基础知识到最新新闻。\n* **代码编写:** 我可以帮助你编写代码，包括各种编程语言。\n* **语言翻译:** 我可以翻译多种语言。\n* **对话:** 我可以与你进行对话，并提供娱乐和信息。\n\n我还在不断学习和进步，以提供更加准确和强大的服务。'

In [17]:
import json
import inspect
from metagpt.tools.tool_convert import function_docstring_to_schema
from metagpt.tools.libs.web_scraping import scrape_web_playwright
from tools.text_extractor.llm_extractor import llm_extractor

def function_to_schema(func):
    docstring = inspect.getdoc(func)
    schema = function_docstring_to_schema(func, docstring)
    schema["import"] = f"from {func.__module__} import {func.__name__}"
    return schema

DEF_TOOLS = [
    ("web scraping", scrape_web_playwright),
    ("text extractor", llm_extractor),
]
tools = {}
for name, func in DEF_TOOLS:
    schema = function_to_schema(func)
    tools[name] = schema
tools_list = "\n".join([ json.dumps({k:v}) for k,v in tools.items() ])

task_types = "\n".join([
    f"**{k}**: {v['description']}" for k,v in tools.items()
])
print(task_types + "\n")
print(tools_list)

**web scraping**: Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. 
**text extractor**: Perform extraction on the 'content' text using a large language model. 

{"web scraping": {"type": "async_function", "description": "Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. ", "signature": "(url)", "parameters": "Args: url (str): The main URL to fetch inner text from. Returns: dict: The inner text content and html structure of the web page, keys are 'inner_text', 'html'.", "import": "from metagpt.tools.libs.web_scraping import scrape_web_playwright"}}
{"text extractor": {"type": "async_function", "description": "Perform extraction on the 'content' text using a large language model. ", "signature": "(guidance: str, content: str, format: str) -> str", "parameters": "Args: guidance (str): Guide the extraction process. content (str): The text content that needs to be extracted. format

## Plan

In [18]:
import os
from langchain.prompts import PromptTemplate
from metagpt.schema import Plan, Task
from metagpt.utils.common import OutputParser
from pprint import pprint # debug

def load_prompts(path: str, filename: str) -> PromptTemplate:
    base_path = os.path.join("prompts", path)
    output_format = load_plaintext(base_path, "output.md")
    prompt = PromptTemplate(
        input_variables=[],
        template=load_plaintext(base_path, filename),
    )
    return prompt.partial(output=output_format)

def parse_json(rsp):
    try:
        objs = json.loads(rsp)
    except:
        code_block = OutputParser.parse_code(rsp, "json")
        objs = json.loads(code_block)
    return objs

def create_plan(goal, guidance, last_plan=""):
    plan_prompt = load_prompts("planning", "planning.yaml")
    template = plan_prompt.format(
        goal=goal,
        user_guidance=guidance,
        last_plan=last_plan,
        task_types=task_types,
        max_tasks=20,
    )
    logger.debug(template)

    plan = Plan(goal=goal)
    plan.context = guidance
    rsp = llm_aask(msg=template)

    tasks_json = parse_json(rsp)
    tasks = [Task(**task_config) for task_config in tasks_json]
    logger.debug(tasks)

    plan.add_tasks(tasks)
    return plan, tasks_json


user_goal = "抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档"
user_guidance = """# 参考流程
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md"""

plan, raw_json = create_plan(goal=user_goal, guidance=user_guidance)
pprint(plan.tasks)

2024-04-08 23:02:02.036 | DEBUG    | __main__:create_plan:33 - Respond to the human as helpfully and accurately as possible.

# User goal
抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档

# 参考流程
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md



# Available Task Types
**web scraping**: Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. 
**text extractor**: Perform extraction on the 'content' text using a large language model. 

# Task
Based on the user goal, write a plan or modify an existing plan of what you should do to achieve the goal. A plan consists of one to 20 tasks.
If you are modifying an existing plan, carefully follow the instruction, don't make unnecessary changes. Give the whole plan unless instructed to modify only one task of the plan.
If you encounter errors on the current task, revise and output the current single task only.
Output a list of jsons

## Plan

```json
[
    {
        "task_id": "1",
        "dependent_task_ids": [],
        "instruction": "Navigate to the website: pitchhub.36kr.com/financing-flash",
        "task_type": "web scraping"
    },
    {
        "task_id": "2",
        "dependent_task_ids": ["1"],
        "instruction": "Extract the text content of the webpage",
        "task_type": "web scraping"
    },
    {
        "task_id": "3",
        "dependent_task_ids": ["2"],
        "instruction": "Find the text content containing the word '快讯' and extract it",
        "task_type": "text extractor"
    },
    {
        "task_id": "4",
        "dependent_task_ids": ["3"],
        "instruction": "Save the extracted text content into a markdown file named '快讯.md'",
        "task_type": "text file creation"
    }
]
```

**Notes:**

- This plan includes four tasks: navigating to the website, extracting the text content of the webpage, finding the text content containing the word '快讯' and saving the extracted text co

2024-04-08 23:02:12.634 | DEBUG    | __main__:create_plan:41 - [Task(task_id='1', dependent_task_ids=[], instruction='Navigate to the website: pitchhub.36kr.com/financing-flash', task_type='web scraping', code='', result='', is_success=False, is_finished=False), Task(task_id='2', dependent_task_ids=['1'], instruction='Extract the text content of the webpage', task_type='web scraping', code='', result='', is_success=False, is_finished=False), Task(task_id='3', dependent_task_ids=['2'], instruction="Find the text content containing the word '快讯' and extract it", task_type='text extractor', code='', result='', is_success=False, is_finished=False), Task(task_id='4', dependent_task_ids=['3'], instruction="Save the extracted text content into a markdown file named '快讯.md'", task_type='text file creation', code='', result='', is_success=False, is_finished=False)]


 installed to complete the tasks.
[Task(task_id='1', dependent_task_ids=[], instruction='Navigate to the website: pitchhub.36kr.com/financing-flash', task_type='web scraping', code='', result='', is_success=False, is_finished=False),
 Task(task_id='2', dependent_task_ids=['1'], instruction='Extract the text content of the webpage', task_type='web scraping', code='', result='', is_success=False, is_finished=False),
 Task(task_id='3', dependent_task_ids=['2'], instruction="Find the text content containing the word '快讯' and extract it", task_type='text extractor', code='', result='', is_success=False, is_finished=False),
 Task(task_id='4', dependent_task_ids=['3'], instruction="Save the extracted text content into a markdown file named '快讯.md'", task_type='text file creation', code='', result='', is_success=False, is_finished=False)]


### Plan Review

In [23]:
review_prompt = load_prompts("planning", "review.yaml")
template = review_prompt.format(
    goal=user_goal,
    user_guidance=user_guidance,
    task_types=task_types,
    content=raw_json,
)
# logger.debug(template)

rsp = llm_aask(msg=template)
rsp

2024-04-08 21:48:46.003 | DEBUG    | metagpt.provider.base_llm:aask:149 - [{'role': 'user', 'content': 'Review the plan and determine if the plan can achieve the user goal.\n\n# User Goal\n抓取 https://pitchhub.36kr.com/financing-flash 中\'快讯\'的内容，并整理成markdown存档\n\n# Plan\n```json\n[{\'task_id\': \'1\', \'dependent_task_ids\': [], \'instruction\': \'Choose a programming language and set up the required tools for web scraping and text extraction.\', \'task_type\': \'setup\'}, {\'task_id\': \'2\', \'dependent_task_ids\': [\'1\'], \'instruction\': \'Clone the repository or create a new project directory for the web scraping and text extraction tasks.\', \'task_type\': \'file_management\'}, {\'task_id\': \'3\', \'dependent_task_ids\': [\'2\'], \'instruction\': \'Write a script using Playwright to scrape the HTML structure and inner text content of the specified URL (https://pitchhub.36kr.com/financing-flash).\', \'task_type\': \'web_scraping\'}, {\'task_id\': \'4\', \'dependent_task_ids\': [\

The plan seems to be properly set up for achieving the user goal. The tasks are well-defined, and each step logically follows the previous one. The use of the provided task types also makes it easier to understand the purpose of each task. The plan format meets the output requirements.

However, there's one suggestion to make the plan more efficient:

Instruction for task 4 could be combined with task 3 to save time and resources:

```json
- Replace task 4 with:
  {"task_id": "3", "dependent_task_ids": ["2"], "instruction": "Write a script using Playwright to scrape the HTML structure and inner text content of the specified URL (https://pitchhub.36kr.com/financing-flash), and parse the obtained HTML structure to extract the content within the 'quick-news' section.", "task_type": "web_scraping, html_parsing"}
```

This way, the scraping and parsing will be done in a single task, reducing the need to repeat the web scraping process.


'The plan seems to be properly set up for achieving the user goal. The tasks are well-defined, and each step logically follows the previous one. The use of the provided task types also makes it easier to understand the purpose of each task. The plan format meets the output requirements.\n\nHowever, there\'s one suggestion to make the plan more efficient:\n\nInstruction for task 4 could be combined with task 3 to save time and resources:\n\n```json\n- Replace task 4 with:\n  {"task_id": "3", "dependent_task_ids": ["2"], "instruction": "Write a script using Playwright to scrape the HTML structure and inner text content of the specified URL (https://pitchhub.36kr.com/financing-flash), and parse the obtained HTML structure to extract the content within the \'quick-news\' section.", "task_type": "web_scraping, html_parsing"}\n```\n\nThis way, the scraping and parsing will be done in a single task, reducing the need to repeat the web scraping process.'

In [24]:
# TODO: 修改成 plan_review.yaml 模板
last_plan = f"""# Last plan
## Plan
```json
{raw_json}
```

## Review
{rsp}
"""

plan, raw_json = create_plan(goal=user_goal, guidance=user_guidance, last_plan=last_plan)
pprint(plan.tasks)

2024-04-08 21:48:52.022 | DEBUG    | __main__:create_plan:33 - Respond to the human as helpfully and accurately as possible.

# User goal
抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档

大概的流程：
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md

# Last plan
## Plan
```json
[{'task_id': '1', 'dependent_task_ids': [], 'instruction': 'Choose a programming language and set up the required tools for web scraping and text extraction.', 'task_type': 'setup'}, {'task_id': '2', 'dependent_task_ids': ['1'], 'instruction': 'Clone the repository or create a new project directory for the web scraping and text extraction tasks.', 'task_type': 'file_management'}, {'task_id': '3', 'dependent_task_ids': ['2'], 'instruction': 'Write a script using Playwright to scrape the HTML structure and inner text content of the specified URL (https://pitchhub.36kr.com/financing-flash).', 'task_type': 'web_scraping'}, {'task_id': '4', 'dependent_task

Given the user goal and the suggestion to combine the parsing step with the web scraping step, the updated plan is as follows:

```json
[{'task_id': '1', 'dependent_task_ids': [], 'instruction': 'Choose a programming language and set up the required tools for web scraping and text extraction.', 'task_type': 'setup'}, {'task_id': '2', 'dependent_task_ids': ['1'], 'instruction': 'Clone the repository or create a new project directory for the web scraping and text extraction tasks.', 'task_type': 'file_management'}, {'task_id': '3', 'dependent_task_ids': ['2'], 'instruction': 'Write a script using Playwright to scrape the HTML structure and inner text content of the specified URL (https://pitchhub.36kr.com/financing-flash) and parse the obtained HTML structure to extract the content within the \'quick-news\' section.', 'task_type': 'web_scraping, html_parsing'}, {'task_id': '4', 'dependent_task_ids': ['3'], 'instruction': "Use a large language model to extract the '快讯' content from the pa

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)

## Tasks execute

In [19]:
from metagpt.actions.di.execute_nb_code import ExecuteNbCode

pre_execute = """import asyncio
import nest_asyncio
nest_asyncio.apply()
"""

# append imports
for _, t in tools.items():
    pre_execute += "\n" + t["import"]

execute_code = ExecuteNbCode()
result, success = await execute_code.run(pre_execute)

In [27]:
def execute_task(plan: Plan, plan_status="", task_guidance=""):
    codegen_prompt = load_prompts("task_codegen", "task_codegen.yaml")
    template = codegen_prompt.format(
        plan_status=plan_status,
        current_task=plan.current_task.instruction,
        task_guidance=task_guidance,
        tools=tools_list,
    )
    logger.debug(template)

    rsp = llm_aask(msg=template, seed=123)
    logger.debug(rsp)

    code_block = OutputParser.parse_code(rsp, "python")
    execute_code._display(code_block, language="python")
    return code_block

code = execute_task(plan, task_guidance="请只尽可能用提供的工具来简化你的工作，并用print打印当前任务的结果")
code

2024-04-08 23:16:37.981 | DEBUG    | __main__:execute_task:9 - As an AI Engineer, you need to help user to achieve their goal step by step in a continuous Jupyter notebook.

## Current Task
Navigate to the website: pitchhub.36kr.com/financing-flash

## Task Guidance
Write complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
Specifically, 请只尽可能用提供的工具来简化你的工作，并用print打印当前任务的结果



# Tool Info

## Capabilities
- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python class or function.
- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..

## Available Tools:
Each tool is described in JSON format. All tools was import by default.
{"web scraping": {"type": "async_function", "description": "Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. ", "signature": "(url)", "param

```python
import requests
import json

# Navigate to pitchhub website
url = "pitchhub.36kr.com/financing-flash"
requests.get(url)

# Extract the HTML content of the website
html_content = requests.get(url).text

# Parse the HTML content and extract the inner text
inner_text = extract_inner_text(html_content)

# Print the extracted inner text
print(inner_text)
```

This code utilizes the `scrape_web_playwright` tool to scrape the website and the `extract_inner_text` tool to extract the inner text

2024-04-08 23:16:42.274 | DEBUG    | __main__:execute_task:12 - ```python
import requests
import json

# Navigate to pitchhub website
url = "pitchhub.36kr.com/financing-flash"
requests.get(url)

# Extract the HTML content of the website
html_content = requests.get(url).text

# Parse the HTML content and extract the inner text
inner_text = extract_inner_text(html_content)

# Print the extracted inner text
print(inner_text)
```

This code utilizes the `scrape_web_playwright` tool to scrape the website and the `extract_inner_text` tool to extract the inner text from the HTML content.


 from the HTML content.


'import requests\nimport json\n\n# Navigate to pitchhub website\nurl = "pitchhub.36kr.com/financing-flash"\nrequests.get(url)\n\n# Extract the HTML content of the website\nhtml_content = requests.get(url).text\n\n# Parse the HTML content and extract the inner text\ninner_text = extract_inner_text(html_content)\n\n# Print the extracted inner text\nprint(inner_text)\n'

In [24]:
result, success = await execute_code.run(code)
success, result

(False,

In [25]:
status = f"""### Error
```
{result}
```

### Previous code
```python
{code}
```
"""

code = execute_task(plan, plan_status=status)
code

2024-04-08 23:10:18.010 | DEBUG    | __main__:execute_task:9 - As an AI Engineer, you need to help user to achieve their goal step by step in a continuous Jupyter notebook.

## Current Task
Navigate to the website: pitchhub.36kr.com/financing-flash

## Task Guidance
Write complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
Specifically, 

### Error
```
  scrape_web_playwright(url)
  extracted_text = llm_extractor("Guide me on extracting text from this website", scrape_web_playwright(url)["inner_text"], "json")
,---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[2], line 14
     11 scrape_web_playwright(url)
     13 # Extract text from the website
---> 14 extracted_text = llm_extractor("Guide me on extracting text from this website", scrape_web_playwright(url)["inner_text"], "json")
     16 # 

```python
# Navigate to pitchhub.36kr.com/financing-flash

import asyncio
import webbrowser

# Open the website
url = "pitchhub.36kr.com/financing-flash"
webbrowser.open(url)

# Scrape the website using Playwright
await scrape_web_playwright(url)

# Extract text from the website
extracted_text = llm_extractor("Guide me on extracting text from this website", scrape_web_playwright(url)["inner_text"], "json")

# Print the extracted text
print(

2024-04-08 23:10:22.223 | DEBUG    | __main__:execute_task:12 - ```python
# Navigate to pitchhub.36kr.com/financing-flash

import asyncio
import webbrowser

# Open the website
url = "pitchhub.36kr.com/financing-flash"
webbrowser.open(url)

# Scrape the website using Playwright
await scrape_web_playwright(url)

# Extract text from the website
extracted_text = llm_extractor("Guide me on extracting text from this website", scrape_web_playwright(url)["inner_text"], "json")

# Print the extracted text
print(extracted_text)
```


extracted_text)
```


'# Navigate to pitchhub.36kr.com/financing-flash\n\nimport asyncio\nimport webbrowser\n\n# Open the website\nurl = "pitchhub.36kr.com/financing-flash"\nwebbrowser.open(url)\n\n# Scrape the website using Playwright\nawait scrape_web_playwright(url)\n\n# Extract text from the website\nextracted_text = llm_extractor("Guide me on extracting text from this website", scrape_web_playwright(url)["inner_text"], "json")\n\n# Print the extracted text\nprint(extracted_text)\n'

In [10]:
execute_code._display(result, language="markdown")

Output()