In [1]:
from __future__ import annotations

import sys
import asyncio
import nest_asyncio
from utils.common import load_plaintext

# Send MetaGPT's log output to stderr at DEBUG level
from metagpt.logs import logger
logger.remove()
logger.add(sys.stderr, level="DEBUG")

# Make asyncio.run() work inside the notebook
nest_asyncio.apply()

2024-03-23 09:05:19.741 | INFO     | metagpt.const:get_metagpt_package_root:29 - Package root set to /root/workspace/StreamChatPlayground/notebooks


## LLM 与 tools 准备

In [None]:
# Zhipu AI
# Config file: ~/.metagpt/config2.yaml
# NOTE(review): the following cell also assigns `llm`; if both cells are run
# in order, the later assignment is the one that stays in effect.
from metagpt.config2 import config
from metagpt.provider.zhipuai_api import ZhiPuAILLM
# from metagpt.utils.cost_manager import CostManager

llm = ZhiPuAILLM(config.llm)
llm.use_system_prompt = False # Disable default system message
# llm.cost_manager = CostManager()

In [2]:
# Local vLLM, accessed through MetaGPT's OpenAI provider
import yaml
from metagpt.configs.llm_config import LLMConfig
from metagpt.provider.openai_api import OpenAILLM
# from metagpt.utils.cost_manager import CostManager

# Parse the YAML text of vllm_local.yaml and validate its 'llm' section as an LLMConfig
# (presumably load_plaintext reads "../vllm_local.yaml" relative to the notebook; verify)
llm_configs = yaml.safe_load(load_plaintext("../", "vllm_local.yaml"))
llm_config = LLMConfig.model_validate(llm_configs['llm'])
llm = OpenAILLM(llm_config)
llm.use_system_prompt = False # Disable default system message
# llm.cost_manager = CostManager()

In [3]:
from functools import lru_cache

@lru_cache
def llm_aask(msg):
    """Synchronously ask the module-level ``llm`` a question.

    Results are memoized by ``lru_cache`` keyed on ``msg``, so calling again
    with an identical message returns the cached reply instead of querying
    the model a second time.

    Args:
        msg: Prompt passed to ``llm.aask`` as its ``msg`` argument; must be
            hashable because it is used as the cache key.

    Returns:
        Whatever ``llm.aask`` resolves to for ``msg``.
    """
    pending_reply = llm.aask(msg=msg)
    # Run the coroutine to completion so callers get a plain value back.
    return asyncio.run(pending_reply)

# Quick check that the configured LLM answers (the prompt asks "Hello, who are you?")
llm_aask("你好，你是谁？")

2024-03-23 09:05:40.166 | DEBUG    | metagpt.provider.base_llm:aask:126 - [{'role': 'user', 'content': '你好，你是谁？'}]


我是一个大型语言模型，由 Google 训练。我是一个语言模型，可以提供各种各样的信息和帮助。


'我是一个大型语言模型，由 Google 训练。我是一个语言模型，可以提供各种各样的信息和帮助。'

In [4]:
import inspect
from metagpt.tools.tool_convert import function_docstring_to_schema
from metagpt.tools.libs.web_scraping import scrape_web_playwright
from tools.text_extractor.llm_extractor import llm_extractor

def function_to_schema(func):
    """Build a tool schema for ``func`` from its docstring.

    The schema is produced by ``function_docstring_to_schema`` and is then
    extended with an ``"imports"`` entry holding the import statement that
    brings ``func`` into scope.

    Args:
        func: The callable to describe.

    Returns:
        The schema returned by ``function_docstring_to_schema`` with the
        extra ``"imports"`` key set.
    """
    tool_schema = function_docstring_to_schema(func, inspect.getdoc(func))
    tool_schema["imports"] = f"from {func.__module__} import {func.__name__}"
    return tool_schema

# Tools offered to the planner, as (task type name, implementing function) pairs.
DEF_TOOLS = [
    ("web scraping", scrape_web_playwright),
    ("text extractor", llm_extractor),
]

# Map each task type name to the schema generated from its function.
tools = {}
for name, func in DEF_TOOLS:
    schema = function_to_schema(func)
    tools[name] = schema

# One "**name**: description" line per tool; create_plan() passes this text
# to the planning prompt as `task_types`.
task_types = "\n".join(
    f"**{tool_name}**: {tool_schema['description']}"
    for tool_name, tool_schema in tools.items()
)
print(task_types)

**web scraping**: Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. 
**text extractor**: Perform extraction on the 'content' text using a large language model. 


## Plan

In [5]:
import os
import json
from langchain.prompts import PromptTemplate
from metagpt.schema import Plan, Task
from metagpt.utils.common import OutputParser


def load_prompts(path: str) -> PromptTemplate:
    """Load a prompt template from the ``prompts/<path>`` directory.

    Two files are read via ``load_plaintext``: ``output.md`` and
    ``template.yaml``. The latter becomes the template text, and the former
    is pre-bound to the template's ``output`` variable.

    Args:
        path: Sub-directory name under ``prompts``.

    Returns:
        The result of ``PromptTemplate.partial(output=...)`` for the loaded
        template.
    """
    prompt_dir = os.path.join("prompts", path)
    output_spec = load_plaintext(prompt_dir, "output.md")
    template_text = load_plaintext(prompt_dir, "template.yaml")
    return PromptTemplate(
        input_variables=[],
        template=template_text,
    ).partial(output=output_spec)

def parse_json(rsp):
    """Decode an LLM response as JSON.

    The response is first decoded as-is. Only if that fails with a JSON
    decode error is the text handed to ``OutputParser.parse_code(rsp, "json")``
    and the extracted code block decoded instead.

    Args:
        rsp: Response text, either bare JSON or text from which
            ``OutputParser.parse_code`` can extract a JSON code block.

    Returns:
        The decoded JSON value.

    Raises:
        TypeError: If ``rsp`` is not something ``json.loads`` accepts
            (e.g. ``None``).
        json.JSONDecodeError: If the extracted code block is not valid JSON.
        Any exception raised by ``OutputParser.parse_code`` propagates
        unchanged.
    """
    try:
        return json.loads(rsp)
    except json.JSONDecodeError:
        # Fall back only on a genuine decode failure. A bare `except:` would
        # also swallow KeyboardInterrupt and type errors and then fail later
        # with a misleading message.
        code_block = OutputParser.parse_code(rsp, "json")
        return json.loads(code_block)

def create_plan(goal, guidance, max_tasks=20):
    """Ask the LLM to break ``goal`` into tasks and return them as a ``Plan``.

    The "planning" prompt is loaded with ``load_prompts``, filled in with the
    goal, the user guidance, the module-level ``task_types`` text and the task
    limit, and sent through ``llm_aask``. The reply is decoded with
    ``parse_json`` and each decoded object is turned into a ``Task``.

    Args:
        goal: The user's goal; also stored as the plan's goal.
        guidance: Extra user guidance, passed to the prompt as ``user_guidance``.
        max_tasks: Upper bound on the number of tasks, passed to the prompt
            as ``max_tasks``. Defaults to 20, the previously hard-coded value.

    Returns:
        A ``Plan`` for ``goal`` with the parsed tasks added.
    """
    plan_prompt = load_prompts("planning")
    template = plan_prompt.format(
        goal=goal,
        user_guidance=guidance,
        task_types=task_types,
        max_tasks=max_tasks,
    )
    logger.debug(template)

    plan = Plan(goal=goal)
    rsp = llm_aask(msg=template)

    tasks_json = parse_json(rsp)
    if isinstance(tasks_json, dict):
        # The model may reply with one task object instead of a list;
        # iterating a dict would yield its keys and break Task(**...).
        tasks_json = [tasks_json]
    tasks = [Task(**task_config) for task_config in tasks_json]
    logger.debug(tasks)

    plan.add_tasks(tasks)
    return plan


# Goal and rough step-by-step guidance (written in Chinese) for the planner.
user_goal = "抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档"
user_guidance = """大概的流程：
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md"""

# Generate the plan; the bare `plan` on the last line displays it as the cell output.
plan = create_plan(goal=user_goal, guidance=user_guidance)
plan

2024-03-23 09:05:50.816 | DEBUG    | __main__:create_plan:33 - Respond to the human as helpfully and accurately as possible.

# User goal
抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档

大概的流程：
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md

# Available Task Types:
**web scraping**: Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. 
**text extractor**: Perform extraction on the 'content' text using a large language model. 

# Task:
Based on the user goal, write a plan or modify an existing plan of what you should do to achieve the goal. A plan consists of one to 20 tasks.
If you are modifying an existing plan, carefully follow the instruction, don't make unnecessary changes. Give the whole plan unless instructed to modify only one task of the plan.
If you encounter errors on the current task, revise and output the current single task only.
Output a list of jsons

## Plan

**Goal:** Grab content of '快讯' from the website "pitchhub.36kr.com/financing-flash" and organize it into a markdown file.

**Tasks:**

```json
[
    {
        "task_id": "1",
        "dependent_task_ids": [],
        "instruction": "Open the website 'pitchhub.36kr.com/financing-flash'.",
        "task_type": "web scraping"
    },
    {
        "task_id": "2",
        "dependent_task_ids": ["1"],
        "instruction": "Navigate to the '快讯' section on the website.",
        "task_type": "web scraping"
    },
    {
        "task_id": "3",
        "dependent_task_ids": ["2"],
        "instruction": "Copy the text content of the '快讯' section.",
        "task_type": "text extractor"
    },
    {
        "task_id": "4",
        "dependent_task_ids": ["3"],
        "instruction": "Save the copied text content into a separate file named '快讯.md'.",
        "task_type": "file management"
    }
]
```

**Notes:**

- This plan includes the tasks necessary to achieve the user goal of grabbi

2024-03-23 09:06:01.686 | DEBUG    | __main__:create_plan:40 - [Task(task_id='1', dependent_task_ids=[], instruction="Open the website 'pitchhub.36kr.com/financing-flash'.", task_type='web scraping', code='', result='', is_success=False, is_finished=False), Task(task_id='2', dependent_task_ids=['1'], instruction="Navigate to the '快讯' section on the website.", task_type='web scraping', code='', result='', is_success=False, is_finished=False), Task(task_id='3', dependent_task_ids=['2'], instruction="Copy the text content of the '快讯' section.", task_type='text extractor', code='', result='', is_success=False, is_finished=False), Task(task_id='4', dependent_task_ids=['3'], instruction="Save the copied text content into a separate file named '快讯.md'.", task_type='file management', code='', result='', is_success=False, is_finished=False)]


 should be performed for each task.


Plan(goal="抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档", context='', tasks=[Task(task_id='1', dependent_task_ids=[], instruction="Open the website 'pitchhub.36kr.com/financing-flash'.", task_type='web scraping', code='', result='', is_success=False, is_finished=False), Task(task_id='2', dependent_task_ids=['1'], instruction="Navigate to the '快讯' section on the website.", task_type='web scraping', code='', result='', is_success=False, is_finished=False), Task(task_id='3', dependent_task_ids=['2'], instruction="Copy the text content of the '快讯' section.", task_type='text extractor', code='', result='', is_success=False, is_finished=False), Task(task_id='4', dependent_task_ids=['3'], instruction="Save the copied text content into a separate file named '快讯.md'.", task_type='file management', code='', result='', is_success=False, is_finished=False)], task_map={'1': Task(task_id='1', dependent_task_ids=[], instruction="Open the website 'pitchhub.36kr.com/financing-fl

In [6]:
from pprint import pprint

# Pretty-print the task list of the generated plan.
pprint(plan.tasks)

[Task(task_id='1', dependent_task_ids=[], instruction="Open the website 'pitchhub.36kr.com/financing-flash'.", task_type='web scraping', code='', result='', is_success=False, is_finished=False),
 Task(task_id='2', dependent_task_ids=['1'], instruction="Navigate to the '快讯' section on the website.", task_type='web scraping', code='', result='', is_success=False, is_finished=False),
 Task(task_id='3', dependent_task_ids=['2'], instruction="Copy the text content of the '快讯' section.", task_type='text extractor', code='', result='', is_success=False, is_finished=False),
 Task(task_id='4', dependent_task_ids=['3'], instruction="Save the copied text content into a separate file named '快讯.md'.", task_type='file management', code='', result='', is_success=False, is_finished=False)]
