In [1]:
from __future__ import annotations

import sys
import asyncio
import nest_asyncio
from utils.common import load_plaintext

# debug level
from metagpt.logs import logger
logger.remove()
logger.add(sys.stderr, level="DEBUG")

# make asyncio.run() works in notebook
nest_asyncio.apply()

2024-03-24 15:15:36.095 | INFO     | metagpt.const:get_metagpt_package_root:29 - Package root set to /root/workspace/StreamChatPlayground/notebooks


## LLM 与 tools 准备

In [43]:
# 智普AI
# ~/.metagpt/config2.yaml
from metagpt.config2 import config
from metagpt.provider.zhipuai_api import ZhiPuAILLM
# from metagpt.utils.cost_manager import CostManager

llm = ZhiPuAILLM(config.llm)
llm.use_system_prompt = False # Disable default system message
# llm.cost_manager = CostManager()

In [4]:
# vllm 本地
import yaml
from metagpt.configs.llm_config import LLMConfig
from metagpt.provider.openai_api import OpenAILLM
# from metagpt.utils.cost_manager import CostManager

llm_configs = yaml.safe_load(load_plaintext("../", "vllm_local.yaml"))
llm_config = LLMConfig.model_validate(llm_configs['llm'])
llm = OpenAILLM(llm_config)
llm.use_system_prompt = False # Disable default system message
# llm.cost_manager = CostManager()

In [44]:
from functools import lru_cache

@lru_cache
def llm_aask(msg):
    return asyncio.run(llm.aask(msg=msg))

llm_aask("你好，介绍下你自己")

2024-03-24 15:43:48.991 | DEBUG    | metagpt.provider.base_llm:aask:149 - [{'role': 'user', 'content': '你好，介绍下你自己'}]


你好！我是智谱清言，是清华大学 KEG 实验室和智谱 AI 公司于 2023 年共同训练的语言模型。我的目标是通过回答用户提出的问题来帮助他们解决问题。由于我是一个计算机程序，所以我没有自我意识，也不能像人类一样感知世界。我只能通过分析我所学到的信息来回答问题。


'你好！我是智谱清言，是清华大学 KEG 实验室和智谱 AI 公司于 2023 年共同训练的语言模型。我的目标是通过回答用户提出的问题来帮助他们解决问题。由于我是一个计算机程序，所以我没有自我意识，也不能像人类一样感知世界。我只能通过分析我所学到的信息来回答问题。'

In [24]:
import json
import inspect
from metagpt.tools.tool_convert import function_docstring_to_schema
from metagpt.tools.libs.web_scraping import scrape_web_playwright
from tools.text_extractor.llm_extractor import llm_extractor

def function_to_schema(func):
    docstring = inspect.getdoc(func)
    schema = function_docstring_to_schema(func, docstring)
    schema["imports"] = f"from {func.__module__} import {func.__name__}"
    return schema

DEF_TOOLS = [
    ("web scraping", scrape_web_playwright),
    ("text extractor", llm_extractor),
]
tools = {}
for name, func in DEF_TOOLS:
    schema = function_to_schema(func)
    tools[name] = schema
tools_list = "\n".join([ json.dumps({k:v}) for k,v in tools.items() ])

task_types = "\n".join([
    f"**{k}**: {v['description']}" for k,v in tools.items()
])
print(task_types + "\n")
print(tools_list)

**web scraping**: Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. 
**text extractor**: Perform extraction on the 'content' text using a large language model. 

{"web scraping": {"type": "async_function", "description": "Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. ", "signature": "(url)", "parameters": "Args: url (str): The main URL to fetch inner text from. Returns: dict: The inner text content and html structure of the web page, keys are 'inner_text', 'html'.", "imports": "from metagpt.tools.libs.web_scraping import scrape_web_playwright"}}
{"text extractor": {"type": "async_function", "description": "Perform extraction on the 'content' text using a large language model. ", "signature": "(guidance: str, content: str, format: str) -> str", "parameters": "Args: guidance (str): Guide the extraction process. content (str): The text content that needs to be extracted. forma

## Plan

In [14]:
import os
from langchain.prompts import PromptTemplate
from metagpt.schema import Plan, Task
from metagpt.utils.common import OutputParser
from pprint import pprint # debug

def load_prompts(path: str, filename: str) -> PromptTemplate:
    base_path = os.path.join("prompts", path)
    output_format = load_plaintext(base_path, "output.md")
    prompt = PromptTemplate(
        input_variables=[],
        template=load_plaintext(base_path, filename),
    )
    return prompt.partial(output=output_format)

def parse_json(rsp):
    try:
        objs = json.loads(rsp)
    except:
        code_block = OutputParser.parse_code(rsp, "json")
        objs = json.loads(code_block)
    return objs

def create_plan(goal, guidance, last_plan=""):
    plan_prompt = load_prompts("planning", "planning.yaml")
    template = plan_prompt.format(
        goal=goal,
        user_guidance=guidance,
        last_plan=last_plan,
        task_types=task_types,
        max_tasks=20,
    )
    logger.debug(template)

    plan = Plan(goal=goal)
    plan.context = guidance
    rsp = llm_aask(msg=template)

    tasks_json = parse_json(rsp)
    tasks = [Task(**task_config) for task_config in tasks_json]
    logger.debug(tasks)

    plan.add_tasks(tasks)
    return plan, tasks_json


user_goal = "抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档"
user_guidance = """大概的流程：
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md"""

plan, raw_json = create_plan(goal=user_goal, guidance=user_guidance)
pprint(plan.tasks)

2024-03-24 15:22:00.938 | DEBUG    | __main__:create_plan:34 - Respond to the human as helpfully and accurately as possible.

# User goal
抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档

大概的流程：
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md



# Available Task Types
**web scraping**: Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. 
**text extractor**: Perform extraction on the 'content' text using a large language model. 

# Task
Based on the user goal, write a plan or modify an existing plan of what you should do to achieve the goal. A plan consists of one to 20 tasks.
If you are modifying an existing plan, carefully follow the instruction, don't make unnecessary changes. Give the whole plan unless instructed to modify only one task of the plan.
If you encounter errors on the current task, revise and output the current single task only.
Output a list of jsons

[Task(task_id='1', dependent_task_ids=[], instruction='Navigate to pitchhub.36kr.com/financing-flash', task_type='web scraping', code='', result='', is_success=False, is_finished=False),
 Task(task_id='2', dependent_task_ids=['1'], instruction='Extract the visible text content of the page', task_type='web scraping', code='', result='', is_success=False, is_finished=False),
 Task(task_id='3', dependent_task_ids=['2'], instruction="Find the text content containing the word '快讯'", task_type='text extractor', code='', result='', is_success=False, is_finished=False),
 Task(task_id='4', dependent_task_ids=['3'], instruction="Extract the specific content of '快讯'", task_type='text extractor', code='', result='', is_success=False, is_finished=False),
 Task(task_id='5', dependent_task_ids=['4'], instruction="Save the extracted content into a markdown file named '快讯.md'", task_type='other', code='', result='', is_success=False, is_finished=False)]


### Plan Review

In [15]:
review_prompt = load_prompts("planning", "review.yaml")
template = review_prompt.format(
    goal=user_goal,
    user_guidance=user_guidance,
    task_types=task_types,
    content=raw_json,
)
# logger.debug(template)

rsp = llm_aask(msg=template)
rsp

'## Modifiy suggestions:\n\n* **Task 2:** Instead of extracting all visible text content, consider extracting only the text content containing specific elements like headings, paragraphs, or even specific words. This would make the task more focused and reduce unnecessary processing.\n* **Task 3:** Instead of searching for the text content containing the word "快讯", consider using regular expressions to extract the specific content you want. This would make the task more precise and eliminate the need for manual review.\n* **Task 4:** Instead of extracting the entire content of "快讯", consider extracting specific sections or elements of the content that are relevant to your goal. This would make the extracted content more concise and easier to process.'

In [16]:
# TODO: 修改成 plan_review.yaml 模板
last_plan = f"""# Last plan
## Plan
```json
{raw_json}
```

## Review
{rsp}
"""

plan, raw_json = create_plan(goal=user_goal, guidance=user_guidance, last_plan=last_plan)
pprint(plan.tasks)

2024-03-24 15:22:11.730 | DEBUG    | __main__:create_plan:34 - Respond to the human as helpfully and accurately as possible.

# User goal
抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档

大概的流程：
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md

# Last plan
## Plan
```json
[{'task_id': '1', 'dependent_task_ids': [], 'instruction': 'Navigate to pitchhub.36kr.com/financing-flash', 'task_type': 'web scraping'}, {'task_id': '2', 'dependent_task_ids': ['1'], 'instruction': 'Extract the visible text content of the page', 'task_type': 'web scraping'}, {'task_id': '3', 'dependent_task_ids': ['2'], 'instruction': "Find the text content containing the word '快讯'", 'task_type': 'text extractor'}, {'task_id': '4', 'dependent_task_ids': ['3'], 'instruction': "Extract the specific content of '快讯'", 'task_type': 'text extractor'}, {'task_id': '5', 'dependent_task_ids': ['4'], 'instruction': "Save the extracted content into a markdown f

[Task(task_id='1', dependent_task_ids=[], instruction='Navigate to pitchhub.36kr.com/financing-flash', task_type='web scraping', code='', result='', is_success=False, is_finished=False),
 Task(task_id='2', dependent_task_ids=['1'], instruction="Extract the text content containing the heading '快讯' and its subsequent paragraphs", task_type='text extractor', code='', result='', is_success=False, is_finished=False),
 Task(task_id='3', dependent_task_ids=['2'], instruction="Save the extracted content into a markdown file named '快讯.md'", task_type='other', code='', result='', is_success=False, is_finished=False)]


## Tasks execute

In [18]:
from metagpt.actions.di.execute_nb_code import ExecuteNbCode

pre_execute = """import asyncio
import nest_asyncio
nest_asyncio.apply()
"""

execute_code = ExecuteNbCode()
await execute_code.run(pre_execute)

('', True)

In [45]:
def execute_task(plan: Plan, plan_status="", task_guidance=""):
    codegen_prompt = load_prompts("task_codegen", "task_codegen.yaml")
    template = codegen_prompt.format(
        plan_status=plan_status,
        current_task=plan.current_task.instruction,
        task_guidance=task_guidance,
        tools=tools_list,
    )
    logger.debug(template)

    rsp = llm_aask(msg=template)
    logger.debug(rsp)

    code_block = OutputParser.parse_code(rsp, "python")
    execute_code._display(code_block, language="python")
    return code_block

code = execute_task(plan)
code

2024-03-24 15:44:14.054 | DEBUG    | __main__:execute_task:9 - As an AI Engineer, you need to help user to achieve their goal step by step in a continuous Jupyter notebook.

## Current Task
Navigate to pitchhub.36kr.com/financing-flash

## Task Guidance
Write complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
Specifically, 



# Tool Info

## Capabilities
- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python class or function.
- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..

## Available Tools:
Each tool is described in JSON format. When you call a tool, import the tool from imports first.
{"web scraping": {"type": "async_function", "description": "Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. ", "signature": "(url)", "parameters": "Args: url (str

To navigate to the specified URL and scrape its content, we will use the pre-defined tool `web scraping`. Here is the code that you can use in your Jupyter notebook to achieve the current task:

```python
from metagpt.tools.libs.web_scraping import scrape_web_playwright

# Define the URL to scrape
url = 'https://pitchhub.36kr.com/financing-flash'

# Asynchronously scrape the web page
scraped_data = await scrape_web_playwright(url)

# Output the scraped data
print(scraped_data)
```

This code will fetch the inner text and HTML structure of the web page at the given URL without duplicating any imports or previous tasks. Remember to run this code in a J

2024-03-24 15:44:19.761 | DEBUG    | __main__:execute_task:12 - To navigate to the specified URL and scrape its content, we will use the pre-defined tool `web scraping`. Here is the code that you can use in your Jupyter notebook to achieve the current task:

```python
from metagpt.tools.libs.web_scraping import scrape_web_playwright

# Define the URL to scrape
url = 'https://pitchhub.36kr.com/financing-flash'

# Asynchronously scrape the web page
scraped_data = await scrape_web_playwright(url)

# Output the scraped data
print(scraped_data)
```

This code will fetch the inner text and HTML structure of the web page at the given URL without duplicating any imports or previous tasks. Remember to run this code in a Jupyter notebook cell that is capable of handling asynchronous operations.


upyter notebook cell that is capable of handling asynchronous operations.


"from metagpt.tools.libs.web_scraping import scrape_web_playwright\n\n# Define the URL to scrape\nurl = 'https://pitchhub.36kr.com/financing-flash'\n\n# Asynchronously scrape the web page\nscraped_data = await scrape_web_playwright(url)\n\n# Output the scraped data\nprint(scraped_data)\n"

In [46]:
result, success = await execute_code.run(code)
success, result

(True,
 "{'inner_text': '首页\\n融资快报\\n融资事件\\n项目库\\n机构库\\n项目集\\n定向对接\\n融通创新\\n公司/项目名/投资机构/赛道\\n\\xa0\\n返回36氪\\n登录\\n融资快报\\n文章\\nPocketHealth获3300万美元B轮融资，研发医学影像共享平台 | 海外New Things\\n该公司推出了Report Reader，用于解释患者报告中的复杂医学术语，提高患者对医疗信息的理解。\\n2分钟前\\n海若镜\\n文章\\nClasp Therapeutics获1.5亿美元A轮融资，研发针对肿瘤的精准免疫疗法 | 海外New Things\\n重新引导T细胞杀死癌细胞，同时保护全身的健康细胞。\\n4分钟前\\n海若镜\\n文章\\nEngrail Therapeutics获1.57亿美元B轮融资，研发针对焦虑症等的疗法 | 海外New Things\\nEngrail应用精密化学和药理学，研发治疗焦虑症、抑郁症、创伤后应激障碍和罕见神经退行性疾病等具有重大未满...\\n8分钟前\\n海若镜\\n快讯\\n汽车智能化产业企业“博泰车联网”再获约15亿元股权融资\\n36氪获悉，汽车智能化产业企业“博泰车联网”近日成功斩获15亿元新一轮融资，将重点投入新一代智能化融域控产品的研发，并全面加速智驾域控的全栈技术创新。同时，公司计划深化舱驾融合解决方案及高性能中央计算平台，以上海为轴心，联动浙江、安徽、江苏，构建长三角一体化的智能制造与供应链新体系，区域布局未来灯塔工厂。\\n博泰车联网\\n战略融资\\n上海市\\n2009年成立\\n智能网联产品与技术服务提供商\\n6小时前\\n快讯\\n深圳“进化动力”完成C轮融资\\n36氪广东获悉，据“紫荆汇富”微信公众号消息，深圳进化动力数码科技有限公司（下称“进化动力”）完成C轮融资，投资方为紫荆汇富、知成基金、天瑞丰年、一桥基金。本轮融资资金将用于建设新生产线并加大布局赣州龙南地区。“进化动力”成立于2015年，公司提供端侧训练-学习芯片及系统平台，为供应链金融、保险、新商业、生鲜零售、智慧农贸、智慧城市、移动机器人等领域打造芯片产品及系统解决方案。原文链接\\n进化动力\\nB轮\\n广东省\\n2015年成立\\n商业视觉智能芯片及平台提供商\

In [None]:
status = f"""### Error
```
{result}
```

### Previous code
```python
{code}
```
"""

code = execute_task(plan, plan_status=status)
code