In [1]:
from __future__ import annotations

import sys
import asyncio
import nest_asyncio
from utils.common import load_plaintext

# debug level
from metagpt.logs import logger
logger.remove()
logger.add(sys.stderr, level="DEBUG")

# make asyncio.run() works in notebook
nest_asyncio.apply()

2024-04-10 19:52:21.098 | INFO     | metagpt.const:get_metagpt_package_root:29 - Package root set to /Users/deryzhou/Downloads/StreamChatPlayground/notebooks


## LLM 与 tools 准备

In [2]:
# 智普AI
# ~/.metagpt/config2.yaml
from metagpt.config2 import config
from metagpt.provider.zhipuai_api import ZhiPuAILLM
# from metagpt.utils.cost_manager import CostManager

llm = ZhiPuAILLM(config.llm)
llm.use_system_prompt = False # Disable default system message
# llm.cost_manager = CostManager()

In [2]:
# 本地 OpenAI like (vllm/llama.cpp)
import yaml
from metagpt.configs.llm_config import LLMConfig
from metagpt.provider.openai_api import OpenAILLM
# from metagpt.utils.cost_manager import CostManager

llm_configs = yaml.safe_load(load_plaintext("../", "vllm_local.yaml"))
llm_config = LLMConfig.model_validate(llm_configs['llm'])
llm = OpenAILLM(llm_config)
llm.use_system_prompt = False # Disable default system message
# llm.cost_manager = CostManager()

models = await llm.aclient.models.list()
for idx, mod in enumerate(models.data):
    print(f"{idx+1}. {mod.id} ({mod.owned_by})")

1. mistral-7b-instruct-v0.2 (llamacpp)


In [2]:
# 混元
import yaml
from metagpt.configs.llm_config import LLMConfig
from provider.hunyuan_api import HunyuanAPI

llm_configs = yaml.safe_load(load_plaintext("../", "hunyuan.yaml"))
llm_config = LLMConfig.model_validate(llm_configs['hunyuan'])
llm = HunyuanAPI(llm_config, model="7b-code-sft-deryzhou", temperature=0.7)

In [3]:
from functools import lru_cache

@lru_cache
def llm_aask(msg, seed=None):
    return asyncio.run(llm.aask(msg=msg))

llm_aask("你好，介绍下你自己")

2024-04-10 19:52:25.628 | DEBUG    | metagpt.provider.base_llm:aask:149 - [{'role': 'user', 'content': '你好，介绍下你自己'}]


 你好！我是一名英语语言模型，我可以帮您生成和翻译文本。我可以回答问题，写短文本，翻译单词或短句，甚至是整个文章。我可以学习新知识和词汇，以适应您的需求。请告诉我如何帮您，我一定会尽力满足您的期望。

Hello! I am an English language model. I can help you generate and translate text. I can answer questions, write short texts, translate single words or short sentences, even whole articles. I can learn new knowledge and vocabulary to suit your needs. Please tell me how I can help you, I will do my best to meet your expectations.


' 你好！我是一名英语语言模型，我可以帮您生成和翻译文本。我可以回答问题，写短文本，翻译单词或短句，甚至是整个文章。我可以学习新知识和词汇，以适应您的需求。请告诉我如何帮您，我一定会尽力满足您的期望。\n\nHello! I am an English language model. I can help you generate and translate text. I can answer questions, write short texts, translate single words or short sentences, even whole articles. I can learn new knowledge and vocabulary to suit your needs. Please tell me how I can help you, I will do my best to meet your expectations.'

In [4]:
import json
import inspect
from metagpt.tools.tool_convert import function_docstring_to_schema
from metagpt.tools.libs.web_scraping import scrape_web_playwright
from tools.text_extractor.llm_extractor import llm_extractor

def function_to_schema(func):
    docstring = inspect.getdoc(func)
    schema = function_docstring_to_schema(func, docstring)
    schema["import"] = f"from {func.__module__} import {func.__name__}"
    return schema

DEF_TOOLS = [
    ("web scraping", scrape_web_playwright),
    ("text extractor", llm_extractor),
]
tools = {}
for name, func in DEF_TOOLS:
    schema = function_to_schema(func)
    tools[name] = schema
tools_list = "\n".join([ json.dumps({k:v}) for k,v in tools.items() ])

task_types = "\n".join([
    f"**{k}**: {v['description']}" for k,v in tools.items()
])
print(task_types + "\n")
print(tools_list)

**web scraping**: Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. 
**text extractor**: Perform extraction on the 'content' text using a large language model. 

{"web scraping": {"type": "async_function", "description": "Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. ", "signature": "(url)", "parameters": "Args: url (str): The main URL to fetch inner text from. Returns: dict: The inner text content and html structure of the web page, keys are 'inner_text', 'html'.", "import": "from metagpt.tools.libs.web_scraping import scrape_web_playwright"}}
{"text extractor": {"type": "async_function", "description": "Perform extraction on the 'content' text using a large language model. ", "signature": "(guidance: str, content: str, format: str) -> str", "parameters": "Args: guidance (str): Guide the extraction process. content (str): The text content that needs to be extracted. format

## Plan

In [8]:
import os
from langchain.prompts import PromptTemplate
from metagpt.schema import Plan, Task
from metagpt.utils.common import OutputParser
from pprint import pprint # debug

def load_prompts(path: str, filename: str) -> PromptTemplate:
    base_path = os.path.join("prompts", path)
    output_format = load_plaintext(base_path, "output.md")
    prompt = PromptTemplate(
        input_variables=[],
        template=load_plaintext(base_path, filename),
    )
    return prompt.partial(output=output_format)

def parse_json(rsp):
    try:
        objs = json.loads(rsp)
    except:
        code_block = OutputParser.parse_code(rsp, "json")
        objs = json.loads(code_block)
    return objs

def create_plan(goal, guidance, last_plan=""):
    plan_prompt = load_prompts("planning", "planning.yaml")
    template = plan_prompt.format(
        goal=goal,
        user_guidance=guidance,
        last_plan=last_plan,
        task_types=task_types,
        max_tasks=20,
    )
    logger.debug(template)

    plan = Plan(goal=goal)
    plan.context = guidance
    rsp = llm_aask(msg=template)

    tasks_json = parse_json(rsp)
    tasks = [Task(**task_config) for task_config in tasks_json]
    logger.debug(tasks)

    plan.add_tasks(tasks)
    return plan, tasks_json


user_goal = "抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档"
user_guidance = """# 可能的流程
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md"""

plan, raw_json = create_plan(goal=user_goal, guidance=user_guidance)
pprint(plan.tasks)

2024-04-10 19:53:27.586 | DEBUG    | __main__:create_plan:33 - Respond to the human as helpfully and accurately as possible.

# User goal
抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档

# 可能的流程
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md



# Available Task Types
**web scraping**: Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. 
**text extractor**: Perform extraction on the 'content' text using a large language model. 

# Task
Based on the user goal, write a plan or modify an existing plan of what you should do to achieve the goal. A plan consists of one to 20 tasks.
If you are modifying an existing plan, carefully follow the instruction, don't make unnecessary changes. Give the whole plan unless instructed to modify only one task of the plan.
If you encounter errors on the current task, revise and output the current single task only.
Output a list of json

 [
  {
    "task_id": "1",
    "dependent_task_ids": [],
    "instruction": "Use Playwright to scrape the HTML content of the webpage https://pitchhub.36kr.com/financing-flash",
    "task_type": "Scrape_HTML"
  },
  {
    "task_id": "2",
    "dependent_task_ids": ["1"],
    "instruction": "Extract the text content of the webpage using extracted HTML",
    "task_type": "Extract_Text"
  },
  {
    "task_id": "3",
    "dependent_task_ids": ["2"],
    "instruction": "Parse the extracted text to identify the desired information",
    "task_type": "Parse_Text"
  },
  {
    "task_id": "4",
    "dependent_task_ids": ["3"],
    "instruction": "Store the identified information in a variable for further use",
    "task_type": "Store_Information"
  }

2024-04-10 19:53:39.134 | DEBUG    | __main__:create_plan:41 - [Task(task_id='1', dependent_task_ids=[], instruction='Use Playwright to scrape the HTML content of the webpage https://pitchhub.36kr.com/financing-flash', task_type='Scrape_HTML', code='', result='', is_success=False, is_finished=False), Task(task_id='2', dependent_task_ids=['1'], instruction='Extract the text content of the webpage using extracted HTML', task_type='Extract_Text', code='', result='', is_success=False, is_finished=False), Task(task_id='3', dependent_task_ids=['2'], instruction='Parse the extracted text to identify the desired information', task_type='Parse_Text', code='', result='', is_success=False, is_finished=False), Task(task_id='4', dependent_task_ids=['3'], instruction='Store the identified information in a variable for further use', task_type='Store_Information', code='', result='', is_success=False, is_finished=False)]



]
[Task(task_id='1', dependent_task_ids=[], instruction='Use Playwright to scrape the HTML content of the webpage https://pitchhub.36kr.com/financing-flash', task_type='Scrape_HTML', code='', result='', is_success=False, is_finished=False),
 Task(task_id='2', dependent_task_ids=['1'], instruction='Extract the text content of the webpage using extracted HTML', task_type='Extract_Text', code='', result='', is_success=False, is_finished=False),
 Task(task_id='3', dependent_task_ids=['2'], instruction='Parse the extracted text to identify the desired information', task_type='Parse_Text', code='', result='', is_success=False, is_finished=False),
 Task(task_id='4', dependent_task_ids=['3'], instruction='Store the identified information in a variable for further use', task_type='Store_Information', code='', result='', is_success=False, is_finished=False)]


### Plan Review

In [23]:
review_prompt = load_prompts("planning", "review.yaml")
template = review_prompt.format(
    goal=user_goal,
    user_guidance=user_guidance,
    task_types=task_types,
    content=raw_json,
)
# logger.debug(template)

rsp = llm_aask(msg=template)
rsp

2024-04-08 21:48:46.003 | DEBUG    | metagpt.provider.base_llm:aask:149 - [{'role': 'user', 'content': 'Review the plan and determine if the plan can achieve the user goal.\n\n# User Goal\n抓取 https://pitchhub.36kr.com/financing-flash 中\'快讯\'的内容，并整理成markdown存档\n\n# Plan\n```json\n[{\'task_id\': \'1\', \'dependent_task_ids\': [], \'instruction\': \'Choose a programming language and set up the required tools for web scraping and text extraction.\', \'task_type\': \'setup\'}, {\'task_id\': \'2\', \'dependent_task_ids\': [\'1\'], \'instruction\': \'Clone the repository or create a new project directory for the web scraping and text extraction tasks.\', \'task_type\': \'file_management\'}, {\'task_id\': \'3\', \'dependent_task_ids\': [\'2\'], \'instruction\': \'Write a script using Playwright to scrape the HTML structure and inner text content of the specified URL (https://pitchhub.36kr.com/financing-flash).\', \'task_type\': \'web_scraping\'}, {\'task_id\': \'4\', \'dependent_task_ids\': [\

The plan seems to be properly set up for achieving the user goal. The tasks are well-defined, and each step logically follows the previous one. The use of the provided task types also makes it easier to understand the purpose of each task. The plan format meets the output requirements.

However, there's one suggestion to make the plan more efficient:

Instruction for task 4 could be combined with task 3 to save time and resources:

```json
- Replace task 4 with:
  {"task_id": "3", "dependent_task_ids": ["2"], "instruction": "Write a script using Playwright to scrape the HTML structure and inner text content of the specified URL (https://pitchhub.36kr.com/financing-flash), and parse the obtained HTML structure to extract the content within the 'quick-news' section.", "task_type": "web_scraping, html_parsing"}
```

This way, the scraping and parsing will be done in a single task, reducing the need to repeat the web scraping process.


'The plan seems to be properly set up for achieving the user goal. The tasks are well-defined, and each step logically follows the previous one. The use of the provided task types also makes it easier to understand the purpose of each task. The plan format meets the output requirements.\n\nHowever, there\'s one suggestion to make the plan more efficient:\n\nInstruction for task 4 could be combined with task 3 to save time and resources:\n\n```json\n- Replace task 4 with:\n  {"task_id": "3", "dependent_task_ids": ["2"], "instruction": "Write a script using Playwright to scrape the HTML structure and inner text content of the specified URL (https://pitchhub.36kr.com/financing-flash), and parse the obtained HTML structure to extract the content within the \'quick-news\' section.", "task_type": "web_scraping, html_parsing"}\n```\n\nThis way, the scraping and parsing will be done in a single task, reducing the need to repeat the web scraping process.'

In [24]:
# TODO: 修改成 plan_review.yaml 模板
last_plan = f"""# Last plan
## Plan
```json
{raw_json}
```

## Review
{rsp}
"""

plan, raw_json = create_plan(goal=user_goal, guidance=user_guidance, last_plan=last_plan)
pprint(plan.tasks)

2024-04-08 21:48:52.022 | DEBUG    | __main__:create_plan:33 - Respond to the human as helpfully and accurately as possible.

# User goal
抓取 https://pitchhub.36kr.com/financing-flash 中'快讯'的内容，并整理成markdown存档

大概的流程：
- 使用工具抓取网页中的可见文本
- 提取网页文本中'快讯'相关的内容。注意网页中可能包含导航，只需要抽取'快讯'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: 快讯.md

# Last plan
## Plan
```json
[{'task_id': '1', 'dependent_task_ids': [], 'instruction': 'Choose a programming language and set up the required tools for web scraping and text extraction.', 'task_type': 'setup'}, {'task_id': '2', 'dependent_task_ids': ['1'], 'instruction': 'Clone the repository or create a new project directory for the web scraping and text extraction tasks.', 'task_type': 'file_management'}, {'task_id': '3', 'dependent_task_ids': ['2'], 'instruction': 'Write a script using Playwright to scrape the HTML structure and inner text content of the specified URL (https://pitchhub.36kr.com/financing-flash).', 'task_type': 'web_scraping'}, {'task_id': '4', 'dependent_task

Given the user goal and the suggestion to combine the parsing step with the web scraping step, the updated plan is as follows:

```json
[{'task_id': '1', 'dependent_task_ids': [], 'instruction': 'Choose a programming language and set up the required tools for web scraping and text extraction.', 'task_type': 'setup'}, {'task_id': '2', 'dependent_task_ids': ['1'], 'instruction': 'Clone the repository or create a new project directory for the web scraping and text extraction tasks.', 'task_type': 'file_management'}, {'task_id': '3', 'dependent_task_ids': ['2'], 'instruction': 'Write a script using Playwright to scrape the HTML structure and inner text content of the specified URL (https://pitchhub.36kr.com/financing-flash) and parse the obtained HTML structure to extract the content within the \'quick-news\' section.', 'task_type': 'web_scraping, html_parsing'}, {'task_id': '4', 'dependent_task_ids': ['3'], 'instruction': "Use a large language model to extract the '快讯' content from the pa

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)

## Tasks execute

### playwright

In [9]:
from metagpt.actions.di.execute_nb_code import ExecuteNbCode

pre_execute = """import asyncio
import nest_asyncio
nest_asyncio.apply()
"""

# append imports
for _, t in tools.items():
    pre_execute += "\n" + t["import"]

execute_code = ExecuteNbCode()
result, success = await execute_code.run(pre_execute)

In [11]:
def execute_task(plan: Plan, plan_status="", task_guidance=""):
    codegen_prompt = load_prompts("task_codegen", "task_codegen.yaml")
    template = codegen_prompt.format(
        plan_status=plan_status,
        current_task=plan.current_task.instruction,
        task_guidance=task_guidance,
        tools=tools_list,
    )
    logger.debug(template)

    rsp = llm_aask(msg=template, seed=123)
    logger.debug(rsp)

    code_block = OutputParser.parse_code(rsp, "python")
    execute_code._display(code_block, language="python")
    return code_block

code = execute_task(plan, task_guidance="所有依赖均已经导入，无需提供pip或者环境相关内容")
code

2024-04-10 19:54:23.775 | DEBUG    | __main__:execute_task:9 - As an AI Engineer, you need to help user to achieve their goal step by step in a continuous Jupyter notebook.

## Current Task
Use Playwright to scrape the HTML content of the webpage https://pitchhub.36kr.com/financing-flash

## Task Guidance
Write complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
Specifically, 所有依赖均已经导入，无需提供pip或者环境相关内容



# Tool Info

## Capabilities
- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python class or function.
- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..

## Available Tools:
Each tool is described in JSON format. All tools was import by default.
{"web scraping": {"type": "async_function", "description": "Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. "

 Based on the given instructions, here's an example of how you might implement the `scrape_web_playwright` function using Playwright:

```python
import asyncio
from playwright.async_api import Playwright, async_playwright

async def scrape_web_playwright(url):
    async with async_playwright() as p:
        context = await p.chromium.launch()
        page = await context.new_page()
        await page.goto(url)
        inner_text = await page.innerText()
        html = await page.content()
        await context.close()
        return inner_text, html

async def main():
    url = "https://example.com"
    inner_text, html = await scrape_web_playwright(url)
    print(f"Inner text: {inner_text}")
    print(f"HTML: {html}")

if __name__ == "__main__":
    asyncio.run(main())
```

This code uses the `async_playwright` library to launch a Chromium browser, navigate to a URL, extract the inner text and HTML content, and then close the browser. The `main` function is defined as an async functio

2024-04-10 19:54:45.388 | DEBUG    | __main__:execute_task:12 -  Based on the given instructions, here's an example of how you might implement the `scrape_web_playwright` function using Playwright:

```python
import asyncio
from playwright.async_api import Playwright, async_playwright

async def scrape_web_playwright(url):
    async with async_playwright() as p:
        context = await p.chromium.launch()
        page = await context.new_page()
        await page.goto(url)
        inner_text = await page.innerText()
        html = await page.content()
        await context.close()
        return inner_text, html

async def main():
    url = "https://example.com"
    inner_text, html = await scrape_web_playwright(url)
    print(f"Inner text: {inner_text}")
    print(f"HTML: {html}")

if __name__ == "__main__":
    asyncio.run(main())
```

This code uses the `async_playwright` library to launch a Chromium browser, navigate to a URL, extract the inner text and HTML content, and then close

 object.


'import asyncio\nfrom playwright.async_api import Playwright, async_playwright\n\nasync def scrape_web_playwright(url):\n    async with async_playwright() as p:\n        context = await p.chromium.launch()\n        page = await context.new_page()\n        await page.goto(url)\n        inner_text = await page.innerText()\n        html = await page.content()\n        await context.close()\n        return inner_text, html\n\nasync def main():\n    url = "https://example.com"\n    inner_text, html = await scrape_web_playwright(url)\n    print(f"Inner text: {inner_text}")\n    print(f"HTML: {html}")\n\nif __name__ == "__main__":\n    asyncio.run(main())\n'

In [11]:
result, success = await execute_code.run(code)
success, result

(True,
 '首页\n融资快报\n融资事件\n项目库\n机构库\n项目集\n定向对接\n融通创新\n公司/项目名/投资机构/赛道\n\xa0\n返回36氪\n登录\n融资快报\n快讯\n“特斯联”完成20亿元D轮融资\n36氪获悉，人工智能物联网（AIoT）企业“特斯联”近日完成D轮20亿元融资。本轮融资由AL Capital、阳明股权投资基金共同领投，福田资本、金地集团、重科控股、数字重庆、南昌政府平台公司、徐州产业基金、北科建集团、光大控股、商汤科技等新老股东跟投。本轮融资后，特斯联将进一步夯实数智化基础设施，深化“模型+系统”的比特大模型开放平台。原文链接\n特斯联\n战略融资\n重庆市\n2015年成立\n城市级智能物联网服务提供商\n5小时前\n快讯\n无人机研发及赛事运营商“美冠挚友”获天使+轮融资\n36氪获悉，无人机研发及赛事运营商“美冠挚友（星奇世界HISINGY）”宣布完成天使+轮融资，本轮融资由中科优势旗下探方资本领投，老股东火凤资本、真成投资持续跟投，探究资本担任独家财务顾问。本轮融资资金将用于“低价格高品质”的全新娱乐穿越机套装产品的研发及投产、新消费场景的布局及各传统玩具线下渠道推进。原文链接\n6小时前\n文章\n\u200b36氪广东首发｜AI Infra供应商「星凡科技」获近亿元Pre-A轮融资，赋能算力中心建设运营\n2023年公司全年预计实现过亿元营收。\n星凡科技\n天使轮\n四川省\n2021年成立\n元宇宙基础数字平台\n6小时前\n许璧端@36氪广东\n文章\n「特斯联」完成20亿元D轮融资，加速模型+系统在AIoT领域落地｜硬氪独家\n特斯联初探领域大模型应用，加速企业数智化转型。\n特斯联\n战略融资\n重庆市\n2015年成立\n城市级智能物联网服务提供商\n6小时前\n黄 楠\n文章\n「美冠挚友（星奇世界HISINGY）」获天使+轮融资，持续推动娱乐无人机与低空经济发展\n这是星奇世界自2022年完成天使轮和战略轮后的第三轮融资，团队正在拓展无人机与潮玩的边界。\n星奇世界HISINGY\n战略融资\n北京市\n2018年成立\n潮玩品牌\n6小时前\n刘士武\n文章\n云澎科技完成数千万元A轮融资\n由亚投资本投资。\n21小时前\n时氪分享\n文章\n世纪云安完成数亿元A+轮融资、华发集团与华为在全屋智

In [10]:
status = f"""### Error
抓取内容不符合预期。请检查输入的URL，需要抓取的是 https://pitchhub.36kr.com/financing-flash

```
{result}
```

### Previous code
```python
{code}
```
"""

code = execute_task(plan, plan_status=status)
code

2024-04-09 15:25:04.613 | DEBUG    | __main__:execute_task:9 - As an AI Engineer, you need to help user to achieve their goal step by step in a continuous Jupyter notebook.

## Current Task
Navigate to the Pitchhub.36kr.com website

## Task Guidance
Write complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
Specifically, 

### Error
抓取内容不符合预期。请检查输入的URL，需要抓取的是 https://pitchhub.36kr.com/financing-flash

```
首页
融资快报
融资事件
项目库
机构库
项目集
定向对接
融通创新
公司/项目名/投资机构/赛道
 
返回36氪
登录
认证投资人专享-助力对接优质项目
为投资人对接全市场创业项目——媒体独家资源，快速链接创始团队，深挖水下项目，助力高效Deal Sourcing
36氪创投平台正式推出企业融资一站式解决方案——36氪创投平台融资加速服务
36氪创投平台基于媒体业务十余年积累，整合内部投资人资源、王牌路演活动、企业号产品和财务顾问服务为创业企业提供全方位的融资加速服务，目前已与数百家头部投资机构建立深度合作，67%的企业被36氪曝光或服务后成功获得了融资。
认证投资人专享-助力对接优质项目
为投资人对接全市场创业项目——媒体独家资源，快速链接创始团队，深挖水下项目，助力高效Deal Sourcing
36氪创投平台正式推出企业融资一站式解决方案——36氪创投平台融资加速服务
36氪创投平台基于媒体业务十余年积累，整合内部投资人资源、王牌路演活动、企业号产品和财务顾问服务为创业企业提供全方位的融资加速服务，目前已与数百家头部投资机构建立深度合作，67%的企业被36氪曝光或服务后成功获得了融资。

```python
from metagpt.tools.libs.web_scraping import scrape_web_playwright

url = "https://pitchhub.36kr.com/financing-flash"

# Scrape the web page using Playwright
result = await scrape_web_playwright(url)

# Extract text from the scraped HTML
text = result['inner_text']

# Output the extracted text

2024-04-09 15:25:08.321 | INFO     | metagpt.utils.cost_manager:update_cost:108 - prompt_tokens: 1883, completion_tokens: 100
2024-04-09 15:25:08.323 | DEBUG    | __main__:execute_task:12 - ```python
from metagpt.tools.libs.web_scraping import scrape_web_playwright

url = "https://pitchhub.36kr.com/financing-flash"

# Scrape the web page using Playwright
result = await scrape_web_playwright(url)

# Extract text from the scraped HTML
text = result['inner_text']

# Output the extracted text
print(text)
```



print(text)
```


'from metagpt.tools.libs.web_scraping import scrape_web_playwright\n\nurl = "https://pitchhub.36kr.com/financing-flash"\n\n# Scrape the web page using Playwright\nresult = await scrape_web_playwright(url)\n\n# Extract text from the scraped HTML\ntext = result[\'inner_text\']\n\n# Output the extracted text\nprint(text)\n'

In [14]:
from metagpt.schema import TaskResult

plan.current_task.update_task_result(task_result=TaskResult(code=code, result=result, is_success=success))
plan.finish_current_task()

task_id='1' dependent_task_ids=[] instruction='Navigate to the Pitchhub.36kr.com website' task_type='web scraping' code='from metagpt.tools.libs.web_scraping import scrape_web_playwright\n\nurl = "https://pitchhub.36kr.com/financing-flash"\n\n# Scrape the web page using Playwright\nresult = await scrape_web_playwright(url)\n\n# Extract text from the scraped HTML\ntext = result[\'inner_text\']\n\n# Output the extracted text\nprint(text)\n' result='首页\n融资快报\n融资事件\n项目库\n机构库\n项目集\n定向对接\n融通创新\n公司/项目名/投资机构/赛道\n\xa0\n返回36氪\n登录\n融资快报\n快讯\n“特斯联”完成20亿元D轮融资\n36氪获悉，人工智能物联网（AIoT）企业“特斯联”近日完成D轮20亿元融资。本轮融资由AL Capital、阳明股权投资基金共同领投，福田资本、金地集团、重科控股、数字重庆、南昌政府平台公司、徐州产业基金、北科建集团、光大控股、商汤科技等新老股东跟投。本轮融资后，特斯联将进一步夯实数智化基础设施，深化“模型+系统”的比特大模型开放平台。原文链接\n特斯联\n战略融资\n重庆市\n2015年成立\n城市级智能物联网服务提供商\n5小时前\n快讯\n无人机研发及赛事运营商“美冠挚友”获天使+轮融资\n36氪获悉，无人机研发及赛事运营商“美冠挚友（星奇世界HISINGY）”宣布完成天使+轮融资，本轮融资由中科优势旗下探方资本领投，老股东火凤资本、真成投资持续跟投，探究资本担任独家财务顾问。本轮融资资金将用于“低价格高品质”的全新娱乐穿越机套装产品的研发及投产、新消费场景的布局及各传统玩具线下渠道推进。原文链接\n6小时前\n文章\n\u200b36氪广东首发｜AI Infra供应商「

### llm extractor

In [17]:
plan_status = """## Finished Tasks
"""

code = execute_task(plan, plan_status=plan_status, task_guidance="仅专注于当前任务，上一步执行的变量可以直接使用")
code

2024-04-09 15:43:27.916 | DEBUG    | __main__:execute_task:9 - As an AI Engineer, you need to help user to achieve their goal step by step in a continuous Jupyter notebook.

## Current Task
Extract the HTML structure and inner text content of the 'financing-flash' page

## Task Guidance
Write complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
Specifically, 仅专注于当前任务，上一步执行的变量可以直接使用



# Tool Info

## Capabilities
- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python class or function.
- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..

## Available Tools:
Each tool is described in JSON format. All tools was import by default.
{"web scraping": {"type": "async_function", "description": "Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. ", "signature": "(url)

```python
from metagpt.tools.libs.web_scraping import scrape_web_playwright
from tools.text_extractor.llm_extractor import llm_extractor

# Scrape the HTML structure and inner text content of the 'financing-flash' page
url = "https://finance.sina.com.cn/flash/hkhc/2023-03-24/doc-imukvuxu9025020.d.html"
result = await scrape_web_playwright(url)

# Extract the inner text content from the scraped HTML
inner_text = result['inner_text']

# Extract specific information from the inner text using the text extractor
extraction_guidance = "Extract the title, time, and content from the article."
extracted_info = llm_extractor(extraction_guidance, inner_text, format="json")

# Print the extracted information

2024-04-09 15:43:32.921 | INFO     | metagpt.utils.cost_manager:update_cost:108 - prompt_tokens: 567, completion_tokens: 219
2024-04-09 15:43:32.927 | DEBUG    | __main__:execute_task:12 - ```python
from metagpt.tools.libs.web_scraping import scrape_web_playwright
from tools.text_extractor.llm_extractor import llm_extractor

# Scrape the HTML structure and inner text content of the 'financing-flash' page
url = "https://finance.sina.com.cn/flash/hkhc/2023-03-24/doc-imukvuxu9025020.d.html"
result = await scrape_web_playwright(url)

# Extract the inner text content from the scraped HTML
inner_text = result['inner_text']

# Extract specific information from the inner text using the text extractor
extraction_guidance = "Extract the title, time, and content from the article."
extracted_info = llm_extractor(extraction_guidance, inner_text, format="json")

# Print the extracted information
print(extracted_info)
```



print(extracted_info)
```


'from metagpt.tools.libs.web_scraping import scrape_web_playwright\nfrom tools.text_extractor.llm_extractor import llm_extractor\n\n# Scrape the HTML structure and inner text content of the \'financing-flash\' page\nurl = "https://finance.sina.com.cn/flash/hkhc/2023-03-24/doc-imukvuxu9025020.d.html"\nresult = await scrape_web_playwright(url)\n\n# Extract the inner text content from the scraped HTML\ninner_text = result[\'inner_text\']\n\n# Extract specific information from the inner text using the text extractor\nextraction_guidance = "Extract the title, time, and content from the article."\nextracted_info = llm_extractor(extraction_guidance, inner_text, format="json")\n\n# Print the extracted information\nprint(extracted_info)\n'

In [12]:
execute_code._display(result, language="markdown")

Output()