In [1]:
from __future__ import annotations

import asyncio
import nest_asyncio

# Patch asyncio so that asyncio.run() can be called from notebook cells,
# where an event loop is already running.
nest_asyncio.apply()

In [2]:
from metagpt.config2 import Config
from metagpt.provider.llm_provider_registry import create_llm_instance
from metagpt.utils.cost_manager import CostManager

# Load the default configuration; its `llm` section drives which provider is instantiated.
cfg = Config.default()

# Build the LLM client from the configured provider settings.
llm = create_llm_instance(cfg.llm)
# Disabled: would attach a fresh CostManager to the client.
# llm.cost_manager = CostManager()
# Bare expression so the notebook displays which provider class was created.
llm

2024-03-17 09:04:26.206 | INFO     | metagpt.const:get_metagpt_package_root:29 - Package root set to /root/workspace/StreamChatPlayground/notebooks


<metagpt.provider.zhipuai_api.ZhiPuAILLM at 0x7f7aaba6e050>

In [7]:
# Smoke test: send a one-word greeting ("你好" = "hello") and display the reply.
asyncio.run(llm.aask("你好"))

Of course, I'm here to help. How can I assist you

2024-03-16 23:03:23.170 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.000 | Max budget: $10.000 | Current cost: $0.000, prompt_tokens: 14, completion_tokens: 18


 today?


"Of course, I'm here to help. How can I assist you today?"

In [9]:
import json
from typing import Union, Literal
from pydantic import Field, model_validator

from metagpt.logs import logger
from metagpt.roles import Role
from metagpt.schema import Message, Task, TaskResult
from metagpt.strategy.task_type import TaskType

from metagpt.tools.tool_recommend import BM25ToolRecommender, ToolRecommender
from metagpt.actions.di.execute_nb_code import ExecuteNbCode
from metagpt.actions.di.write_analysis_code import CheckData, WriteAnalysisCode
from metagpt.prompts.di.write_analysis_code import DATA_INFO
from metagpt.utils.common import CodeParser

In [10]:
# Prompt template filled in via str.format() with `user_requirement` and `context`.
# The doubled braces ({{ }}) are format-escapes that render as literal JSON braces.
# The reply is expected to be a JSON object with a "thoughts" string and a "state" boolean.
REACT_THINK_PROMPT = """
# User Requirement
{user_requirement}
# Context
{context}

Output a json following the format:
```json
{{
    "thoughts": str = "Thoughts on current situation, reflect on how you should proceed to fulfill the user requirement",
    "state": bool = "Decide whether you need to take more actions to complete the user requirement. Return true if you think so. Return false if you think the requirement has been completely fulfilled."
}}
```
"""

class CodeInterpreter(Role):
    """Role that fulfils a user requirement by writing analysis code and executing it.

    Two modes are supported, selected by ``react_mode``:

    * ``"plan_and_act"``: the base ``Role`` drives a plan and calls ``_act_on_task``
      for each task; the code executor is terminated once the plan finishes or fails.
    * ``"react"``: ``_think`` asks the LLM whether more work is needed and ``_act``
      writes and runs one more piece of code.
    """

    name: str = "OpenDevin"
    profile: str = "CodeInterpreter"
    # Forwarded to Role._set_react_mode by the validator below.
    auto_run: bool = True
    # Convenience flag; always overwritten by the validator with (react_mode == "plan_and_act").
    use_plan: bool = True
    # When True, WriteAnalysisCode is asked to use reflection from the second attempt onwards.
    use_reflection: bool = False
    # Code executor shared by all attempts; excluded from model serialization.
    execute_code: ExecuteNbCode = Field(default_factory=ExecuteNbCode, exclude=True)
    tools: Union[str, list[str]] = []  # Use special symbol ["<all>"] to indicate use of all registered tools
    # Built by the validator when `tools` is non-empty; stays None otherwise.
    tool_recommender: ToolRecommender = None
    react_mode: Literal["plan_and_act", "react"] = "plan_and_act"
    max_react_loop: int = 10  # used for react mode

    @model_validator(mode="after")
    def set_plan_and_tool(self) -> "CodeInterpreter":
        """Finish initialisation after field validation.

        Configures the react mode, derives ``use_plan`` from ``react_mode``, builds a
        BM25 tool recommender when ``tools`` is non-empty, and registers
        ``WriteAnalysisCode`` as the role's only action.

        Returns:
            The validated instance itself.
        """
        self._set_react_mode(react_mode=self.react_mode, max_react_loop=self.max_react_loop, auto_run=self.auto_run)
        # create a flag for convenience, overwrite any passed-in value
        self.use_plan = self.react_mode == "plan_and_act"
        if self.tools:
            self.tool_recommender = BM25ToolRecommender(tools=self.tools)
        self.set_actions([WriteAnalysisCode])
        self._set_state(0)
        return self

    @property
    def working_memory(self):
        """Shortcut for ``self.rc.working_memory``."""
        return self.rc.working_memory

    async def _think(self) -> bool:
        """Useful in 'react' mode. Use LLM to decide whether and what to do next.

        Returns:
            True when another action should be taken, otherwise the "state" value
            parsed from the LLM's JSON reply.

        Raises:
            json.JSONDecodeError: if the code block parsed from the reply is not valid JSON.
            KeyError: if the reply lacks the "thoughts" or "state" key.
        """
        user_requirement = self.get_memories()[0].content
        context = self.working_memory.get()

        if not context:
            # just started the run, we need action certainly
            self.working_memory.add(self.get_memories()[0])  # add user requirement to working memory
            self._set_state(0)
            return True

        prompt = REACT_THINK_PROMPT.format(user_requirement=user_requirement, context=context)
        rsp = await self.llm.aask(prompt)
        rsp_dict = json.loads(CodeParser.parse_code(block=None, text=rsp))
        self.working_memory.add(Message(content=rsp_dict["thoughts"], role="assistant"))
        need_action = rsp_dict["state"]
        # State 0 selects the single registered action; -1 is used when no further action
        # is wanted (presumably "no todo" in Role._set_state — confirm).
        self._set_state(0 if need_action else -1)

        return need_action

    async def _act(self) -> Message:
        """Useful in 'react' mode. Return a Message conforming to Role._act interface."""
        code, _, _ = await self._write_and_exec_code()
        return Message(content=code, role="assistant", cause_by=WriteAnalysisCode)

    async def _plan_and_act(self) -> Message:
        """Run the base-class plan-and-act loop, then shut down the code executor.

        The executor is terminated in a ``finally`` clause so it is released even when
        the plan aborts with an exception (or the run is cancelled).
        """
        try:
            return await super()._plan_and_act()
        finally:
            await self.execute_code.terminate()

    async def _act_on_task(self, current_task: Task) -> TaskResult:
        """Useful in 'plan_and_act' mode. Wrap the output in a TaskResult for review and confirmation."""
        code, result, is_success = await self._write_and_exec_code()
        return TaskResult(code=code, result=result, is_success=is_success)

    async def _write_and_exec_code(self, max_retry: int = 3):
        """Write code with the LLM and execute it, retrying on failure.

        Each attempt appends the generated code and its execution result to working
        memory, so later attempts can see earlier failures.

        Args:
            max_retry: Maximum number of write/execute attempts.

        Returns:
            A ``(code, result, success)`` tuple for the last attempt. ``code`` and
            ``result`` are empty strings if no attempt was made (``max_retry <= 0``).
        """
        counter = 0
        success = False
        # Defined up front so the final return is valid even if the loop body never runs.
        code, result = "", ""

        # plan info
        plan_status = self.planner.get_plan_status() if self.use_plan else ""

        # tool info
        if self.tools:
            memories = self.working_memory.get()
            context = memories[-1].content if memories else ""  # thoughts from _think stage in 'react' mode
            plan = self.planner.plan if self.use_plan else None
            tool_info = await self.tool_recommender.get_recommended_tool_info(context=context, plan=plan)
        else:
            tool_info = ""

        # data info
        await self._check_data()

        while not success and counter < max_retry:
            ### write code ###
            code, cause_by = await self._write_code(counter, plan_status, tool_info)

            self.working_memory.add(Message(content=code, role="assistant", cause_by=cause_by))

            ### execute code ###
            result, success = await self.execute_code.run(code)
            print(result)

            self.working_memory.add(Message(content=result, role="user", cause_by=ExecuteNbCode))

            ### process execution result ###
            counter += 1

        if not success:
            # The retry budget is exhausted. The counter is deliberately NOT reset here:
            # without a human review step gating the reset, doing so would make the loop
            # above retry forever on persistently failing code.
            # TODO: if interactive review is re-enabled, reset the counter only when the
            # reviewer explicitly asks for a redo.
            logger.info("coding failed!")

        return code, result, success

    async def _write_code(
        self,
        counter: int,
        plan_status: str = "",
        tool_info: str = "",
    ):
        """Ask the current todo action to write code for the user requirement.

        Args:
            counter: Zero-based attempt index; reflection is only considered when it is > 0.
            plan_status: Plan status text passed through to the action.
            tool_info: Recommended-tool description passed through to the action.

        Returns:
            A ``(code, todo)`` tuple: the generated code and the action that produced it.
        """
        todo = self.rc.todo  # todo is WriteAnalysisCode
        logger.info(f"ready to {todo.name}")
        use_reflection = counter > 0 and self.use_reflection  # only use reflection after the first trial

        user_requirement = self.get_memories()[0].content

        code = await todo.run(
            user_requirement=user_requirement,
            plan_status=plan_status,
            tool_info=tool_info,
            working_memory=self.working_memory.get(),
            use_reflection=use_reflection,
        )

        return code, todo

    async def _check_data(self):
        """Add a summary of the current data to working memory when it is relevant.

        Only runs in plan mode, once at least one task has finished, and when the
        current task is a data-preprocess, feature-engineering or model-train task.
        The summary is produced by executing the code generated by ``CheckData``;
        nothing is recorded if that code is empty or its execution fails.
        """
        if (
            not self.use_plan
            or not self.planner.plan.get_finished_tasks()
            or self.planner.plan.current_task.task_type
            not in [
                TaskType.DATA_PREPROCESS.type_name,
                TaskType.FEATURE_ENGINEERING.type_name,
                TaskType.MODEL_TRAIN.type_name,
            ]
        ):
            return
        logger.info("Check updated data")
        code = await CheckData().run(self.planner.plan)
        if not code.strip():
            return
        result, success = await self.execute_code.run(code)
        if success:
            print(result)
            data_info = DATA_INFO.format(info=result)
            self.working_memory.add(Message(content=data_info, role="user", cause_by=CheckData))


In [11]:
# Page to scrape and the section of interest ("快讯" = "news flash").
url = "https://pitchhub.36kr.com/financing-flash"
domain = "快讯"

# Requirement given to the agent (in Chinese): scrape the page's visible text, extract only
# the '{domain}' content (ignoring navigation), categorise it and save it as a markdown
# table named {domain}.md.
prompt = f"""抓取{url}中'{domain}'的内容，并整理成markdown存档。

大概的流程：
- 使用工具抓取网页中的可见文本
- 提取网页文本中'{domain}'相关的内容。注意网页中可能包含导航，只需要抽取'{domain}'的具体内容
- 对抽取结果进行归类，并保存成markdown表格: {domain}.md
"""

# NOTE(review): `use_tools` is not a field declared on CodeInterpreter in this notebook;
# presumably it is accepted as an extra keyword by the base class and has no effect — confirm and drop.
ci = CodeInterpreter(use_tools=True, tools=["scrape_web_playwright"])
asyncio.run(ci.run(prompt))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.327 seconds.
Prefix dict has been built successfully.


To achieve the goal of scraping the '快讯' content from the provided webpage and organizing it into a Markdown file, the following plan is proposed:

```json
[
    {
        "task_id": "1",
        "dependent_task_ids": [],
        "instruction": "Scrape visible text from the webpage",
        "task_type": "web scraping"
    },
    {
        "task_id": "2",
        "dependent_task_ids": ["1"],
        "instruction": "Extract content related to '快讯'",
        "task_type": "data preprocessing"
    },
    {
        "task_id": "3",
        "dependent_task_ids": ["2"],
        "instruction": "Categorize and format the extracted content into a Markdown table",
        "task_type": "other"
    },
    {
        "task_id": "4",
        "dependent_task_ids": ["3"],
        "instruction": "Save the formatted content into a file named '快讯.md'",
        "task_type": "other"
    }
]
```

This plan outlines a sequence of tasks starting with web scraping, followed by data preprocessing to extract releva

2024-03-17 09:10:03.164 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.010 | Max budget: $10.000 | Current cost: $0.010, prompt_tokens: 474, completion_tokens: 255
2024-03-17 09:10:03.166 | INFO     | metagpt.roles.role:_plan_and_act:492 - ready to take on task task_id='1' dependent_task_ids=[] instruction='Scrape visible text from the webpage' task_type='web scraping' code='' result='' is_success=False is_finished=False
2024-03-17 09:10:03.167 | INFO     | metagpt.tools.tool_recommend:recall_tools:195 - Recalled tools: 
['scrape_web_playwright']; Scores: [-4.028245058449738]


 it to a file.


2024-03-17 09:10:04.315 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.003 | Max budget: $10.000 | Current cost: $0.003, prompt_tokens: 172, completion_tokens: 13
2024-03-17 09:10:04.316 | INFO     | metagpt.tools.tool_recommend:recommend_tools:101 - Recommended tools: 
['scrape_web_playwright']
2024-03-17 09:10:04.317 | INFO     | __main__:_write_code:131 - ready to WriteAnalysisCode


```json
["scrape_web_playwright"]
```
Given the constraints and the task at hand, we will first need to scrape the visible text from the webpage using the provided `scrape_web_playwright` tool. Here's how you can do it:

```python
# Import the scraping tool
from metagpt.tools.libs.web_scraping import scrape_web_playwright

# URL to be scraped
url = 'https://pitchhub.36kr.com/financing-flash'

# Async function call to scrape the webpage
async def fetch_page_content(url):
    return await scrape_web_playwright(url)

# Execute the async function within a Jupyter notebook environment
import asyncio
page_content = asyncio.get_event_loop().run_until_complete(fetch_page_content(url))

# Extract the inner text from the returned content
inner_text = page_content['inner_text']

# Now you have the inner text of the webpage. The next step would be to filter out the '快讯' content.
# However, since this is beyond the scope of the current task, we will stop here.

# Output the result - this will be us

2024-03-17 09:10:15.912 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.012 | Max budget: $10.000 | Current cost: $0.012, prompt_tokens: 505, completion_tokens: 345


 would be handled in subsequent code blocks and tasks.


2024-03-17 09:10:32.680 | INFO     | __main__:_write_code:131 - ready to WriteAnalysisCode


---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[1], line 13
     11 # Execute the async function within a Jupyter notebook environment
     12 import asyncio
---> 13 page_content = asyncio.get_event_loop().run_until_complete(fetch_page_content(url))
     15 # Extract the inner text from the returned content
     16 inner_text = page_content['inner_text']

File /usr/lib/python3.10/asyncio/base_events.py:625, in BaseEventLoop.run_until_complete(self, future)
    614 """Run until the Future is done.
    615 
    616 If the argument is a coroutine, it is wrapped in a Task.
   (...)
    622 Return the Future's result, or raise its exception.
    623 """
    624 self._check_closed()
--> 625 self._check_running()
    627 new_task = not futures.isfuture(future)
    628 future = tasks.ensure_future(future, loop=self)

File /usr/lib/python3.10/asyncio/base_events.py:584, in BaseEventL

2024-03-17 09:10:45.206 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.031 | Max budget: $10.000 | Current cost: $0.019, prompt_tokens: 1028, completion_tokens: 361


 running event loop in the notebook.


2024-03-17 09:10:45.243 | INFO     | __main__:_write_code:131 - ready to WriteAnalysisCode


---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[2], line 18
     15     print(inner_text)
     17 # Run the main async function
---> 18 asyncio.run(main())

File /usr/lib/python3.10/asyncio/runners.py:33, in run(main, debug)
      9 """Execute the coroutine and return the result.
     10 
     11 This function runs the passed coroutine, taking care of
   (...)
     30     asyncio.run(main())
     31 """
     32 if events._get_running_loop() is not None:
---> 33     raise RuntimeError(
     34         "asyncio.run() cannot be called from a running event loop")
     36 if not coroutines.iscoroutine(main):
     37     raise ValueError("a coroutine was expected, got {!r}".format(main))

RuntimeError: asyncio.run() cannot be called from a running event loop
Apologies for the confusion earlier. When working within a Jupyter notebook, you should avoid using `asyncio.run()` if the ev

2024-03-17 09:10:55.081 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.054 | Max budget: $10.000 | Current cost: $0.023, prompt_tokens: 1355, completion_tokens: 281


 environment without using `asyncio.run()`.


2024-03-17 09:10:55.093 | INFO     | metagpt.roles.role:_plan_and_act:492 - ready to take on task task_id='2' dependent_task_ids=['1'] instruction="Extract content related to '快讯'" task_type='data preprocessing' code='' result='' is_success=False is_finished=False
2024-03-17 09:10:55.095 | INFO     | metagpt.tools.tool_recommend:recall_tools:195 - Recalled tools: 
['scrape_web_playwright']; Scores: [-3.8368201899696874]



```json
["scrape_web_play

2024-03-17 09:10:56.126 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.003 | Max budget: $10.000 | Current cost: $0.003, prompt_tokens: 172, completion_tokens: 13
2024-03-17 09:10:56.127 | INFO     | metagpt.tools.tool_recommend:recommend_tools:101 - Recommended tools: 
['scrape_web_playwright']
2024-03-17 09:10:56.128 | INFO     | __main__:_check_data:158 - Check updated data


wright"]
```
It seems like you want an assistant to provide the code snippet that would print out the column information of a DataFrame variable from the finished tasks. However, the `Finished Tasks` code you provided does not contain any DataFrame variable that we can use directly. 

Given the constraints and the task, I will assume that the `df` variable is defined somewhere in your previous code cells. Below is the code snippet that you can add to a new cell in your Jupyter notebook to print the column information:

```python
# Assuming df is the DataFrame variable from the finished tasks
from metagpt.tools.libs.data_preprocess import get_column_info

# Replace 'df' with the actual variable name of your DataFrame if it's different
column_info = get_column_info(df)
print("column_info:")
print(column_info)
```

Make sure to replace `'df'` with the actual variable name of your DataFrame if it's different. This code snippet assumes that the `get_column_info` function from the `metagpt.t

2024-03-17 09:11:03.472 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.006 | Max budget: $10.000 | Current cost: $0.006, prompt_tokens: 199, completion_tokens: 228


 a DataFrame.


2024-03-17 09:11:03.493 | INFO     | __main__:_write_code:131 - ready to WriteAnalysisCode


Given the constraints and the task at hand, we will use the provided `scrape_web_playwright` function to extract the inner text content of the web page. Then, we will process this text to extract the '快讯' related content and format it into a markdown table.

Below is the Python code to achieve this:

```python
from metagpt.tools.libs.web_scraping import scrape_web_playwright

# Current Task: Extract content related to '快讯'
async def extract_financing_flash(url):
    # Scrape the web page
    result = await scrape_web_playwright(url)
    inner_text = result['inner_text']
    
    # Split the text into lines for processing
    lines = inner_text.split('\n')
    
    # Initialize a list to hold all '快讯' related content
    financing_flash_content = []
    
    # Iterate through lines to find and process '快讯' content
    for line in lines:
        if '快讯' in line:
            # Assuming '快讯' marks the beginning of a new entry
            # Add the line (or processed version of it) to the l

2024-03-17 09:11:18.872 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.068 | Max budget: $10.000 | Current cost: $0.014, prompt_tokens: 515, completion_tokens: 462


 actual content structure and requirements for the final output.


2024-03-17 09:11:18.889 | INFO     | metagpt.roles.role:_plan_and_act:492 - ready to take on task task_id='3' dependent_task_ids=['2'] instruction='Categorize and format the extracted content into a Markdown table' task_type='other' code='' result='' is_success=False is_finished=False
2024-03-17 09:11:18.890 | INFO     | metagpt.tools.tool_recommend:recall_tools:195 - Recalled tools: 
['scrape_web_playwright']; Scores: [-6.984035263675844]



```json
["scrape_web_playwright"]
````

The 'scrape_web_playwright' tool is the most relevant for the 'User Requirement' as it can be used to scrape content from a web page, which can then be formatted into a Markdown table. No other tools are listed in the 'Available Tools', so only

2024-03-17 09:11:21.856 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.004 | Max budget: $10.000 | Current cost: $0.004, prompt_tokens: 177, completion_tokens: 74
2024-03-17 09:11:21.857 | INFO     | metagpt.tools.tool_recommend:recommend_tools:101 - Recommended tools: 
['scrape_web_playwright']
2024-03-17 09:11:21.858 | INFO     | __main__:_write_code:131 - ready to WriteAnalysisCode


 this tool is recommended.
```python
# Continuing from the previous code, we will now categorize and format the extracted content

# Assuming the extract_financing_flash function has been run and we have the financing_flash_content list available

# Current Task: Categorize and format the extracted content into a Markdown table

# Since we need to categorize the content, we will use a dictionary to hold categories
# For the sake of this example, let's assume we categorize based on the first word after '快讯'
# If there's no clear categorization, we'll use 'Other'

def categorize_content(content_list):
    categorized = {}
    for item in content_list:
        # Split the string and get the first word after '快讯'
        parts = item.split('快讯 ')[1:]
        category = parts[0].split()[0] if parts else 'Other'
        if category not in categorized:
            categorized[category] = []
        categorized[category].append(item)
    return categorized

# Function to convert categorized co

2024-03-17 09:11:36.684 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.083 | Max budget: $10.000 | Current cost: $0.015, prompt_tokens: 665, completion_tokens: 433


 also prints the Markdown table to confirm the output.


2024-03-17 09:11:36.713 | INFO     | __main__:_write_code:131 - ready to WriteAnalysisCode


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 35
     32     return markdown_table
     34 # Categorize the content
---> 35 categorized_content = categorize_content(financing_flash_content)
     37 # Create the Markdown table
     38 markdown_table = create_markdown_table(categorized_content)

NameError: name 'financing_flash_content' is not defined
Apologies for the confusion. Since the `financing_flash_content` is not defined in this notebook session, we need to execute the `extract_financing_flash` function first to get the content. However, since we cannot use `asyncio.run()` in a Jupyter notebook, we need to use `await` within an `async` function to call our async `extract_financing_flash` function.

Here's how you can structure your notebook to first extract the content and then categorize and format it:

```python
import nest_asyncio
nest_asyncio.apply()

# 

2024-03-17 09:11:49.344 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.105 | Max budget: $10.000 | Current cost: $0.021, prompt_tokens: 1142, completion_tokens: 376


 and Markdown table creation as previously described.


2024-03-17 09:11:55.982 | INFO     | metagpt.roles.role:_plan_and_act:492 - ready to take on task task_id='4' dependent_task_ids=['3'] instruction="Save the formatted content into a file named '快讯.md'" task_type='other' code='' result='' is_success=False is_finished=False
2024-03-17 09:11:55.984 | INFO     | metagpt.tools.tool_recommend:recall_tools:195 - Recalled tools: 
['scrape_web_playwright']; Scores: [-7.224208080029695]



```json
["scrape_web_playwright"]
```

The 'scrape_web_playwright' tool can be used to scrape and save the content from a web page, which can then be formatted and saved into a file named '快讯.md'. Since the requirement is to save formatted content, this tool can potentially handle the scraping part, and additional steps may be needed for formatting the content as per

2024-03-17 09:11:59.218 | INFO     | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.004 | Max budget: $10.000 | Current cost: $0.004, prompt_tokens: 177, completion_tokens: 88
2024-03-17 09:11:59.219 | INFO     | metagpt.tools.tool_recommend:recommend_tools:101 - Recommended tools: 
['scrape_web_playwright']
2024-03-17 09:11:59.220 | INFO     | __main__:_write_code:131 - ready to WriteAnalysisCode


 the user's requirement.


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f79340fcd30>


KeyboardInterrupt: 