In [2]:
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.ui import Console
from autogen_core.memory import ListMemory, MemoryContent, MemoryMimeType
from autogen_ext.models.openai import OpenAIChatCompletionClient

# Initialize user memory
user_memory = ListMemory()

# Add user preferences to memory
await user_memory.add(MemoryContent(content="The weather should be in metric units", mime_type=MemoryMimeType.TEXT))

await user_memory.add(MemoryContent(content="Meal recipe must be vegan", mime_type=MemoryMimeType.TEXT))


async def get_weather(city: str, units: str = "imperial") -> str:
    if units == "imperial":
        return f"The weather in {city} is 73 °F and Sunny."
    elif units == "metric":
        return f"The weather in {city} is 23 °C and Sunny."
    else:
        return f"Sorry, I don't know the weather in {city}."


assistant_agent = AssistantAgent(
    name="assistant_agent",
    model_client=OpenAIChatCompletionClient(
        model="gpt-4o-2024-08-06",
        api_key="sk-E7gOgfTjf0tREnYXEa1767178b7f43499eBdA49389CdD905",
        base_url="https://api2.road2all.com/v1"
    ),
    tools=[get_weather],
    memory=[user_memory],
)

# Run the agent with a task.
stream = assistant_agent.run_stream(task="What is the weather in New York?")
await Console(stream)



---------- user ----------
What is the weather in New York?
---------- assistant_agent ----------
[MemoryContent(content='The weather should be in metric units', mime_type=<MemoryMimeType.TEXT: 'text/plain'>, metadata=None), MemoryContent(content='Meal recipe must be vegan', mime_type=<MemoryMimeType.TEXT: 'text/plain'>, metadata=None)]
---------- assistant_agent ----------
[FunctionCall(id='call_ifK3oL2CLy58qQ0MHhooCSho', arguments='{"city":"New York","units":"metric"}', name='get_weather')]
---------- assistant_agent ----------
[FunctionExecutionResult(content='The weather in New York is 23 °C and Sunny.', name='get_weather', call_id='call_ifK3oL2CLy58qQ0MHhooCSho', is_error=False)]
---------- assistant_agent ----------
The weather in New York is 23 °C and Sunny.


TaskResult(messages=[TextMessage(source='user', models_usage=None, metadata={}, content='What is the weather in New York?', type='TextMessage'), MemoryQueryEvent(source='assistant_agent', models_usage=None, metadata={}, content=[MemoryContent(content='The weather should be in metric units', mime_type=<MemoryMimeType.TEXT: 'text/plain'>, metadata=None), MemoryContent(content='Meal recipe must be vegan', mime_type=<MemoryMimeType.TEXT: 'text/plain'>, metadata=None)], type='MemoryQueryEvent'), ToolCallRequestEvent(source='assistant_agent', models_usage=RequestUsage(prompt_tokens=123, completion_tokens=20), metadata={}, content=[FunctionCall(id='call_ifK3oL2CLy58qQ0MHhooCSho', arguments='{"city":"New York","units":"metric"}', name='get_weather')], type='ToolCallRequestEvent'), ToolCallExecutionEvent(source='assistant_agent', models_usage=None, metadata={}, content=[FunctionExecutionResult(content='The weather in New York is 23 °C and Sunny.', name='get_weather', call_id='call_ifK3oL2CLy58q

1. 验证split文本结果
2. 验证chain.invoke结果
3. 验证map_prompt的输出是否为有效的JSON

In [7]:
import os
import asyncio
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain_openai import ChatOpenAI
from autogen_agentchat.ui import Console
from autogen_core.memory import ListMemory, MemoryContent, MemoryMimeType
from autogen_agentchat.agents import AssistantAgent, UserProxyAgent
from autogen_ext.models.openai import OpenAIChatCompletionClient
import re
import nest_asyncio

nest_asyncio.apply()
load_dotenv()

class ChapterAwareSplitter:
    def __init__(self, chunk_size=2000):
        self.chunk_size = chunk_size
        self.chapter_pattern = re.compile(
            r'(^#+\s.+)|(^第[一二三四五六七八九十]+章\s.+)|(^\d+\.\d+\s.+)', 
            re.MULTILINE
        )

    def split_text(self, text):
        chapters = []
        buffer = ""
        
        for match in self.chapter_pattern.finditer(text):
            start_pos = match.start()
            
            if len(buffer) + start_pos > self.chunk_size and buffer:
                chapters.append(buffer)
                buffer = ""
                
            buffer += text[:start_pos]
            text = text[start_pos:]
            
            chapter_text = match.group()
            if len(buffer) + len(chapter_text) > self.chunk_size and buffer:
                chapters.append(buffer)
                buffer = ""
            buffer += chapter_text
        
        if text:  
            buffer += text
            
        if buffer:
            chapters.append(buffer)
            
        return chapters

class PDFKnowledgeProcessor:
    def __init__(self, pdf_path, map_template, reduce_template):
        self.pdf_path = pdf_path
        self.map_template = map_template
        self.reduce_template = reduce_template
        self.llm = ChatOpenAI(
            temperature=0,
            model="gpt-4o-2024-08-06",
            base_url="https://api2.road2all.com/v1",
            api_key=os.getenv("OPENAI_API_KEY")
        )

    def process_pdf(self):
        from langchain_core.documents import Document
        try:
            loader = PyPDFLoader(self.pdf_path)
            pages = loader.load()
            full_text = "\n".join([page.page_content for page in pages])
        except Exception as e:
            print(f"PDF加载失败: {str(e)}")
            return ""

        splitter = ChapterAwareSplitter()
        chunks = splitter.split_text(full_text)
        print(f"成功分割为 {len(chunks)} 个章节块")

        # 空文档过滤
        valid_chunks = []
        for chunk in chunks:
            if len(chunk.strip()) < 50:  # 过滤空内容
                continue
            if "未定义" in chunk or "待补充" in chunk:  # 过滤占位内容
                continue
            valid_chunks.append(chunk)
        
        if not valid_chunks:
            print("无有效内容需要处理")
            return ""

        documents = [
            Document(
                page_content=chunk,
                metadata={
                    "source": self.pdf_path,
                    "length": len(chunk)
                    }
            ) for chunk in valid_chunks
        ]
        
        #langchain配置
        try:
            #map
            map_prompt = PromptTemplate.from_template(self.map_template)
            map_chain = LLMChain(
                llm=self.llm, 
                prompt=map_prompt,
                verbose=True
            )

            #reduce
            reduce_prompt = PromptTemplate.from_template(self.reduce_template)
            reduce_chain = LLMChain(
                llm=self.llm,
                prompt=reduce_prompt,
                verbose=True
            )
            combine_documents_chain = StuffDocumentsChain(
                llm_chain=reduce_chain,
                document_variable_name="docs"
            )

            # Combines and iteratively reduces the mapped documents
            reduce_documents_chain = ReduceDocumentsChain(
                combine_documents_chain=combine_documents_chain,
                collapse_documents_chain=combine_documents_chain,
                token_max=4000
            )
            
            # Combining documents by mapping a chain over them, then combining results
            map_reduce_chain = MapReduceDocumentsChain(
                llm_chain=map_chain,
                reduce_documents_chain=reduce_documents_chain,
                document_variable_name="docs",
                return_intermediate_steps=False
            )

            # 调用链并捕获异常
            try:
                result = map_reduce_chain.invoke({"input_documents": documents}, return_intermediate_steps=False)
                return result['output_text']
            except Exception as e:
                print(f"链调用失败: {str(e)}")
                return ""
        except Exception as e:
            print(f"摘要生成失败: {str(e)}")
            return ""

async def main():
    PDF_PATH = "/home/www/AgentFlow/agentflow/docs/GalsimToolkit.pdf"
    map_template = """
    请仔细阅读以下文本内容，并回答以下问题（用中文，保持简洁）：
    如果内容中没有涉及以下任何一项，请直接回复“无意义内容，无需总结”：
    1. 技术方法/算法描述
    2. 代码模块/组件的结构说明
    3. 功能实现的关键步骤或流程
    4. 项目整体架构或作用说明

    如果内容有实际技术信息，请参考以下内容总结：
    1. "技术方法": "...",
    2. "模块结构": "...",
    3. "关键步骤": "...",
    4. "作用说明": "..."
    文本内容：
    {docs}
    """

    reduce_template = """
    请根据以下分块摘要，生成项目的技术概述文档。要求：
    1. **技术方法**：总结项目的核心技术、算法或框架。
    2. **架构设计**：描述代码的整体架构和模块间关系。
    3. **功能模块**：列举主要功能模块及其作用。
    4. **实现细节**：说明关键步骤或技术难点的解决方案。
    5. **应用场景**：项目适用的使用场景或目标用户。
    6. **优势特点**：项目的独特优势或创新点。

    请用结构化的方式呈现，分点说明，避免冗余。

    
    分块摘要：
    {docs}
    """

    processor = PDFKnowledgeProcessor(
        pdf_path=PDF_PATH,
        map_template=map_template,
        reduce_template=reduce_template
    )
    knowledge = processor.process_pdf()
    
    if knowledge:
        print("\n生成的知识库内容：\n", knowledge)
    else:
        print("知识库生成失败，请检查日志")
        return

    memory = ListMemory()
    await memory.add(MemoryContent(
        content=knowledge,
        mime_type=MemoryMimeType.TEXT))

    model_client = OpenAIChatCompletionClient(
        model="gpt-4o-2024-08-06",
        base_url="https://api2.road2all.com/v1",
        api_key=os.getenv("OPENAI_API_KEY")
    )

    assistant_agent = AssistantAgent(
        name="project_assistant",
        system_message="你是一个项目助手，回答时请参考以下知识：\n{{知识库}}",
        model_client=model_client,
        memory=[memory]
    )

    stream = assistant_agent.run_stream(task="请根据文档说明项目使用的主要技术方法。")
    await Console(stream)

In [8]:
await main()

成功分割为 22 个章节块


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    请仔细阅读以下文本内容，并回答以下问题（用中文，保持简洁）：
    如果内容中没有涉及以下任何一项，请直接回复“无意义内容，无需总结”：
    1. 技术方法/算法描述
    2. 代码模块/组件的结构说明
    3. 功能实现的关键步骤或流程
    4. 项目整体架构或作用说明

    如果内容有实际技术信息，请参考以下内容总结：
    1. "技术方法": "...",
    2. "模块结构": "...",
    3. "关键步骤": "...",
    4. "作用说明": "..."
    文本内容：
    arXiv:1407.7676v3  [astro-ph.IM]  15 Feb 2015
G AL SIM : The modular galaxy image simulation toolkit
Barnaby Rowea,b,c,∗, Mike Jarvisd,∗, Rachel Mandelbaume,∗, Gary M. Bernsteind, James Boschf, Melanie Simete,
Joshua E. Meyersg, Tomasz Kacprzaka,h, Reiko Nakajimai, Joe Zuntzh, Hironao Miyatakef,j, Jörg P. Dietrichk,l, Robert Armstrongf,
Peter Melchiorm , Mandeep S. S. Gilln
aDepartment of Physics & Astronomy, University College London, Gower Street, London, WC1E 6BT, United Kingdom
bJet Propulsion Laboratory, California Institute of Technology, 4800 Oak Grove Drive, Pasadena, CA 91109, United States of America
cCalifo