In [1]:

from llama_index.core.schema import Document 
from pprint import pprint 
doc = Document(
    text="RAG 是一种常见的大模型应用范式，它通过检索—排序—生成的方式生成文本。",
    metadata={'title':'RAG 模型介绍','author':'llama-index'}
) 
pprint(doc.dict())

{'audio_resource': None,
 'class_name': 'Document',
 'embedding': None,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': '30cb4d9e-7b36-4b1f-9a58-809b9271404b',
 'image_resource': None,
 'metadata': {'author': 'llama-index', 'title': 'RAG 模型介绍'},
 'metadata_separator': '\n',
 'metadata_template': '{key}: {value}',
 'relationships': {},
 'text': 'RAG 是一种常见的大模型应用范式，它通过检索—排序—生成的方式生成文本。',
 'text_resource': {'embeddings': None,
                   'mimetype': None,
                   'path': None,
                   'text': 'RAG 是一种常见的大模型应用范式，它通过检索—排序—生成的方式生成文本。',
                   'url': None},
 'text_template': '{metadata_str}\n\n{content}',
 'video_resource': None}


## 元数据

In [9]:
# 默认元数据
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(input_dir="./data")
docs = reader.load_data()
doc = docs[0]
# 输出所有内容
print("文本内容 (text):\n", doc.text)  # 输出完整文本
print("\n元数据 (metadata):\n", doc.metadata)  # 输出元数据字典
print("\n文档 ID (doc_id):\n", doc.doc_id)  # 输出唯一标识
print("\n关系 (relationships):\n", doc.relationships) 

文本内容 (text):
 AI Agents vs. Agentic AI: A Conceptual
Taxonomy, Applications and Challenges
Ranjan Sapkota∗‡, Konstantinos I. Roumeliotis †, Manoj Karkee ∗‡
∗Cornell University, Department of Biological and Environmental Engineering, USA
†University of the Peloponnese, Department of Informatics and Telecommunications, Tripoli, Greece
‡Corresponding authors: rs2672@cornell.edu, mk2684@cornell.edu
Abstract—This review critically distinguishes between AI
Agents and Agentic AI, offering a structured, conceptual tax-
onomy, application mapping, and analysis of opportunities and
challenges to clarify their divergent design philosophies and
capabilities. We begin by outlining the search strategy and
foundational definitions, characterizing AI Agents as modular
systems driven and enabled by LLMs and LIMs for task-
specific automation. Generative AI is positioned as a precursor
providing the foundation, with AI agents advancing through tool
integration, prompt engineering, and reasoning enhancem

In [2]:
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

doc4 = Document(
    text="百度是一家中国的搜索引擎公司。",
    metadata={
        "file_name": "test.txt",
        "category": "technology",
        "author": "random person",
    },
    excluded_llm_metadata_keys=["file_name"],
    excluded_embed_metadata_keys=["file_name", "author"],
    metadata_separator=" | ",
    metadata_template="{key}=>{value}",
    text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
)

print("\n全部元数据:\n", doc4.get_content(metadata_mode=MetadataMode.ALL))
print("\n嵌入模型看到的:\n", doc4.get_content(metadata_mode=MetadataMode.EMBED))
print("\n大模型看到的:\n", doc4.get_content(metadata_mode=MetadataMode.LLM))
print("\n没有元数据:\n", doc4.get_content(metadata_mode=MetadataMode.NONE))



全部元数据:
 Metadata: file_name=>test.txt | category=>technology | author=>random person
-----
Content: 百度是一家中国的搜索引擎公司。

嵌入模型看到的:
 Metadata: category=>technology
-----
Content: 百度是一家中国的搜索引擎公司。

大模型看到的:
 Metadata: category=>technology | author=>random person
-----
Content: 百度是一家中国的搜索引擎公司。

没有元数据:
 百度是一家中国的搜索引擎公司。


## 标题提取

In [2]:
# 步骤1：导入模块
from llama_index.core.extractors import TitleExtractor
from llama_index.core.extractors.metadata_extractors import SummaryExtractor
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama

Settings.llm = Ollama(model = "qwen2.5:7b")


documents = SimpleDirectoryReader("data").load_data()

# 步骤2：初始化组件
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128)  # 文本分块
title_extractor = TitleExtractor(nodes=10)  # 标题抽取器

# 步骤3：构建处理管道
pipeline = IngestionPipeline(
    transformations=[text_splitter, title_extractor]
)

# 步骤4：运行元数据抽取
nodes = pipeline.run(documents=documents, show_progress=False)
pprint(nodes)

100%|██████████| 2/2 [00:00<00:00,  2.34it/s]
100%|██████████| 2/2 [00:01<00:00,  1.30it/s]
100%|██████████| 4/4 [00:01<00:00,  3.05it/s]
100%|██████████| 1/1 [00:00<00:00,  2.15it/s]
100%|██████████| 4/4 [00:01<00:00,  2.70it/s]
100%|██████████| 3/3 [00:00<00:00,  3.52it/s]
100%|██████████| 4/4 [00:01<00:00,  2.87it/s]
100%|██████████| 4/4 [00:01<00:00,  2.93it/s]
100%|██████████| 4/4 [00:01<00:00,  3.64it/s]
100%|██████████| 4/4 [00:01<00:00,  3.87it/s]
100%|██████████| 3/3 [00:01<00:00,  1.76it/s]
100%|██████████| 4/4 [00:01<00:00,  3.59it/s]
100%|██████████| 4/4 [00:01<00:00,  3.22it/s]
100%|██████████| 2/2 [00:00<00:00,  3.16it/s]
100%|██████████| 7/7 [00:03<00:00,  1.81it/s]
100%|██████████| 9/9 [00:07<00:00,  1.24it/s]
100%|██████████| 9/9 [00:04<00:00,  1.95it/s]
100%|██████████| 10/10 [00:07<00:00,  1.29it/s]
100%|██████████| 9/9 [00:06<00:00,  1.40it/s]
100%|██████████| 9/9 [00:06<00:00,  1.42it/s]


NameError: name 'pprint' is not defined

## 总结提取

In [1]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.extractors import SummaryExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama

# 配置LLM（使用本地Ollama模型）
Settings.llm = Ollama(model="qwen2.5:7b")

# 1. 加载文档
documents = SimpleDirectoryReader("data").load_data()

# 2. 初始化处理组件
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128)
summary_extractor = SummaryExtractor(
    summaries=["self", "prev", "next"],  # 提取当前节点及相邻节点摘要
    prompt_template="请用中文总结以下内容的关键信息:\n{context_str}\n摘要:"
)

# 3. 构建处理管道
pipeline = IngestionPipeline(
    transformations=[
        text_splitter,      # 文本分块
        summary_extractor   # 摘要提取
    ]
)

# 4. 运行处理流程
nodes = pipeline.run(documents=documents, show_progress=False)

# 5. 查看结果示例
for i, node in enumerate(nodes[:3]):  # 打印前3个节点的摘要
    print(f"\n=== 节点 {i} ===")
    print("内容片段:", node.text[:100] + "...")
    print("当前摘要:", node.metadata.get("section_summary", "无"))
    if "prev_section_summary" in node.metadata:
        print("前节点摘要:", node.metadata["prev_section_summary"])
    if "next_section_summary" in node.metadata:
        print("后节点摘要:", node.metadata["next_section_summary"])


100%|██████████| 150/150 [05:59<00:00,  2.40s/it]


=== 节点 0 ===
内容片段: AI Agents vs. Agentic AI: A Conceptual
Taxonomy, Applications and Challenges
Ranjan Sapkota∗‡, Konst...
当前摘要: 该文档《AI代理与有能人工智能：一种概念分类、应用和挑战》由Ranjan Sapkota、Konstantinos I. Roumeliotis 和Manoj Karkee合著。文章详细区分了AI代理和有能人工智能（Agentic AI），并提供了一个结构化的概念分类，包括应用映射和机会与挑战的分析。

文档的关键信息总结如下：
1. **作者及机构**：Ranjan Sapkota、Konstantinos I. Roumeliotis 和Manoj Karkee分别来自康奈尔大学和希腊伯罗奔尼撒大学。
2. **内容概述**：文章从搜索策略和基础定义开始，区分了AI代理（作为模块化系统，由LLM和LIM驱动以实现特定任务自动化）与有能人工智能（涉及多智能体协作、动态任务分解、持久记忆和协调自治）。通过时间线评估架构演变、操作机制、交互方式及自主程度，对比分析两者。此外还探讨了各自面临的独特挑战如幻觉、脆弱性、涌现行为及协调失败，并提出相应解决方案。
3. **应用领域**：AI代理的应用涵盖客户支持、排程和数据摘要；而有能人工智能则应用于研究自动化、机器人协同工作以及医疗决策支持。
4. **关键字**：AI代理、有能人工智能、自主性、推理、情境感知、多智能体系统、概念分类。

文档旨在为开发稳健的、可扩展且可解释的人工智能驱动系统的路线图提供指导。
后节点摘要: 这份文件《AI代理 vs. 生成性AI：概念分类、应用与挑战》主要对比了AI代理和生成性AI，并探讨了它们在研究自动化、机器人协调和医疗决策支持等领域的具体应用。文章还详细分析了每个范式中特有的挑战，如幻觉、脆弱性、涌现行为以及协调失败，并提出了相应的解决方案，例如ReAct循环、检索增强生成（RAG）、自动化协调层和因果建模。

此外，文件通过全球Google搜索趋势图展示了自2022年11月ChatGPT推出以来，“AI代理”和“生成性AI”的兴趣逐渐增加。文章基于早期的研究，提出了如何利用社会行动建模来设计具备人类智能特性的AI代理系统的方




## 读取文件

In [9]:
from llama_index.readers.file import ImageReader

#图片阅读器
image_reader = ImageReader(keep_image=True) 
# image_reader = ImageReader() 

reader = SimpleDirectoryReader(
    input_dir="imgs/",
    file_extractor={".png": image_reader}
)

input_files = reader.load_data()

print(input_files[0].image)

/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAD6AosDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigArnP+Evik1XUNPtNK1G7ksJFineIRBQxUMMbnBPBHaujrgvDP8AyOXjX/sIRf8AohKAN/8A4SO4/wChd1X84P8A47R/wkdx/wBC7qv5wf8Ax2rlFAFP/hI7j/oXdV/OD/47R/wkdx/0Luq/nB/8dq5RQBT/AOEjuP8A