已连接到 .env (Python 3.10.16)


In [1]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings

# Use locally deployed Ollama models: "llama3.1" as the LLM and "bge-m3" for embeddings.
llm = Ollama(model="llama3.1", request_timeout=60.0)
embed_model = OllamaEmbedding(model_name="bge-m3")

# Register both models on the global LlamaIndex Settings object.
Settings.llm = llm
Settings.embed_model = embed_model

In [2]:
# Loading data
#
# Data ingestion usually consists of three main stages:
#   1. Load the data
#   2. Transform the data
#   3. Index and store the data
#
# This cell covers the first stage: loading.

from llama_index.core import SimpleDirectoryReader

# Read every file found under ../data/pdf/ into Document objects.
pdf_reader = SimpleDirectoryReader("../data/pdf/")
documents = pdf_reader.load_data()

print(documents)

[Document(id_='e32c6556-cc48-4138-b1bf-c54dece576d7', embedding=None, metadata={'page_label': '1', 'file_name': '4f63296c.pdf', 'file_path': '/Users/wangzhongjie/workspace/github-projects/llamaindex-learning/notebooks/../data/pdf/4f63296c.pdf', 'file_type': 'application/pdf', 'file_size': 427665, 'creation_date': '2025-03-22', 'last_modified_date': '2025-03-22'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='demo \n01\ue088\nWelcome to the digital twin demonstration presentation of SoftBank Data \nCenter.\nFirstly, we begin with the case demonstration session. Through the convenient \nscene switching function, you will be able to experie

In [3]:
from llama_index.core import VectorStoreIndex

# Indexes expose a .from_documents() method that accepts a list of Document
# objects and takes care of parsing and chunking them.
vector_index = VectorStoreIndex.from_documents(documents)

# Bind the query engine to a name. Previously the return value of
# as_query_engine() was thrown away, so the call had no usable effect.
query_engine = vector_index.as_query_engine()

print(vector_index)

<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x1158612d0>


In [4]:
# Core components such as the text splitter can be customised: either pass a
# custom `transformations` list through this abstraction layer, or apply the
# component to the global Settings. This cell does both.
from llama_index.core.node_parser import SentenceSplitter

text_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=10)
Settings.node_parser = text_splitter

index = VectorStoreIndex.from_documents(
    documents,
    transformations=[text_splitter],
)

print(index)



<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x115862890>


In [11]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

from pprint import pprint
import json

# Load the markdown files; note this rebinds `documents` from the earlier PDF cell.
documents = SimpleDirectoryReader("../data/md/").load_data()

# VectorStoreIndex is not imported in this cell; it comes from an earlier cell.
vector_index = VectorStoreIndex.from_documents(documents)

# Bind the query engine to a name. Previously the return value of
# as_query_engine() was thrown away, so the call had no usable effect.
query_engine = vector_index.as_query_engine()

# Run the documents through an ingestion pipeline whose only transformation
# is a TokenTextSplitter constructed with its default arguments.
pipeline = IngestionPipeline(transformations=[TokenTextSplitter()])

nodes = pipeline.run(documents=documents)


# Pretty-print nodes in a more readable, per-node layout.
def format_nodes(nodes, max_text_len=100):
    """Print a short, human-readable summary of each node.

    For every node this prints a numbered header, the node ID, a preview of
    the node text, the metadata as indented JSON, and a separator line.

    Args:
        nodes: Iterable of objects exposing ``id_``, ``text`` and ``metadata``
            attributes. ``metadata`` must be JSON-serialisable.
        max_text_len: Maximum number of characters of ``text`` to show. Longer
            text is cut to this length and suffixed with ``...``. Defaults to
            100, which matches the previously hard-coded limit.

    Returns:
        None. All output goes to stdout.
    """
    for i, node in enumerate(nodes, 1):
        print(f"\n=== Node {i} ===")
        print(f"ID: {node.id_}")

        # Only append the ellipsis when something was actually cut off.
        text = node.text
        if len(text) > max_text_len:
            text = f"{text[:max_text_len]}..."
        print(f"Text: {text}")

        # ensure_ascii=False keeps non-ASCII metadata values readable.
        print(f"Metadata: {json.dumps(node.metadata, indent=2, ensure_ascii=False)}")
        print("=" * 50)


# Print the summary for every node returned by pipeline.run(...) above.
format_nodes(nodes)


=== Node 1 ===
ID: 2f7f5af2-9f94-46c9-bacb-ca13391bc1ab
Text: ## Python学习资源汇总

最近有很多小伙伴在找 Python 的相关学习资源，给大家做一个汇总吧，大家就不需要到处打听了，而且网上的资源良莠不齐，给大家整理一些优质的资源，让大家少走弯路。温馨...
Metadata: {
  "file_path": "/Users/wangzhongjie/workspace/github-projects/llamaindex-learning/notebooks/../data/md/python.md",
  "file_name": "python.md",
  "file_size": 9362,
  "creation_date": "2025-03-22",
  "last_modified_date": "2025-03-22"
}

=== Node 2 ===
ID: 4d28d716-ee65-441b-ad70-25e93c38055e
Text: [《PyTorch: Deep Learning and Artificial Intelligence》](https://www.udemy.com/course/pytorch-deep-lea...
Metadata: {
  "file_path": "/Users/wangzhongjie/workspace/github-projects/llamaindex-learning/notebooks/../data/md/python.md",
  "file_name": "python.md",
  "file_size": 9362,
  "creation_date": "2025-03-22",
  "last_modified_date": "2025-03-22"
}

=== Node 3 ===
ID: 98ec21d4-9539-4245-8d4e-9192a820036f
Text: - 在线学习平台
15. [DeepLearning.ai](https://www.deeplearning.ai/) - 吴恩达（Andrew Ng）老师创办的深度学习教育平台
16. [力扣](...
Met