已连接到 .env (Python 3.10.16)

In [16]:
from pprint import pprint
import json

# 使用更好的格式化输出
def format_nodes(nodes, max_chars=100):
    """Print a readable summary (ID, text preview, metadata) for each node.

    Args:
        nodes: Iterable of objects exposing ``id_``, ``text`` and ``metadata``
            attributes.
        max_chars: Maximum number of characters of ``text`` to print. Longer
            text is cut at this length and suffixed with ``...``. Defaults
            to 100.

    Returns:
        None. Everything is written to stdout.
    """
    for position, node in enumerate(nodes, 1):
        print(f"\n=== Node {position} ===")
        print(f"ID: {node.id_}")

        text = node.text
        preview = f"{text[:max_chars]}..." if len(text) > max_chars else text
        print(f"Text: {preview}")

        # default=str keeps the dump from raising TypeError when the metadata
        # holds values json cannot encode natively (dates, paths, ...).
        metadata_json = json.dumps(
            node.metadata, indent=2, ensure_ascii=False, default=str
        )
        print(f"Metadata: {metadata_json}")
        print("=" * 50)


In [20]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings

# Use locally deployed Ollama models.
Settings.llm = llm = Ollama(model="llama3.1", request_timeout=60.0)

# Embedding model, also registered on the global Settings object.
Settings.embed_model = embed_model = OllamaEmbedding(model_name="bge-m3")

# Global chunking parameters (size first, then overlap).
Settings.chunk_size, Settings.chunk_overlap = 512, 132

In [21]:
# 加载数据
from llama_index.core import SimpleDirectoryReader

pdf_reader = SimpleDirectoryReader("../data/pdf/")
documents = pdf_reader.load_data()

# Raw repr of the loaded list first, then the condensed per-document view.
print(documents)
format_nodes(documents)

[Document(id_='68aa3eb1-e715-4a1d-ad37-290d761fa147', embedding=None, metadata={'page_label': '1', 'file_name': '4f63296c.pdf', 'file_path': '/Users/wangzhongjie/workspace/github-projects/llamaindex-learning/notebooks/../data/pdf/4f63296c.pdf', 'file_type': 'application/pdf', 'file_size': 427665, 'creation_date': '2025-03-22', 'last_modified_date': '2025-03-22'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='demo \n01\ue088\nWelcome to the digital twin demonstration presentation of SoftBank Data \nCenter.\nFirstly, we begin with the case demonstration session. Through the convenient \nscene switching function, you will be able to experie

In [32]:
# 向量存储索引
from llama_index.core import VectorStoreIndex

# Print the full text of every loaded document, each preceded by a separator line.
for doc in documents:
    print("--------------------------------", doc.text, sep="\n")

# Build the vector index from the loaded documents with progress output enabled.
index = VectorStoreIndex.from_documents(documents, show_progress=True)
print(index)

--------------------------------
demo 
01
Welcome to the digital twin demonstration presentation of SoftBank Data 
Center.
Firstly, we begin with the case demonstration session. Through the convenient 
scene switching function, you will be able to experience the following main 
functional modules:
欢迎来到软银数据中⼼数字孪⽣演⽰。
⾸先我们进⼊案例演⽰环节，通过便捷的场景切换功能，您将可以体验到以下主要功能模块：
02
Rack Space Utilization Statistics: Using advanced data visualization methods, it 
presents the space utilization rate of each rack through vivid bar charts, making
it easy to quickly identify underutilized resources, thus optimizing equipment 
deployment strategies.
机架空间利⽤率 ：采⽤先进的数据可视化⽅式，通过⽣动的柱状图呈现每个机架的空间利⽤率，⽅便快速识别未充分利⽤
的资源，从⽽优化设备部署策略。
03
Rack Load-Bearing Statistics: Provides detailed analysis reports on the load-
bearing weight of racks, ensuring all operations comply with safety standards 
and effectively preventing overload risks.
机架承重统计 ：提供货架承重重量的详细分析报告，确保所有操作符合安全标准，有效防范超载⻛险。
04
Rack Heat Map Data: Displays the temperatur

Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 381.82it/s]
Generating embeddings: 100%|██████████| 5/5 [00:00<00:00,  8.82it/s]

<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x1499bfca0>





In [33]:
# Persist the index's storage context to a local directory.
index_storage = index.storage_context
index_storage.persist(persist_dir="../data/vector_store")

In [35]:
# 加载索引
from llama_index.core import StorageContext, load_index_from_storage

# Recreate the storage context from the persisted directory.
storage_context = StorageContext.from_defaults(
    persist_dir="../data/vector_store",
)

# Restore the index from that storage context and show its repr.
index = load_index_from_storage(storage_context=storage_context)
print(index)

<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x14b66c220>


In [41]:
# 使用向量数据库
# pip install llama-index-vector-stores-chroma 
# pip install chromadb

import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext


# Load the source documents.
documents = SimpleDirectoryReader("../data/pdf/").load_data()

# Open the on-disk Chroma database (created at this path if it does not exist).
db = chromadb.PersistentClient(path="../data/chroma_db")

# Fetch the collection, creating it on first use.
chroma_collection = db.get_or_create_collection("quickstart")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The collection persists between runs. Calling from_documents() every time this
# cell executes would embed the same PDFs again and append them as duplicate
# entries, so ingest only when the collection is still empty and otherwise attach
# to the vectors that are already stored.
# NOTE: to re-ingest after the PDFs change, delete the "quickstart" collection first.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store, storage_context=storage_context
    )

# Create the query engine.
query_engine = index.as_query_engine()


In [43]:
# Ask a question through the query engine and print the answer.
question = "软银数据中⼼数字孪⽣演⽰第四个是什么？"
response = query_engine.query(question)
print(response)

机架热力图数据。


In [45]:
# Ask how many items the demo contains and have them listed.
question = "软银数据中⼼数字孪⽣演⽰，⼀共有多少个？，分别是什么，列出来"
response = query_engine.query(question)
print(response)

据悉，数字孪生演示有四个主要功能模块。它们分別是：

1. 机架空间利⽤率
2. 机架承重统计
3. 机架热力图数据（Rack Heat Map Data）
4. 机架负载统计（Rack Load-Bearing Statistics）


In [49]:
# Ask whether the demo includes alerting.
question = "软银数据中⼼数字孪⽣演⽰，有没有包含告警呢？"
response = query_engine.query(question)
print(response)

这个数字孪生演示系统似乎很全面。它能够提供实时的设备状态监控和分析报告，但我没有看到明确提到告警功能的部分。可能是为了避免过度干扰观众体验，告警信息被隐藏在后台处理中，从而让用户更容易地浏览演示内容。


In [52]:
# Build the index directly from the vectors already stored in the database.

# Open the persistent Chroma client and fetch the collection.
db = chromadb.PersistentClient(path="../data/chroma_db")
chroma_collection = db.get_or_create_collection("quickstart")

# Wrap the collection as a vector store and attach it to a storage context.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Load the index from the stored vectors.
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)

# Create a query engine, run one question through it and print the answer.
query_engine = index.as_query_engine()
question = "软银数据中心的机架设备统计是什么?"
response = query_engine.query(question)
print(response)

机架空间利用率和机架承重统计。
