In [1]:
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, StorageContext, load_index_from_storage, QueryBundle, Document, get_response_synthesizer
from llama_index.core.schema import MetadataMode, IndexNode, TextNode
from llama_index.core.node_parser import SimpleNodeParser 
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.callbacks import LlamaDebugHandler, CallbackManager
from llama_index.core.retrievers import RecursiveRetriever
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.llms.huggingface import (
    HuggingFaceInferenceAPI,
    HuggingFaceLLM,
)
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)
from fastapi.encoders import jsonable_encoder
from custom_llm import MyCustomLLM
import json


  from .autonotebook import tqdm as notebook_tqdm



In [2]:
embedding_dir = "electronic_small_and_big_emb"
checkpoint_path = "/mnt/resource/public_models/Llama-2-7b-chat-hf"

# set tokenizer to match LLM
Settings.tokenizer = AutoTokenizer.from_pretrained(
    checkpoint_path,
    torch_dtype="auto",
    trust_remote_code=True
)

Settings.llm = MyCustomLLM()

# set the embed model
Settings.embed_model = HuggingFaceEmbedding(
    model_name="/mnt/jinweilin/package/bge-small-en-v1.5"
)

# 使用LlamaDebugHandler构建事件回溯器，以追踪LlamaIndex执行过程中发生的事件
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])
Settings.callback_manager = callback_manager

In [3]:
def prepare_node():
    # 读取文档
    with open("../electronic_content_duplication.jsonl","r") as f:
        documents = []
        for line in f.readlines():
            # json: product_name, content
            json_item = json.loads(line)
            document = Document(
                text=json_item['content'],
                metadata={
                    "file_name": json_item['product_name'],
                    "product_name": json_item['product_name']
                }
            )
            documents.append(document)

    # 创建大块
    node_parser = SimpleNodeParser.from_defaults(chunk_size=2048)
    base_nodes = node_parser.get_nodes_from_documents(documents)
    # set node ids to be a constant
    for idx, node in enumerate(base_nodes):
        node.id_ = f"node-{idx}"


    sub_chunk_sizes = [256, 512, 1024]
    sub_node_parsers = [
        SimpleNodeParser.from_defaults(chunk_size=c, chunk_overlap=0) for c in sub_chunk_sizes
    ]

    all_nodes = []
    index = 0
    for base_node in base_nodes:
        for n in sub_node_parsers:
            sub_nodes = n.get_nodes_from_documents([base_node])
            sub_inodes = [
                IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes
            ]
            all_nodes.extend(sub_inodes)
            if index == 0:
                print(base_node)
                print(sub_nodes)
                print(sub_inodes)

        index +=1
        # also add original node to node
        original_node = IndexNode.from_text_node(base_node, base_node.node_id)
        all_nodes.append(original_node)

    all_nodes_dict = {n.node_id: n for n in all_nodes}

    return all_nodes_dict, all_nodes

In [4]:
def save_index(all_nodes, dir):
    # 切分文档，建立索引
    vector_index_chunk = VectorStoreIndex(
        all_nodes
    )

    # 存储文档embedding
    vector_index_chunk.storage_context.persist(persist_dir=dir)

def read_index(dir):
# 读取文档embedding
    storage_context = StorageContext.from_defaults(persist_dir=dir)
    index = load_index_from_storage(storage_context)
    return index

In [5]:
all_nodes_dict, all_nodes = prepare_node()
# save_index(all_nodes, embedding_dir)
vector_index_chunk = read_index(embedding_dir)

Node ID: node-0
Text: SGM8040S-1  550nA, Rail-to-Rail I/O,  High Precision Operational
Amplifier      NOVEMBER 2018 – REV. A SG Micro Corp  www.sg-micro.com
GENERAL DESCRIPTION  The SGM8040S-1 is a single, high precision
operational  amplifier which can operate from 1.4V to 5.5V single
supply, while consuming only 550nA quiescent current.  It is capable
of rail-to-...
[TextNode(id_='fd6472aa-392a-4395-85b3-8816689db269', embedding=None, metadata={'file_name': 'SGM8040S-1', 'product_name': 'SGM8040S-1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='node-0', node_type=<ObjectType.TEXT: '1'>, metadata={'file_name': 'SGM8040S-1', 'product_name': 'SGM8040S-1'}, hash='8f2d0c1e38b736ad1fd3351cd504a9e349e43d3e1f377e88de954415dd3458ef'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='f27df1a0-fedd-440d-915f-6d05dc41f4de', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='6a853eb62a354b43e73fcdb

In [6]:
question = "SGM5200的上电时间是多久?"

# 使用不同的检索和查询方法
filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="product_name", operator=FilterOperator.EQ, value="SGM5200"
        )
    ]
)

import time 

start_time =time.time() 
vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=5, filters=filters)
nodes = vector_retriever_chunk.retrieve(question)
end_time = time.time()
print(end_time - start_time)


for node in nodes:
    print(node)

print("=================================================")

retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=True,
)

nodes = retriever_chunk.retrieve(question)
for node in nodes:
    print(node)

# 构建response synthesizer
response_synthesizer = get_response_synthesizer(
    response_mode=ResponseMode.REFINE
)

**********
Trace: query
    |_retrieve -> 0.71229 seconds
      |_embedding -> 0.590486 seconds
**********
0.7206461429595947
Node ID: 54c93638-2f72-47d8-8e5d-9fb54e99044d
Text: 12  6  CH14  AI  13  7  CH13  AI  14  8  CH12  AI  15  9  CH11
AI  16  10  CH10  AI  17  11  CH9  AI  18  12  CH8  AI  21  13  CH7
AI  22  14  CH6  AI  23  15  CH5  AI  24  16  CH4  AI  25  17  CH3  AI
26  18  CH2  AI  27  19  CH1  AI  28  20  CH0  AI  31  23  nCS  DI
Chip Select. Active low.  32  24  SCLK  DI  Serial Clock Input.  33
25  S...
Score:  0.811

Node ID: bd36ad56-72c5-4516-9679-023208e5efac
Text: 12  6  CH14  AI  13  7  CH13  AI  14  8  CH12  AI  15  9  CH11
AI  16  10  CH10  AI  17  11  CH9  AI  18  12  CH8  AI  21  13  CH7
AI  22  14  CH6  AI  23  15  CH5  AI  24  16  CH4  AI  25  17  CH3  AI
26  18  CH2  AI  27  19  CH1  AI  28  20  CH0  AI  31  23  nCS  DI
Chip Select. Active low.  32  24  SCLK  DI  Serial Clock Input.
Score:  0.809

Node ID: e6eb3b67-f080-4f8b-9fdf-31205e5eefff
Text: 7 -1.65 -

In [12]:
query_engine_chunk = RetrieverQueryEngine(
    # retriever=retriever_chunk
    retriever=vector_retriever_chunk
)

response = query_engine_chunk.query(question)
print(str(response))

**********
Trace: query
    |_query -> 2.379881 seconds
      |_retrieve -> 0.124493 seconds
        |_embedding -> 0.02331 seconds
      |_synthesize -> 2.254097 seconds
        |_templating -> 2e-05 seconds
        |_llm -> 2.246279 seconds
**********
The provided context information does not contain details about the power-up time or any specific information regarding the time it takes for the SGM5200 to initialize after being powered on. Therefore, based on the given data, I cannot answer the query about the SGM5200's power-up time.


In [9]:
with open("node_info.txt","w") as f:
    f.write(json.dumps(jsonable_encoder(all_nodes_dict["node-2507"]))+"\n")
    f.write(json.dumps(jsonable_encoder(all_nodes_dict["node-2502"]))+"\n")
    f.write(json.dumps(jsonable_encoder(all_nodes_dict["node-2501"]))+"\n")