In [1]:
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, StorageContext, load_index_from_storage, QueryBundle
from llama_index.core.schema import MetadataMode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.llms.huggingface import (
    HuggingFaceInferenceAPI,
    HuggingFaceLLM,
)

  from .autonotebook import tqdm as notebook_tqdm



In [2]:
# Local directory of the Llama-2-7b-chat-hf checkpoint; the cells below pass it
# as both the model path and the tokenizer path.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
checkpoint_path = "/mnt/resource/public_models/Llama-2-7b-chat-hf"

In [None]:
# Build the custom prompts
from llama_index.core import PromptTemplate

# System prompt (runtime text sent to the model; intentionally left in Chinese).
SYSTEM_PROMPT = """你是一个医疗人工智能助手。"""

# Wrap every query in the Llama-2 chat layout the model was fine-tuned on:
#
#   [INST] <<SYS>>\n{system prompt}\n<</SYS>>\n\n{user message} [/INST]
#
# The space after "[INST]", the newline before "<</SYS>>" and the space before
# "[/INST]" are part of that reference layout, and nothing follows "[/INST]".
query_wrapper_prompt = PromptTemplate(
    "[INST] <<SYS>>\n" + SYSTEM_PROMPT + "\n<</SYS>>\n\n{query_str} [/INST]"
)

# QA prompt: the retrieved context is shown between two dashed rules, followed
# by the instruction, the query and the answer cue. Each list item is one line
# of the final prompt; the two adjacent literals without a comma between them
# are deliberately concatenated into a single line.
qa_prompt_tmpl_str = "\n".join(
    [
        "上下文信息如下。",
        "---------------------",
        "{context_str}",
        "---------------------",
        "请根据上下文信息而不是先验知识来回答以下的查询。"
        "作为一个医疗人工智能助手，你的回答要尽可能严谨。",
        "Query: {query_str}",
        "Answer: ",
    ]
)
# Wrap the QA template string in a PromptTemplate; a later cell installs it as
# the query engine's text_qa_template.
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

# Refine prompt: shows the original query, the current answer and one more
# chunk of context, then asks the model to improve the answer only if needed.
# Every segment ends with "\n" so that the query, the existing answer, the
# dashed delimiters and the context each sit on their own line instead of
# being fused into a single run of text.
refine_prompt_tmpl_str = (
    "原始查询如下：{query_str}\n"
    "我们提供了现有答案：{existing_answer}\n"
    "我们有机会通过下面的更多上下文来完善现有答案（仅在需要时）。\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "考虑到新的上下文，优化原始答案以更好地回答查询。 如果上下文没有用，请返回原始答案。\n"
    "Refined Answer:"
)
# Wrap the refine template string in a PromptTemplate; a later cell installs it
# as the query engine's refine_template.
refine_prompt_tmpl = PromptTemplate(refine_prompt_tmpl_str)

In [3]:
# Set the global tokenizer to the one shipped with the LLM checkpoint.
# `torch_dtype` is a model-weight loading option and means nothing to a
# tokenizer, so it is not passed here.
# NOTE(review): trust_remote_code=True allows Python code stored alongside the
# checkpoint to run at load time. Presumably unnecessary for a stock Llama-2
# checkpoint — confirm and drop it if so.
Settings.tokenizer = AutoTokenizer.from_pretrained(
    checkpoint_path,
    trust_remote_code=True,
)

In [None]:
# Set the global LLM: model weights and tokenizer both come from the same local
# checkpoint; every other HuggingFaceLLM option keeps its default.
Settings.llm = HuggingFaceLLM(
    tokenizer_name=checkpoint_path,
    model_name=checkpoint_path,
)

In [None]:
# Create the local LLM through llama-index's HuggingFace wrapper.
Settings.llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=2048,
    # Greedy decoding. `temperature` is only consulted when do_sample=True, so
    # combining temperature=0.7 with do_sample=False had no effect on the
    # output and only made transformers emit a warning; it is therefore omitted.
    generate_kwargs={"do_sample": False},
    # Wraps each query in the Llama-2 [INST]/<<SYS>> chat layout.
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=checkpoint_path,
    model_name=checkpoint_path,
    device_map="auto",
    model_kwargs={"torch_dtype": "auto"},
)

In [4]:
# Local model API: install the project's MyCustomLLM wrapper as the global LLM.
# This assignment replaces whatever Settings.llm was set to by earlier cells.
# NOTE(review): custom_llm is a project-local module not shown in this
# notebook; its behaviour cannot be verified from here.
from custom_llm import MyCustomLLM
Settings.llm = MyCustomLLM()

In [5]:
# Set the global embedding model, loaded from a local copy of bge-small-en-v1.5.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="/mnt/jinweilin/package/bge-small-en-v1.5"
)

In [6]:
from llama_index.core.callbacks import LlamaDebugHandler, CallbackManager

# Build an event tracer with LlamaDebugHandler to follow the events emitted
# while LlamaIndex runs; print_trace_on_end=True prints each trace when it ends.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])
# Register the manager globally so it applies to components created afterwards.
Settings.callback_manager = callback_manager

In [None]:
# Read the documents from the local "data" directory, then split them and
# build the vector index over them.
documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Persist the document embeddings (the index's storage context) to the
# "doc_emb" directory so they can be reloaded without re-embedding.
index.storage_context.persist(persist_dir='doc_emb')

In [7]:
# Load the previously persisted document embeddings from "doc_emb" and rebuild
# the index object from them.
storage_context = StorageContext.from_defaults(persist_dir="doc_emb")
index = load_index_from_storage(storage_context)

**********
Trace: index_construction
**********


In [None]:
# Alternative retrieval/query setup assembled from explicit components
from llama_index.core import get_response_synthesizer
from llama_index.core.indices.vector_store import VectorIndexRetriever
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response_synthesizers import ResponseMode

# Response synthesizer running in REFINE mode
response_synthesizer = get_response_synthesizer(response_mode=ResponseMode.REFINE)

# Retriever over the index, asking for the top 5 most similar nodes
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)

# Query engine: retriever + similarity post-filter (cutoff 0.6) + synthesizer
query_engine = RetrieverQueryEngine(
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.6)],
    response_synthesizer=response_synthesizer,
    retriever=retriever,
)

In [8]:
# Build a query engine straight from the index using the default settings.
# Rebinding `query_engine` here replaces any engine a previous cell stored
# under that name.
query_engine = index.as_query_engine()

In [None]:
# Replace the query engine's QA and refine prompt templates with the custom ones
query_engine.update_prompts(
    {
        "response_synthesizer:text_qa_template": qa_prompt_tmpl,
        "response_synthesizer:refine_template": refine_prompt_tmpl,
    }
)

In [9]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata

# Expose the current query engine as a single named tool; the name and
# description are the metadata attached to the tool.
individual_query_engine_tools = [
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="electronic_search",
            description="Use this tool when querying electronic components",
        ),
    )
]

In [10]:
# Decompose a question into sub-questions that are answered with the tools.
from llama_index.core.query_engine import SubQuestionQueryEngine

# From here on `query_engine` names the sub-question engine, not the engine
# that was wrapped inside the tool list.
query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=individual_query_engine_tools,
)

In [11]:
# Ask a two-part question through the current query engine and print the answer.
# NOTE(review): the recorded run of this cell ended with
# "JSONDecodeError: Expecting value: line 1 column 1 (char 0)" and the trace
# shows the llm step taking 0.0 seconds — presumably the LLM reply was empty or
# not valid JSON; verify what MyCustomLLM returns.
response = query_engine.query(
    "What are the packaging dimensions of SGM42600? And please give me output current of SGM42600."
)
print(response)

**********
Trace: query
    |_query -> 1.075729 seconds
      |_templating -> 6.4e-05 seconds
      |_llm -> 0.0 seconds
**********


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Print the formatted prompt recorded by the debug handler for the first LLM call.
# NOTE(review): event_pairs[0][1] is presumably the end event of the first
# start/end pair — confirm against LlamaDebugHandler.get_llm_inputs_outputs.
event_pairs = llama_debug.get_llm_inputs_outputs()
print(event_pairs[0][1].payload["formatted_prompt"])

In [None]:
# Retrieve the five chunks most similar to the query and print each one.
#
# The retriever is built directly from the index instead of calling
# query_engine.retrieve(): earlier in the notebook `query_engine` is rebound to
# a SubQuestionQueryEngine, which does not implement retrieve(). Passing
# similarity_top_k=5 also makes the "top five" explicit rather than relying on
# the query engine's default top-k.
contexts = index.as_retriever(similarity_top_k=5).retrieve(
    QueryBundle("What are the packaging dimensions of SGM42600?")
)
print('-'*10 + 'ref' + '-'*10)
for i, context in enumerate(contexts):
    print('*'*10 + f'chunk {i} start' + '*'*10)
    # MetadataMode.LLM: node text together with the metadata exposed to the LLM.
    content = context.node.get_content(metadata_mode=MetadataMode.LLM)
    print(content)
    print('*' * 10 + f'chunk {i} end' + '*' * 10)
print('-'*10 + 'ref' + '-'*10)