# 倒排索引

In [None]:
# load embedding model
from langchain_openai import OpenAIEmbeddings
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    fused_scores = {}
    for docs in results:
        # Assumes the docs are returned in sorted order of relevance
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            previous_score = fused_scores[doc_str]
            fused_scores[doc_str] += 1 / (rank + k) 
            #倒排排名融合核心: 更新文档的融合得分。对于每个文档，按其排名来调整分数。越排前的文档（rank 较小）会得到越高的分数，评分根据排名的倒数来递减（即 1 / (rank + k)）。

    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]
    # 重新排序并构建结果列表。首先，按文档的融合得分排序（从高到低），然后将每个文档从字符串形式反序列化回原始文档对象。
    return reranked_results

# Imports needed by this setup step.
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# Embedding model; no explicit arguments are passed
# (configuration presumably comes from the environment -- confirm).
embeddings = OpenAIEmbeddings()

# Load the CVE CSV file, then split the loaded documents into chunks
# (chunk_size=500, no overlap between chunks).
loader = CSVLoader("../data/cve.csv")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
documents = text_splitter.split_documents(loader.load())

# Build a FAISS vector store from the chunks and expose it as a retriever.
vectorstore = FAISS.from_documents(documents, embeddings)
retriever = vectorstore.as_retriever()

# Chat model used by the chains below; no explicit arguments are passed.
from langchain_openai import ChatOpenAI

llm = ChatOpenAI()

# Pull the query-generation prompt through the LangSmith client; the language
# model then produces related search queries from it.
from langchain_core.output_parsers import StrOutputParser
from langsmith import Client
client = Client()
prompt = client.pull_prompt("langchain-ai/rag-fusion-query-generation")

# Query generation: the model output is split into one query per line.
generate_queries = (
    prompt
    | llm
    | StrOutputParser()
    # Blank lines (and surrounding whitespace) are dropped so that no empty
    # query is ever sent to the retriever.
    | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
)

# Reciprocal Rank Fusion combines the ranked document lists returned for the
# individual queries: it computes one fused score per document and returns the
# documents sorted by that score. Merging several result lists this way reduces
# the bias of any single query and improves relevance and diversity.

# Full retrieval chain: generate queries, run the retriever once per query
# (retriever.map()), then re-rank the combined results with
# reciprocal_rank_fusion.
chain = generate_queries | retriever.map() | reciprocal_rank_fusion

# Check the input schema of the chain.
chain.input_schema.schema()

# Run the chain to get the fused, re-ranked results for a sample question.
chain.invoke("what is CVE-2013-3900")

from langchain.schema.runnable import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

# Answer prompt: the model must rely on the supplied context only and say so
# when the context does not contain the answer.
template = """Answer the question based only on the following context.
If you don't find the answer in the context, just say that you don't know.

Context: {context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# The input string goes to both branches: `chain` turns it into the fused
# retrieval results ("context"), RunnablePassthrough forwards it unchanged
# ("question"). The filled prompt is sent to the model and parsed to a string.
rag_fusion_chain = (
    {"context": chain, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser()
)

rag_fusion_chain.invoke("what is CVE-2013-3900")

In [None]:
# load embedding model
from langchain_openai import OpenAIEmbeddings
# Embedding model; no explicit arguments are passed. The call is now closed
# properly -- the missing ")" made the whole cell a syntax error.
embeddings = OpenAIEmbeddings()

# Load the CVE CSV file.
from langchain.document_loaders import CSVLoader
loader = CSVLoader("../data/cve.csv")
documents = loader.load()

# Split the loaded documents into chunks (chunk_size=500, no overlap).
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
documents = text_splitter.split_documents(documents)

# Build a FAISS vector store from the chunks.
from langchain.vectorstores import FAISS
vectorstore = FAISS.from_documents(documents, embeddings)

# Expose the vector store as a retriever.
retriever = vectorstore.as_retriever()

# Keyword (BM25) retriever over the same chunks, limited to 3 results.
from langchain.retrievers import BM25Retriever
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k = 3

# Test the keyword retriever. `invoke` replaces the deprecated
# `get_relevant_documents` call.
keyword_retriever.invoke("tell me about CVE-2013-3900")

# Ensemble retriever that combines the vector-store retriever with the BM25
# retriever.
from langchain.retrievers import EnsembleRetriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[retriever, keyword_retriever],
    weights=[0.5, 0.5],  # vector retrieval and BM25 retrieval each weighted 0.5
)

# Test the ensemble retriever.
ensemble_retriever.invoke("tell me about CVE-2013-3900")

# Chat model used to answer; no explicit arguments are passed.
from langchain_openai import ChatOpenAI

llm = ChatOpenAI()

# Imports for the prompt / chain plumbing.
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Answer prompt. The literal previously opened with four quotes, which put a
# stray '"' character at the very start of the prompt text; it now opens with
# a plain triple quote.
template = """
You are a helpful assistant that answers questions based on the following context.
If you don't find the answer in the context, just say that you don't know.
Context: {context}

Question: {input}

Answer:

"""
prompt = ChatPromptTemplate.from_template(template)

# RAG pipeline: the input string is sent to the ensemble retriever to build
# "context" and forwarded unchanged as "input"; the filled prompt goes to the
# model and the reply is parsed to a plain string.
rag_chain = (
    {"context": ensemble_retriever,  "input": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Ask a sample question; the trailing bare name displays the answer in the
# notebook.
response = rag_chain.invoke('what is CVE-2013-3900')
response

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langsmith import Client
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

# 1. Set up the LangSmith client and pull the query-generation prompt.
client = Client()
prompt = client.pull_prompt("langchain-ai/rag-fusion-query-generation")

# Query-generation chain: the model output is split into one query per line.
# NOTE(review): `llm` is not created in this cell; it is expected to exist
# already from an earlier cell.
generate_queries = (
    prompt
    | llm
    | StrOutputParser()
    # Blank lines (and surrounding whitespace) are dropped so that no empty
    # query is ever sent to the retriever.
    | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
)

# 2. 创建倒排排名融合（Reciprocal Rank Fusion）算法
def reciprocal_rank_fusion(results: list[list], k=60):
    fused_scores = {}
    for docs in results:
        # Assumes the docs are returned in sorted order of relevance
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            previous_score = fused_scores[doc_str]
            fused_scores[doc_str] += 1 / (rank + k) 
            #倒排排名融合核心: 更新文档的融合得分。对于每个文档，按其排名来调整分数。越排前的文档（rank 较小）会得到越高的分数，评分根据排名的倒数来递减（即 1 / (rank + k)）。

    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]
    # 重新排序并构建结果列表。首先，按文档的融合得分排序（从高到低），然后将每个文档从字符串形式反序列化回原始文档对象。
    return reranked_results

# 3. Build the individual retrievers and combine them into one ensemble.
#    `documents` and `embeddings` are expected to exist from an earlier cell.
keyword_retriever = BM25Retriever.from_documents(documents)
vectorstore = FAISS.from_documents(documents, embeddings)
retriever = vectorstore.as_retriever()
ensemble_retriever = EnsembleRetriever(
    retrievers=[retriever, keyword_retriever],
    weights=[0.5, 0.5],
)

# 4. Query chain with re-ranking: generate queries, run the ensemble retriever
#    once per query, then fuse the result lists with reciprocal_rank_fusion.
chain = generate_queries | ensemble_retriever.map() | reciprocal_rank_fusion

# 5. Answer prompt: the model must rely on the supplied context only and say
#    so when the context does not contain the answer.
template = """Answer the question based only on the following context.
If you don't find the answer in the context, just say that you don't know.

Context: {context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Final pipeline joining query generation, retrieval and Reciprocal Rank
# Fusion: `chain` supplies "context", the input string is passed through
# unchanged as "question", and the model reply is parsed to a string.
rag_fusion_chain = (
    {"context": chain, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser()
)

# 6. Try the pipeline on a sample question and print the answer.
response = rag_fusion_chain.invoke("what is CVE-2013-3900")
print(response)
