In [1]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chat_models import ChatOpenAI
from typing import List
from typing import Literal, Optional, Tuple
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists into one ranking (Reciprocal Rank Fusion).

    Args:
        results: One ranked list of documents per query. Each document must be
            serializable with ``dumps``; the serialized string is the key used
            to recognise the same document across lists.
        k: Constant added to the rank in the denominator of each RRF term.

    Returns:
        The unique documents, deserialized with ``loads`` and ordered from the
        highest fused score to the lowest. Only documents are returned; the
        scores themselves are discarded.
    """
    score_by_doc = {}

    for ranked_docs in results:
        # ``position`` is 0-based, so the first hit of a list contributes 1 / k.
        for position, document in enumerate(ranked_docs):
            key = dumps(document)
            score_by_doc[key] = score_by_doc.get(key, 0) + 1 / (position + k)

    # sorted() is stable (also with reverse=True), so documents with equal
    # scores keep the order in which they were first encountered.
    ordered_keys = sorted(score_by_doc, key=score_by_doc.get, reverse=True)
    return [loads(key) for key in ordered_keys]

class Soybean_Q_3(BaseModel):
    # Output schema for the query-generation step: three required string
    # fields, one per generated question. The class is handed to
    # PydanticOutputParser, whose format instructions are injected into the
    # prompt, and the three fields are later used as retriever queries.
    # NOTE(review): documented with comments rather than a docstring on
    # purpose -- a class docstring would presumably end up in the schema text
    # sent to the model; confirm before adding one.

    # First generated question; the description text is part of the schema.
    question1: str = Field(
        ..., description="Given contextual information, not prior knowledge. Generate one question based only on the following queries.(Question1)"
    )
    # Second generated question.
    question2: str = Field(
        ..., description="Given contextual information, not prior knowledge. Generate one question based only on the following queries.(Question2)"
    )
    # Third generated question.
    question3: str = Field(
        ..., description="Given contextual information, not prior knowledge. Generate one question based only on the following queries.(Question3)"
    )

# Parser that converts the model reply into a Soybean_Q_3 instance and also
# supplies the format instructions embedded in the system message below.
parser = PydanticOutputParser(pydantic_object=Soybean_Q_3)

import os
from langchain import PromptTemplate

# Prompt: the system message carries the parser's format instructions, the
# human message carries the user's query through the {query} variable.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Answer the user query. Wrap the output in `json` tags\n{format_instructions}",
        ),
        (
            "human",
            """
            You are a helpful assistant that generates multiple search queries based on a single input query. \n
            Generate multiple search queries related to:{query}.
            """
        ),
    ]
).partial(format_instructions=parser.get_format_instructions())

# The API key is a secret and must not live in the notebook: it is read from
# the environment instead. Any key that was ever committed in this file should
# be treated as compromised and rotated.
zhipu_api_key = os.environ.get("ZHIPUAI_API_KEY")
if not zhipu_api_key:
    raise RuntimeError(
        "Set the ZHIPUAI_API_KEY environment variable before running this cell."
    )

llm = ChatOpenAI(
    temperature=0.6,
    model="glm-4-0520",
    openai_api_key=zhipu_api_key,
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/"
)

# The first step binds whatever is passed to invoke() to the prompt's {query}
# variable, so callers should pass the bare query string (not a dict).
rag_chain = (
    {"query": RunnablePassthrough()}
    | prompt
    | llm
    | parser
)

  warn_deprecated(


In [2]:
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import pandas as pd
import json
from tqdm import tqdm, trange
from retiever_eval_list import get_result_retrieva, get_retriever_res_list
import os
import shutil
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Evaluation sheet with the questions and their reference data.
df = pd.read_excel('soybean_q_gt_609.xlsx', sheet_name='Sheet1')

# Per-experiment settings: edit these when running a different configuration.
folder_path = 'Result/a_rag_fusion/'
# Both output files are derived from folder_path so they always land in the
# directory that is (re)created from folder_path before they are written.
retriever_filename = os.path.join(folder_path, "retriever_result.json")
save_info_result_filename = os.path.join(folder_path, "save_info_result.json")
top_k = 5    # passed to the retriever as k and to the result helpers
s_index = 3  # only the first s_index rows of the sheet are processed

# Embedding model used by the vector store (CPU, normalized embeddings).
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
model_name = '/mnt/workspace/.cache/modelscope/hub/maple77/zpoint_large_embedding_zh'
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Persisted Chroma collection queried by similarity search, top_k hits per call.
vectorstore = Chroma(persist_directory="soybean_db2", embedding_function=hf)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": top_k}
)

# Cross-encoder reranker assets; both are loaded from the same local directory.
# NOTE(review): no cell visible here uses them -- confirm they are still needed.
reranker_path = '/mnt/workspace/.cache/modelscope/hub/Xorbits/bge-reranker-base'
tokenizer = AutoTokenizer.from_pretrained(reranker_path)
rerank_model = AutoModelForSequenceClassification.from_pretrained(reranker_path)


def create_folder_if_not_exists(folder_path):
    """Leave a freshly created, empty directory at ``folder_path``.

    Despite the name, an existing path is not kept: it is removed together
    with everything inside it and then created again. Each step is reported
    with ``print``.

    Args:
        folder_path: Directory to (re)create.
    """
    existed_before = os.path.exists(folder_path)
    if existed_before:
        # Remove the existing directory tree, contents included.
        shutil.rmtree(folder_path)
        print(f"Folder '{folder_path}' existed and has been removed.")

    # Always create the directory; by now the path no longer exists.
    os.makedirs(folder_path)
    print(f"Folder '{folder_path}' has been created.")

# Turn every column of the sheet into a plain Python list keyed by column
# name, then show which columns are available.
column_lists = {name: values.tolist() for name, values in df.items()}
print(column_lists.keys())

dict_keys(['id', 'source', 'page', 'question', 'ground_truth', 'context'])


In [25]:
retriever_result = []
# Only the first s_index questions of the sheet are processed.
questions = column_lists['question'][:s_index]
for main_query in tqdm(questions, desc='Get retriever result'):
    # rag_chain's first step already binds its input to the prompt's {query}
    # variable, so it must receive the bare question string. Passing
    # {"query": ...} here would render the dict's repr into the prompt.
    sub_query = rag_chain.invoke(main_query)
    # Retrieve for the original question first, then for each generated one;
    # every question contributes one list of four ranked hit lists.
    fusion_queries = [
        main_query,
        sub_query.question1,
        sub_query.question2,
        sub_query.question3,
    ]
    retriever_result.append([retriever.invoke(query) for query in fusion_queries])

Get retriever result: 100%|██████████| 3/3 [06:28<00:00, 129.50s/it]


In [41]:
# One fused ranking per question: merge that question's per-query hit lists.
reciprocal_result = [
    reciprocal_rank_fusion(query_hits) for query_hits in retriever_result
]

In [45]:
# Ids of the same first s_index rows that the fused rankings were built for.
col_id = column_lists['id'][:s_index]
rerank_result = get_result_retrieva(col_id, reciprocal_result, topk=top_k)

# Start from an empty output folder (an existing one is deleted first).
create_folder_if_not_exists(folder_path)

# Save the result as indented JSON; ensure_ascii=False keeps non-ASCII text
# readable in the file.
with open(retriever_filename, 'w', encoding='utf-8') as out_file:
    json.dump(rerank_result, out_file, ensure_ascii=False, indent=4)
print('完成!')

Folder 'Result/a_rag_fusion/' has been created.
完成!


In [46]:
# Copy the first s_index entries of each sheet column into the summary, then
# attach the fused retrieval results under their own key.
info_columns = ('id', 'source', 'page', 'question', 'ground_truth', 'context')
save_info_result = {name: column_lists[name][:s_index] for name in info_columns}
save_info_result['retriever_result_list'] = get_retriever_res_list(reciprocal_result, top_k)

# Save the summary dictionary as indented UTF-8 JSON.
with open(save_info_result_filename, 'w', encoding='utf-8') as out_file:
    json.dump(save_info_result, out_file, ensure_ascii=False, indent=4)

print('完成!')

完成!
