In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Source PDFs to index; every Document returned by each loader is gathered
# into one flat list, in file order.
dc_name = ["soybean_konw.pdf", "soybean2.pdf"]
documents = [doc for pdf_path in dc_name for doc in PyPDFLoader(pdf_path).load()]

# Split into chunks (chunk_size=500, chunk_overlap=200 as configured below).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Stamp every chunk with its running index so it can be referred to by "id".
for chunk_no, chunk in enumerate(texts):
    chunk.metadata["id"] = chunk_no


In [2]:
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Number of candidates the retriever returns per query.
top_k = 10

# Local embedding checkpoint, run on CPU with normalized output vectors.
# NOTE(review): HuggingFaceBgeEmbeddings applies a default BGE query
# instruction to queries; this checkpoint does not look like a BGE model —
# confirm this matches how the "soybean_db2" store was built.
model_name = '/mnt/workspace/.cache/modelscope/hub/maple77/zpoint_large_embedding_zh'
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)
hf = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

# Open the persisted Chroma collection and expose it as a similarity retriever.
vectorstore = Chroma(persist_directory="soybean_db2", embedding_function=hf)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=dict(k=top_k))

  from tqdm.autonotebook import tqdm, trange


In [3]:
import pandas as pd

# Evaluation set: one row per question (see the printed column names).
df = pd.read_excel('soybean_q_gt_609.xlsx', sheet_name='Sheet1')

# Turn the frame into {column name: list of values} and show which columns exist.
column_lists = {col_name: series.tolist() for col_name, series in df.items()}
print(column_lists.keys())

dict_keys(['id', 'source', 'page', 'question', 'ground_truth', 'context'])


In [4]:
from tqdm import tqdm, trange

# Only the first `s_index` questions are evaluated.
s_index = 10

# One retriever call per question; results stay aligned with the question order.
eval_questions = column_lists['question'][:s_index]
retriever_result = [
    retriever.invoke(question)
    for question in tqdm(eval_questions, desc='Get retriever result')
]

Get retriever result: 100%|██████████| 10/10 [05:43<00:00, 34.32s/it]


In [None]:
# retriever_result = retriever.batch(column_lists['question'][:100])

In [5]:
from retiever_eval_list import get_result_retrieva
# Values of the 'id' column for the same first `s_index` rows that were queried.
col_id = column_lists['id'][:s_index]
# Score the raw retriever output with the project helper (implementation lives
# in retiever_eval_list and is not shown here). The displayed result is a dict
# keyed 'top_3' .. 'top_10', each holding ht/mmr/soft_ht/soft_mmr/ndcg scores.
retriever_re = get_result_retrieva(col_id, retriever_result)
# Bare expression so the notebook renders the metrics dict.
retriever_re

{'top_3': {'ht_score': 0.2,
  'mmr_score': 0.15,
  'soft_ht_score': 0.4,
  'soft_mmr_score': 0.283,
  'ndcg': 0.313},
 'top_4': {'ht_score': 0.2,
  'mmr_score': 0.15,
  'soft_ht_score': 0.5,
  'soft_mmr_score': 0.308,
  'ndcg': 0.356},
 'top_5': {'ht_score': 0.2,
  'mmr_score': 0.15,
  'soft_ht_score': 0.5,
  'soft_mmr_score': 0.328,
  'ndcg': 0.356},
 'top_6': {'ht_score': 0.2,
  'mmr_score': 0.15,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.345,
  'ndcg': 0.392},
 'top_7': {'ht_score': 0.4,
  'mmr_score': 0.179,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.388,
  'ndcg': 0.424},
 'top_8': {'ht_score': 0.4,
  'mmr_score': 0.179,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.413,
  'ndcg': 0.418},
 'top_9': {'ht_score': 0.4,
  'mmr_score': 0.179,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.413,
  'ndcg': 0.418},
 'top_10': {'ht_score': 0.4,
  'mmr_score': 0.179,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.413,
  'ndcg': 0.418}}

In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Tokenizer and sequence-classification model are loaded from the same local
# reranker checkpoint directory.
reranker_dir = '/mnt/workspace/.cache/modelscope/hub/Xorbits/bge-reranker-base'
tokenizer = AutoTokenizer.from_pretrained(reranker_dir)
rerank_model = AutoModelForSequenceClassification.from_pretrained(reranker_dir)

In [7]:
import torch
# Rerank each question's retrieved candidates with the sequence-classification
# model loaded above: score every (question, passage) pair, then order the
# candidates by descending score.
questions = column_lists['question'][:s_index]  # sliced once, not per candidate
question_rerank_result = []
for query_idx in trange(len(questions), desc='Rerank result'):
    candidates = retriever_result[query_idx]
    if not candidates:
        # Nothing was retrieved for this question: record an empty ranking
        # rather than failing in the tokenizer or when unpacking the sort.
        question_rerank_result.append([])
        continue

    pairs = [[questions[query_idx], doc.page_content] for doc in candidates]
    with torch.no_grad():
        inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
        # One logit per pair, converted to plain Python floats for sorting.
        scores = rerank_model(**inputs, return_dict=True).logits.view(-1).float().tolist()

    # Sort on the score only. Sorting the raw (score, doc) tuples would fall
    # back to comparing the documents themselves when two scores tie, and they
    # do not support ordering. sorted() is stable, so tied candidates keep
    # their original retriever order.
    ranked = sorted(zip(scores, candidates), key=lambda scored: scored[0], reverse=True)
    question_rerank_result.append([doc for _, doc in ranked])

Rerank result: 100%|██████████| 10/10 [03:48<00:00, 22.89s/it]


In [8]:
# Values of the 'id' column for the first `s_index` rows (same slice the
# reranked results were built from).
col_id = column_lists['id'][:s_index]
# Score the reranked candidate lists with the same project helper used for the
# raw retriever output, so the two metric dicts are directly comparable.
rerank_result = get_result_retrieva(col_id, question_rerank_result)
# Bare expression so the notebook renders the metrics dict.
rerank_result

{'top_3': {'ht_score': 0.3,
  'mmr_score': 0.2,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.6,
  'ndcg': 0.543},
 'top_4': {'ht_score': 0.3,
  'mmr_score': 0.2,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.6,
  'ndcg': 0.543},
 'top_5': {'ht_score': 0.3,
  'mmr_score': 0.2,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.6,
  'ndcg': 0.543},
 'top_6': {'ht_score': 0.3,
  'mmr_score': 0.2,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.6,
  'ndcg': 0.543},
 'top_7': {'ht_score': 0.3,
  'mmr_score': 0.2,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.6,
  'ndcg': 0.543},
 'top_8': {'ht_score': 0.4,
  'mmr_score': 0.212,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.625,
  'ndcg': 0.562},
 'top_9': {'ht_score': 0.4,
  'mmr_score': 0.212,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.636,
  'ndcg': 0.567},
 'top_10': {'ht_score': 0.4,
  'mmr_score': 0.212,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.656,
  'ndcg': 0.564}}