In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load each source PDF and accumulate the loader output in one flat list.
dc_name = ["soybean_konw.pdf", "soybean2.pdf"]
documents = []
for tmp_name in dc_name:
    documents.extend(PyPDFLoader(tmp_name).load())

# Split the loaded documents into overlapping chunks (500 chars, 200 overlap).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(list(documents))

# Record each chunk's position in `texts` under metadata["id"].
for idx, text in enumerate(texts):
    text.metadata["id"] = idx


In [1]:
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Number of chunks the retriever returns per query.
top_k = 10

# Embedding model loaded from a local directory, run on CPU with
# normalize_embeddings enabled.
model_name = '/mnt/workspace/.cache/modelscope/hub/maple77/zpoint_large_embedding_zh'
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)
hf = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Open the persisted Chroma collection and expose it as a similarity retriever.
vectorstore = Chroma(embedding_function=hf, persist_directory="soybean_db2")
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=top_k),
    search_type="similarity",
)

  from tqdm.autonotebook import tqdm, trange


In [2]:
import pandas as pd

df = pd.read_excel('soybean_q_gt_609.xlsx', sheet_name='Sheet1')

# Convert the sheet into a mapping of column name -> list of that column's
# values, then show which columns are available.
column_lists = {}
for col in df.columns:
    column_lists[col] = df[col].tolist()
print(column_lists.keys())

dict_keys(['id', 'source', 'page', 'question', 'ground_truth', 'context'])


In [3]:
from tqdm import tqdm, trange

# Only the first `s_index` questions are evaluated in the cells below.
s_index = 10

# Run the retriever once per question; retriever_result[i] holds the documents
# returned for question i.
retriever_result = []
for question in tqdm(column_lists['question'][:s_index], desc='Get retriever result'):
    retriever_result.append(retriever.invoke(question))

Get retriever result: 100%|██████████| 10/10 [00:40<00:00,  4.03s/it]


In [None]:
# retriever_result = retriever.batch(column_lists['question'][:100])

In [4]:
from retiever_eval_list import get_result_retrieva
# Score the raw retriever output for the first `s_index` questions.
# get_result_retrieva is a project-local helper; from the displayed result it
# returns one dict of scores per cutoff ('top_3' .. 'top_10').
# NOTE(review): presumably it matches each question's id against the
# metadata["id"] of the retrieved chunks — confirm in retiever_eval_list.
col_id = column_lists['id'][:s_index]
retriever_re = get_result_retrieva(col_id, retriever_result)
retriever_re

{'top_3': {'ht_score': 0.1,
  'mmr_score': 0.1,
  'soft_ht_score': 0.3,
  'soft_mmr_score': 0.233,
  'ndcg': 0.25},
 'top_4': {'ht_score': 0.1,
  'mmr_score': 0.1,
  'soft_ht_score': 0.4,
  'soft_mmr_score': 0.258,
  'ndcg': 0.293},
 'top_5': {'ht_score': 0.1,
  'mmr_score': 0.1,
  'soft_ht_score': 0.4,
  'soft_mmr_score': 0.278,
  'ndcg': 0.293},
 'top_6': {'ht_score': 0.1,
  'mmr_score': 0.1,
  'soft_ht_score': 0.5,
  'soft_mmr_score': 0.295,
  'ndcg': 0.329},
 'top_7': {'ht_score': 0.2,
  'mmr_score': 0.114,
  'soft_ht_score': 0.5,
  'soft_mmr_score': 0.324,
  'ndcg': 0.328},
 'top_8': {'ht_score': 0.3,
  'mmr_score': 0.127,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.349,
  'ndcg': 0.35},
 'top_9': {'ht_score': 0.3,
  'mmr_score': 0.127,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.36,
  'ndcg': 0.353},
 'top_10': {'ht_score': 0.3,
  'mmr_score': 0.127,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.36,
  'ndcg': 0.353}}

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# The tokenizer and the sequence-classification (reranker) model are both
# loaded from the same local directory.
reranker_dir = '/mnt/workspace/.cache/modelscope/hub/Xorbits/bge-reranker-base'
tokenizer = AutoTokenizer.from_pretrained(reranker_dir)
rerank_model = AutoModelForSequenceClassification.from_pretrained(reranker_dir)

In [6]:
import torch
# Re-order each question's retrieved documents by the reranker's score,
# highest first. question_rerank_result[i] is a tuple of documents for
# question i.
question_rerank_result = []
rerank_questions = column_lists['question'][:s_index]
for query_idx in trange(len(rerank_questions), desc='Rerank result'):
    candidates = retriever_result[query_idx]
    if not candidates:
        # Nothing was retrieved for this question: there is no batch to
        # tokenize and `zip(*[])` below could not be unpacked.
        question_rerank_result.append(())
        continue

    # One [question, passage] pair per retrieved document.
    pairs = [[rerank_questions[query_idx], doc.page_content] for doc in candidates]

    with torch.no_grad():
        inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
        scores = rerank_model(**inputs, return_dict=True).logits.view(-1, ).float()

    # Sort on the score alone. Sorting the (score, document) tuples directly
    # falls back to comparing the documents whenever two scores are equal
    # (e.g. duplicate chunks), which raises TypeError. With a key function the
    # sort is stable, so tied documents keep their retrieval order.
    combined = sorted(
        zip(scores.tolist(), candidates),
        key=lambda scored_doc: scored_doc[0],
        reverse=True,
    )
    scores_rerank_list, retri_rerank_list = zip(*combined)
    question_rerank_result.append(retri_rerank_list)

Rerank result: 100%|██████████| 10/10 [00:54<00:00,  5.44s/it]


In [7]:
# Score the reranked document lists with the same helper used for the raw
# retriever output, so the two result dicts can be compared cutoff by cutoff.
col_id = column_lists['id'][:s_index]
rerank_result = get_result_retrieva(col_id, question_rerank_result)
rerank_result

{'top_3': {'ht_score': 0.2,
  'mmr_score': 0.1,
  'soft_ht_score': 0.5,
  'soft_mmr_score': 0.5,
  'ndcg': 0.443},
 'top_4': {'ht_score': 0.2,
  'mmr_score': 0.1,
  'soft_ht_score': 0.5,
  'soft_mmr_score': 0.5,
  'ndcg': 0.443},
 'top_5': {'ht_score': 0.2,
  'mmr_score': 0.1,
  'soft_ht_score': 0.5,
  'soft_mmr_score': 0.5,
  'ndcg': 0.443},
 'top_6': {'ht_score': 0.2,
  'mmr_score': 0.1,
  'soft_ht_score': 0.5,
  'soft_mmr_score': 0.5,
  'ndcg': 0.443},
 'top_7': {'ht_score': 0.2,
  'mmr_score': 0.1,
  'soft_ht_score': 0.5,
  'soft_mmr_score': 0.5,
  'ndcg': 0.443},
 'top_8': {'ht_score': 0.3,
  'mmr_score': 0.113,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.525,
  'ndcg': 0.462},
 'top_9': {'ht_score': 0.3,
  'mmr_score': 0.113,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.536,
  'ndcg': 0.467},
 'top_10': {'ht_score': 0.3,
  'mmr_score': 0.113,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.556,
  'ndcg': 0.464}}

In [11]:
# Sanity check: number of reranked result lists, number of documents in the
# first list, number of questions, and number of ground-truth answers.
len(question_rerank_result), len(question_rerank_result[0]), len(column_lists['question'][:s_index]), len(column_lists['ground_truth'][:s_index])

(10, 10, 10, 10)

In [12]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chat_models import ChatOpenAI
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
)

import os
from getpass import getpass

# The API key must not live in the notebook: read it from the environment and
# fall back to an interactive prompt when it is not set.
# NOTE(review): the key that was previously committed here should be treated
# as leaked and revoked.
zhipu_api_key = os.environ.get("ZHIPUAI_API_KEY") or getpass("ZHIPUAI_API_KEY: ")

# Answer-generation model, reached through the OpenAI-compatible endpoint
# configured in openai_api_base.
llm = ChatOpenAI(
    temperature=0.6,
    model="glm-4-0520",
    openai_api_key=zhipu_api_key,
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/"
)

In [13]:
from langchain import PromptTemplate

# Prompt for answer generation. It has two placeholders, {question} and
# {context}; the trailing spaces inside the string are part of the prompt text.
template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 

Question: {question} 

Context: {context} 

Answer:
"""

# Wrap the template so it can be used as the first step of a chain.
prompt = PromptTemplate(
    template=template, 
    input_variables=["context","question"]
  )

# Show the resulting prompt object for inspection.
print(prompt)

input_variables=['context', 'question'] template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the question. \nIf you don't know the answer, just say that you don't know. \n\nQuestion: {question} \n\nContext: {context} \n\nAnswer:\n"


In [14]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chat_models import ChatOpenAI

# rag_chain is invoked with a dict that already has the "context" and
# "question" keys the prompt expects, so the input is piped straight into the
# prompt. The earlier {"context": RunnablePassthrough(), "question":
# RunnablePassthrough()} mapping forwarded the WHOLE input dict to both keys,
# so both placeholders were rendered as the repr of the entire dict.
rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

In [15]:
# Generate one answer per question from its reranked documents.
# answers[i]      -> model answer for question i
# context_list[i] -> list of chunk texts that were given to the model
answers = []
context_list = []
answer_questions = column_lists['question'][:s_index]
for i in trange(len(answer_questions), desc='Get answer'):
    chunk_texts = [doc.page_content for doc in question_rerank_result[i]]
    # Separate chunks with a blank line; joining with '' fused the end of one
    # chunk to the start of the next.
    context_tmp = '\n\n'.join(chunk_texts)
    chain_input = {"context": context_tmp, "question": answer_questions[i]}
    answers.append(rag_chain.invoke(chain_input))
    context_list.append(chunk_texts)

Get answer: 100%|██████████| 10/10 [01:02<00:00,  6.25s/it]


In [16]:
from datasets import Dataset
from langchain_community.chat_models import ChatZhipuAI
import os
# Build the evaluation data: one entry per question, with the generated
# answer, the list of context chunks, and the reference answer.
data = {
    "question": column_lists['question'][:s_index],
    "answer": answers,
    "contexts": context_list,
    "ground_truth": column_lists['ground_truth'][:s_index]
}
dataset = Dataset.from_dict(data)
dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 10
})

In [27]:
import json
# Name of the file to save to
filename = "my_data.json"

# Use json.dump() to save the dict as a JSON file
# (ensure_ascii=False keeps non-ASCII text readable in the file)
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# Name of the previously saved file
filename = "my_data.json"

# Use json.load() to load the data from the JSON file into a dict
with open(filename, 'r', encoding='utf-8') as f:
    loaded_dict = json.load(f)

print(loaded_dict)  # print the contents of the loaded dict

In [17]:
# !pip install --upgrade httpx httpx-sse PyJWT
# ChatZhipuAI is constructed without an explicit key, so the credential has to
# be present in ZHIPUAI_API_KEY. Never hardcode it in the notebook: use the
# value already in the environment, or prompt for it interactively.
# NOTE(review): the key that was previously committed here should be treated
# as leaked and revoked.
if not os.environ.get("ZHIPUAI_API_KEY"):
    from getpass import getpass
    os.environ["ZHIPUAI_API_KEY"] = getpass("ZHIPUAI_API_KEY: ")

# Judge model used by ragas.
chat = ChatZhipuAI(
    model="glm-4-0520",
    temperature=0.5,
)

# Metrics to compute (context_relevancy is imported above but not used here).
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision
]

# Reuse the retrieval embedding model for the embedding-based metrics.
langchain_embeddings = LangchainEmbeddingsWrapper(hf)

In [25]:
# Display the dataset summary (features and row count) before evaluation.
dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 10
})

In [24]:
# Run the ragas evaluation with the ZhipuAI chat model as judge and the local
# embedding model.
# NOTE(review): the recorded output of this cell is a failure in the ragas
# runner thread ("RuntimeError: This event loop is already running").
# TODO: confirm the cause for the installed ragas version; a commonly suggested
# remedy in notebooks is calling nest_asyncio.apply() before evaluate().
results = evaluate(dataset = dataset, metrics=metrics, llm=chat, embeddings=langchain_embeddings)
results

Exception in thread Thread-11:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.10/site-packages/ragas/executor.py", line 87, in run
    results = self.loop.run_until_complete(self._aresults())
  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 625, in run_until_complete
    self._check_running()
  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 584, in _check_running
    raise RuntimeError('This event loop is already running')
RuntimeError: This event loop is already running
  self._invoke_excepthook(self)
  m = tuple(map(os.fspath, m))


ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead.

  await self._event_pipe_gc()
