In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

documents = []
dc_name = ["soybean_konw.pdf", "soybean2.pdf"]
for tmp_name in dc_name:
    # print(len(PyPDFLoader(tmp_name).load()))
    documents += PyPDFLoader(tmp_name).load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
texts = text_splitter.split_documents(documents[:])
for idx, text in enumerate(texts):
    text.metadata["id"] = idx


In [2]:
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

top_k = 10

model_name = '/mnt/workspace/.cache/modelscope/hub/maple77/zpoint_large_embedding_zh'
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
vectorstore = Chroma(persist_directory="soybean_db2", embedding_function=hf)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": top_k}
)

  from tqdm.autonotebook import tqdm, trange


In [3]:
import pandas as pd

df = pd.read_excel('soybean_q_gt_609.xlsx', sheet_name='Sheet1')

# 打印DataFrame的内容
column_lists = {col: df[col].tolist() for col in df.columns}
print(column_lists.keys())

dict_keys(['id', 'source', 'page', 'question', 'ground_truth', 'context'])


In [4]:
from tqdm import tqdm, trange

s_index = 10

retriever_result = []
for tmp_q in tqdm(range(len(column_lists['question'][:s_index])), desc='Get retriever result'):
    # print(tmp_q)
    retriever_result.append(retriever.invoke(column_lists['question'][tmp_q]))

Get retriever result: 100%|██████████| 10/10 [06:14<00:00, 37.45s/it]


In [None]:
# retriever_result = retriever.batch(column_lists['question'][:100])

In [5]:
from retiever_eval_list import get_result_retrieva
col_id = column_lists['id'][:s_index]
retriever_re = get_result_retrieva(col_id, retriever_result)
retriever_re

{'top_3': {'ht_score': 0.2,
  'mmr_score': 0.15,
  'soft_ht_score': 0.4,
  'soft_mmr_score': 0.283,
  'ndcg': 0.313},
 'top_4': {'ht_score': 0.2,
  'mmr_score': 0.15,
  'soft_ht_score': 0.5,
  'soft_mmr_score': 0.308,
  'ndcg': 0.356},
 'top_5': {'ht_score': 0.2,
  'mmr_score': 0.15,
  'soft_ht_score': 0.5,
  'soft_mmr_score': 0.328,
  'ndcg': 0.356},
 'top_6': {'ht_score': 0.2,
  'mmr_score': 0.15,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.345,
  'ndcg': 0.392},
 'top_7': {'ht_score': 0.3,
  'mmr_score': 0.164,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.374,
  'ndcg': 0.391},
 'top_8': {'ht_score': 0.4,
  'mmr_score': 0.177,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.399,
  'ndcg': 0.414},
 'top_9': {'ht_score': 0.4,
  'mmr_score': 0.177,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.399,
  'ndcg': 0.414},
 'top_10': {'ht_score': 0.4,
  'mmr_score': 0.177,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.399,
  'ndcg': 0.414}}

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('/mnt/workspace/.cache/modelscope/hub/Xorbits/bge-reranker-base')
rerank_model = AutoModelForSequenceClassification.from_pretrained('/mnt/workspace/.cache/modelscope/hub/Xorbits/bge-reranker-base')

In [7]:
import torch
question_rerank_result = []
for query_idx in trange(len(column_lists['question'][:s_index]), desc='Rerank result'):
    pairs = []
    for idx in range(len(retriever_result[query_idx])):
        pairs.append([column_lists['question'][:s_index][query_idx], retriever_result[query_idx][idx].page_content])

    with torch.no_grad():
        inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
        scores = rerank_model(**inputs, return_dict=True).logits.view(-1, ).float()
    combined = sorted(zip(scores, retriever_result[query_idx]), reverse=True)
    scores_rerank_list, retri_rerank_list = zip(*combined)
    # print(scores_rerank_list, retri_rerank_list)
    question_rerank_result.append(retri_rerank_list)

Rerank result: 100%|██████████| 10/10 [04:25<00:00, 26.57s/it]


In [8]:
col_id = column_lists['id'][:s_index]
rerank_result = get_result_retrieva(col_id, question_rerank_result)
rerank_result

{'top_3': {'ht_score': 0.3,
  'mmr_score': 0.25,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.55,
  'ndcg': 0.563},
 'top_4': {'ht_score': 0.3,
  'mmr_score': 0.25,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.55,
  'ndcg': 0.563},
 'top_5': {'ht_score': 0.3,
  'mmr_score': 0.25,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.55,
  'ndcg': 0.563},
 'top_6': {'ht_score': 0.3,
  'mmr_score': 0.25,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.55,
  'ndcg': 0.563},
 'top_7': {'ht_score': 0.3,
  'mmr_score': 0.25,
  'soft_ht_score': 0.6,
  'soft_mmr_score': 0.55,
  'ndcg': 0.563},
 'top_8': {'ht_score': 0.4,
  'mmr_score': 0.263,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.575,
  'ndcg': 0.583},
 'top_9': {'ht_score': 0.4,
  'mmr_score': 0.263,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.586,
  'ndcg': 0.587},
 'top_10': {'ht_score': 0.4,
  'mmr_score': 0.263,
  'soft_ht_score': 0.7,
  'soft_mmr_score': 0.606,
  'ndcg': 0.584}}

In [11]:
len(question_rerank_result), len(question_rerank_result[0]), len(column_lists['question'][:s_index]), len(column_lists['ground_truth'][:s_index])

(10, 10, 10, 10)

In [12]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chat_models import ChatOpenAI
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
)

llm = ChatOpenAI(
    temperature=0.6,
    model="glm-4-0520",
    openai_api_key="661a7aa0aeb8ca129eb4647461123230.bl9w581QKpnMfBvs",
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/"
)

  warn_deprecated(


In [13]:
from langchain import PromptTemplate

template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 

Question: {question} 

Context: {context} 

Answer:
"""

prompt = PromptTemplate(
    template=template, 
    input_variables=["context","question"]
  )

print(prompt)

input_variables=['context', 'question'] template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the question. \nIf you don't know the answer, just say that you don't know. \n\nQuestion: {question} \n\nContext: {context} \n\nAnswer:\n"


In [16]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chat_models import ChatOpenAI

# llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
rag_chain = (
    {"context": RunnablePassthrough(),  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

In [26]:
answers = []
context_list = []
rerank_top_k = 3

for i in trange(len(column_lists['question'][:s_index]), desc='Get answer'):
    query_tmp = column_lists['question'][i]
    context_tmp = ''.join(qr.page_content for qr in question_rerank_result[i][:rerank_top_k])
    inputs = {"context": context_tmp, "question": query_tmp}
    answers.append(rag_chain.invoke(inputs))
    context_list.append([docs.page_content for docs in question_rerank_result[i][:rerank_top_k]])

Get answer: 100%|██████████| 10/10 [00:46<00:00,  4.68s/it]


In [27]:
from datasets import Dataset
from langchain_community.chat_models import ChatZhipuAI
import os
# 构建数据
data = {
    "question": column_lists['question'][:s_index],
    "answer": answers,
    "contexts": context_list,
    "ground_truth": column_lists['ground_truth'][:s_index]
}
dataset = Dataset.from_dict(data)
dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 10
})

In [28]:
import json
# 指定你想要保存的文件名
filename = "my_data.json"

# 使用json.dump()将字典保存为json文件
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# 指定之前保存的文件名
filename = "my_data.json"

# 使用json.load()从json文件加载数据到字典
with open(filename, 'r', encoding='utf-8') as f:
    loaded_dict = json.load(f)

print(loaded_dict)  # 输出加载的字典内容

In [30]:
# !pip install --upgrade httpx httpx-sse PyJWT
os.environ["ZHIPUAI_API_KEY"] = "661a7aa0aeb8ca129eb4647461123230.bl9w581QKpnMfBvs"
chat = ChatZhipuAI(
    model="glm-4-0520",
    temperature=0.5,
)
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision
]
langchain_embeddings = LangchainEmbeddingsWrapper(hf)

In [31]:
dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 10
})

In [32]:
results = evaluate(dataset = dataset, metrics=metrics, llm=chat, embeddings=langchain_embeddings)
results

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Failed to parse output. Returning None.


{'faithfulness': 0.7395, 'answer_relevancy': 0.6588, 'context_recall': 0.5000, 'context_precision': 0.5583}

In [33]:
results.to_pandas()

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision
0,请简述本文的发表时间。,本文的发表时间为2021年11月15日。,"[fpls-12-719706 November 9, 2021 Time: 12:48 #...",2021年11月15日,1.0,0.864029,1.0,1.0
1,本文第一作者的工作单位是什么？,本文第一作者的工作单位是BASF Corporation或其子公司。,[achieving the goal of using precision breedin...,BASF,0.5,0.899457,0.0,1.0
2,该文章发表在哪个期刊的哪个部分？,该文章发表在期刊《Frontiers in Plant Science》的“Crop and...,"[Heart, Italy\n*Correspondence:\nJonathan T. V...",Frontiers in Plant Science的Crop and Product Ph...,1.0,0.886989,1.0,1.0
3,《大豆产量形成生理学》一文的发表时间是什么时候？,《大豆产量形成生理学》一文的发表时间是2021年11月15日。,[This article was submitted to\nCrop and Produ...,2021年,0.0,0.787779,1.0,1.0
4,作者们致力于什么方面的作物改良？,作者们致力于作物产量的持续改良，特别是通过基因组编辑和精准育种方法改善作物生长速率、叶面积持...,"[altered fruit size, inﬂorescence branching, a...",提高作物产量,1.0,0.787484,1.0,0.583333
5,植物育种家提高作物产量的方法是什么？,植物育种家提高作物产量的方法是通过对作物的遗传改良，选择和培育具有更高产量的品种。这通常涉及...,"[Improvement\nJonathan T. Vogel *, Weidong Liu...",表现型选择,,0.765126,0.0,0.0
6,分子机制如何影响植物生理过程？,分子机制通过影响植物体内的基因和生物化学途径，进而影响植物的各种生理过程。具体来说，分子、遗...,"[based selection, without speciﬁc knowledge of...",通过影响与生理过程相关的基因和途径。,1.0,0.826295,1.0,1.0
7,哪些基因和途径对产量潜力有贡献？,哪些基因和途径对产量潜力有贡献？\n\n对于大豆产量潜力的贡献，相关研究提到了遗传技术和育种...,"[1281663\nSpecht, J. E., and Williams, J. H. (...",许多生理过程的基因和途径,0.555556,0.0,0.0,0.0
8,作物产量生理学的基础分子机制是什么？,作物产量生理学的基础分子机制指的是影响作物产量形成的生理过程中的分子层面的作用机制。具体到基...,[Each phase plays a role in yield formation th...,基础分子机制是指影响作物产量的分子层面的过程和结构。,0.6,0.0,0.0,0.0
9,作物生理与产量限制过程分子机制是什么？,作物生理与产量限制过程的分子机制涉及到影响作物生长和发育的生理过程中的生化路径和基因表达。文...,[biochemical knowledge to improve the physiolo...,作物生理、育种、遗传和分子知识的整合可确定相关产量性状的精准育种目标。,1.0,0.771089,0.0,0.0
