In [6]:
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Source PDFs whose loaded documents are pooled into a single list.
dc_name = ["soybean_konw.pdf", "soybean2.pdf"]
documents = []
for tmp_name in dc_name:
    documents.extend(PyPDFLoader(tmp_name).load())

# Split the pooled documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Tag every chunk with its position in `texts` so it can be traced back later.
for idx, text in enumerate(texts):
    text.metadata["id"] = idx


In [9]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chat_models import ChatOpenAI

import os

# Chat model served through an OpenAI-compatible endpoint (see openai_api_base).
# The API key is read from the environment so the credential never lives in the
# notebook. Export ZHIPUAI_API_KEY before starting the kernel; a missing
# variable raises KeyError here rather than failing later on the first request.
# NOTE(review): a literal key was previously committed in this cell and should
# be treated as compromised; revoke/rotate it with the provider.
# NOTE(review): this cell's output shows a deprecation warning; the file
# already imports from langchain_openai, which presumably offers the maintained
# ChatOpenAI. Confirm and migrate the import separately.
llm = ChatOpenAI(
    temperature=0.6,
    model="glm-4-0520",
    openai_api_key=os.environ["ZHIPUAI_API_KEY"],
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/",
)


  warn_deprecated(


In [10]:
from typing import List
from typing import Literal, Optional, Tuple
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field


# Output schema for one generated exam item: a question and its reference
# answer, both required strings.
# NOTE(review): the Field descriptions below are runtime strings; they are
# presumably included in parser.get_format_instructions() and therefore shown
# to the model. Confirm before rewording them.
class Soybean_Q_GT(BaseModel):

    # Question generated from the supplied context.
    question: str = Field(
        ..., description="Given contextual information, not prior knowledge. Generate questions based only on the following queries."
    )
    # Reference answer for the generated question.
    ground_truth: str = Field(
        ..., description="Given contextual information, not prior knowledge. Give the corresponding answers only according to the questions generated above."
    )

# Parser that turns the model's output into a Soybean_Q_GT instance; it also
# supplies the format instructions injected into the prompt.
parser = PydanticOutputParser(pydantic_object=Soybean_Q_GT)

In [35]:
from langchain import PromptTemplate
# Prompt: a system message carrying the parser's format instructions and a
# human message that embeds the chunk text as {context}.
# NOTE(review): the human template's "Punctuation marks such as ..." sentence
# lists words rather than punctuation and repeats "according to"; confirm the
# intended wording before changing this runtime string.
system_message = (
    "system",
    "Answer the user query. Wrap the output in `json` tags\n{format_instructions}",
)
human_message = (
    "human",
    """
            Context information is below.
            ---------------------
            {context}
            ---------------------
            Given contextual information, not prior knowledge.Ask questions and find out the truth in the context below.
            You're a college professor. Your task is to set a question for the upcoming Chinese test and give the corresponding standard answer. The questions should be diverse throughout the exam.
            Questions must be written in Chinese. The questions are concise and the standard answers are as varied as possible.
            No more than 30 Chinese characters. Punctuation marks such as "this", "that", "according to", "according to" should not be used. Abbreviations can be used for titles and technical terms.
            """,
)

# Bind the format instructions once so {context} is the only remaining input.
prompt = ChatPromptTemplate.from_messages([system_message, human_message]).partial(
    format_instructions=parser.get_format_instructions()
)


In [36]:
# Pipeline: raw input -> {"context": input} -> prompt -> LLM -> parser.
# The chain's input is forwarded unchanged as the prompt's {context}, so
# callers hand it the chunk text itself rather than a dict.
context_input = {"context": RunnablePassthrough()}
rag_chain = context_input | prompt | llm | parser


In [None]:
# Smoke test on one hand-written passage before running the full batch.
query = "北豆40，为产于黑龙江省的大豆种子。是由黑龙江省农垦总局红兴隆科学研究所与黑龙江省农垦科研育种中心共同研发的产品。品种特性该品种平均生育期120天，长叶、紫花、亚有限结荚习性。株高85.8厘米，单株有效荚数34.4个，百粒重19.0克。籽粒圆形，种皮黄色，黄脐。接种鉴定，中抗大豆灰斑病，中抗SMVⅠ号株系，中感SMVⅢ号株系。粗蛋白质含量40.78%，粗脂肪含量21.99%。产量表现2006年参加北方春大豆中早熟组品种区域试验，亩产210.6千克，比对照绥农14增产4.0%，极显著；2007年续试，亩产181.2千克，比对照增产6.5%，极显著；两年区域试验亩产195.9千克，比对照增产5.2%。2007年生产试验，亩产166.8千克，比对照增产5.2%。栽培技术要点地温稳定通过7～8℃开始播种，适宜种植密度为每亩1.6万～1.7万株；以深秋施肥为好，每亩施纯量化肥8～10千克，氮、磷、钾比例1∶1.15～1.5∶0.8。该品种符合国家大豆品种审定标准，通过审定。适宜在黑龙江省第二积温带和第三积温带上限，吉林省东部地区春播种植。"

# Render the prompt with an explicit mapping so the variable binding is visible.
print(prompt.invoke({"context": query}).to_string())

# rag_chain maps its raw input to {"context": <input>} itself, so pass the
# passage text directly. Passing {"context": query} would nest the dict and the
# prompt would contain the dict's repr instead of the passage.
z = rag_chain.invoke(query)
z

In [37]:
# Column-wise views of the chunks, all in the same order as `texts`.
content = [chunk.page_content for chunk in texts]
idk = [chunk.metadata['id'] for chunk in texts]
source = [chunk.metadata['source'] for chunk in texts]
page = [chunk.metadata['page'] for chunk in texts]

In [38]:
# Run the chain over every chunk text and report how many results came back.
question_gt = rag_chain.batch(content)
generated_count = len(question_gt)
print(generated_count)

609


In [39]:
# Split the parsed results into parallel lists of questions and answers.
question_list = [item.question for item in question_gt]
ground_th_list = [item.ground_truth for item in question_gt]

In [40]:
import pandas as pd

# Column name -> values; every list holds one entry per chunk.
data_dict = dict(
    id=idk,
    source=source,
    page=page,
    question=question_list,
    ground_truth=ground_th_list,
    context=list(content),
)

# Build the table from the column mapping.
df = pd.DataFrame(data_dict)

# Plain-text dump of the frame (long frames are abbreviated when printed).
print(df)

      id            source  page  \
0      0  soybean_konw.pdf     0   
1      1  soybean_konw.pdf     0   
2      2  soybean_konw.pdf     0   
3      3  soybean_konw.pdf     0   
4      4  soybean_konw.pdf     0   
..   ...               ...   ...   
604  604      soybean2.pdf    14   
605  605      soybean2.pdf    14   
606  606      soybean2.pdf    14   
607  607      soybean2.pdf    15   
608  608      soybean2.pdf    15   

                                              question  \
0                                          请简述本文的发表时间。   
1                                      本文第一作者的工作单位是什么？   
2                                     该文章发表在哪个期刊的哪个部分？   
3                             《大豆产量形成生理学》一文的发表时间是什么时候？   
4                                     作者们致力于什么方面的作物改良？   
..                                                 ...   
604           《自然》期刊中哪一年发表了Williams和Hinton关于反向传播误差的文章？   
605                           《欧洲期刊》中关于无人机的研究主要包括哪些方面？   
606                            无人机获取的高分辨率图像通常

In [42]:
# Persist the question/answer table without the index column. The "609" in the
# file name is the hard-coded row count of this run; it is not derived from df.
output_path = 'soybean_q_gt_609.xlsx'
df.to_excel(output_path, index=False)