In [1]:
!pip install langchain openai ragas arxiv pymupdf
!pip install llama-index llama-index-llms-huggingface ipywidgets
!pip install transformers -U
!pip install sentence_transformers
!pip install unstructured
!pip install pdfminer
!pip install pypdf PyPDFLoader
!pip install rapidocr-onnxruntime
!pip install langchain_chroma
!pip install pdfplumber

Looking in indexes: https://mirrors.cloud.aliyuncs.com/pypi/simple
Collecting langchain
  Downloading https://mirrors.cloud.aliyuncs.com/pypi/packages/4c/e3/38d4c5969d91e8e4f2ada469d82322d8b9f5d218613c4a18c11be2bda648/langchain-0.2.5-py3-none-any.whl (974 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting openai
  Downloading https://mirrors.cloud.aliyuncs.com/pypi/packages/1e/b5/483e8c29c5deb562ade661e83c11ed06a28acf5d1e9d8c1944361f6be730/openai-1.35.3-py3-none-any.whl (327 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.4/327.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting ragas
  Downloading https://mirrors.cloud.aliyuncs.com/pypi/packages/9e/55/09fa95c22502322e71a90194712ea23c7381a52c7c7e10ec9215c4d71b93/ragas-0.1.9-py3-none-any.whl (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.1/86.1 kB[

In [8]:
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Read the source PDF into LangChain documents.
pdf_loader = PyPDFLoader("soybean_konw.pdf")
documents = pdf_loader.load()

# Cut the documents into overlapping chunks (500-character target size,
# 200 characters shared between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(list(documents))

# Tag every chunk with its position so it can be referenced later by id.
for idx, text in enumerate(texts):
    text.metadata.update(id=idx)

In [9]:
import os
from getpass import getpass

from langchain import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chat_models import ChatOpenAI

# Prompt asking the model for exactly one short Chinese quiz question based
# only on the supplied context chunk. `{context}` is the single template variable.
qa_generate_prompt_tmpl = """\
Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge.
generate only questions based on the below query.
You are a university professor. Your task is to set only 1 question for the upcoming Chinese quiz.
Questions throughout the test should be diverse.
Questions must be written in Chinese. The expression must be concise and clear.
It should not exceed 20 Chinese characters. Words such as "这", "那", "根据", "依据" and other punctuation marks
should not be used. Abbreviations may be used for titles and professional terms.
"""
prompt = PromptTemplate(
    template=qa_generate_prompt_tmpl,
    input_variables=["context"],
)

print(prompt)

# SECURITY: never commit API keys to a notebook. The key is read from the
# ZHIPUAI_API_KEY environment variable, falling back to an interactive prompt
# whose input is not echoed or stored in the notebook.
# Any key that was previously hard-coded here must be treated as leaked and revoked.
zhipu_api_key = os.environ.get("ZHIPUAI_API_KEY") or getpass("ZhipuAI API key: ")

# OpenAI-compatible chat client pointed at the bigmodel.cn endpoint.
llm = ChatOpenAI(
    temperature=0.6,
    model="glm-4-0520",
    openai_api_key=zhipu_api_key,
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/",
)

# Chain: raw chunk text -> {"context": text} -> prompt -> LLM -> plain string.
rag_chain = (
    {"context": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

input_variables=['context'] template='Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge.\ngenerate only questions based on the below query.\nYou are a university professor. Your task is to set only 1 question for the upcoming Chinese quiz.\nQuestions throughout the test should be diverse.\nQuestions must be written in Chinese. The expression must be concise and clear.\nIt should not exceed 20 Chinese characters. Words such as "这", "那", "根据", "依据" and other punctuation marks\nshould not be used. Abbreviations may be used for titles and professional terms.\n'


In [10]:
# Collect the raw text of every split chunk, in chunk order.
content = [chunk.page_content for chunk in texts]

In [11]:
# Run the question-generation chain over every chunk text in `content`;
# each result is the chain's string output (the generated question) for one chunk.
# NOTE(review): this sends one LLM request per chunk - presumably subject to the
# provider's rate limits and billing; confirm before re-running on a large document.
question_list = rag_chain.batch(content)
# question_list

In [12]:
import pandas as pd

# Column data for the result table: one entry per text chunk, holding a
# sequential id, the generated question and the chunk text it came from.
data_dict = dict(
    id=range(len(texts)),
    question=question_list,
    context=content,
)

# Turn the column dictionary into a pandas DataFrame.
df = pd.DataFrame(data_dict)

# Print the DataFrame as plain text.
print(df)

      id                                     question  \
0      0                             请简述本文发表的时间是什么时候？   
1      1  请简述《Frontiers in Plant Science》中文章的投稿与发表流程。   
2      2                  请简述《大豆产量形成生理学》研究对精准育种的基石作用。   
3      3                         请简述大豆产量形成生理对精准育种的意义。   
4      4                            作物产量持续改进的主要动力是什么？   
..   ...                                          ...   
295  295                   请简述使用低成本3D重建技术分析大豆表型发育的优点。   
296  296                           该研究的作者与BASF公司有何关系？   
297  297                             请简述文章中提到的版权归属情况。   
298  298                           CC BY许可下文章的版权归属是谁？   
299  299               请简述CC BY许可协议对使用版权内容的哪些方面进行了规定？   

                                               context  
0    fpls-12-719706 November 9, 2021 Time: 12:48 # ...  
1    Heart, Italy\n*Correspondence:\nJonathan T. Vo...  
2    This article was submitted to\nCrop and Produc...  
3    Yield Formation Physiology –\nA Foundation for...  
4    Improvement\nJonathan T. 

In [13]:
# Export the id / question / context table to an Excel workbook at a path relative
# to the working directory; index=False leaves the DataFrame's row index out of the file.
df.to_excel('soybean_question_500.xlsx', index=False)