In [16]:
import os

from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Source PDFs; every page of each file is loaded into one flat list.
dc_name = ["soybean_konw.pdf", "soybean2.pdf"]
documents = []
for pdf_path in dc_name:
    documents.extend(PyPDFLoader(pdf_path).load())

# Split the loaded pages into chunks of up to 500 characters with 200 of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
texts = text_splitter.split_documents(list(documents))

# Tag every chunk with its position in the split output.
# NOTE(review): `texts` is not passed to the Chroma store created later in this
# cell (that store is opened from a persist directory) — confirm whether this
# loading/splitting step is still needed.
for position, chunk in enumerate(texts):
    chunk.metadata["id"] = position
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Number of chunks the retriever returns per query.
top_k = 10

# Embedding model loaded from a local path and run on CPU, with normalized
# embedding vectors requested via encode_kwargs.
model_name = '/mnt/workspace/.cache/modelscope/hub/maple77/zpoint_large_embedding_zh'
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Open the Chroma store from its persist directory and expose it as a
# similarity-search retriever.
vectorstore = Chroma(embedding_function=hf, persist_directory="soybean_db2")
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=top_k),
)

In [2]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from typing import List
from typing import Literal, Optional, Tuple
from langchain_core.output_parsers import PydanticOutputParser
# Data model: structured verdict for the document-relevance grader.
# NOTE(review): PydanticOutputParser.get_format_instructions() appears to embed
# this model's JSON schema (including the class docstring and the field
# description) into the prompt — confirm before rewording either string.
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # Required field (no default); only the literal strings "yes" and "no" validate.
    binary_score: Literal["yes", "no"] = Field(
        ...,
        description="Documents are relevant to the question, 'yes' or 'no'"
    )

# Retrieval grader: asks the LLM whether a single retrieved document is
# relevant to the user question and parses the reply into GradeDocuments.

# Parser that turns the model's JSON reply into a GradeDocuments instance and
# supplies the matching format instructions for the prompt.
parser = PydanticOutputParser(pydantic_object=GradeDocuments)

# Prompt
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant. \n
    Answer the user query. Wrap the output in `json` tags\n{format_instructions}"""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
).partial(format_instructions=parser.get_format_instructions())

# LLM for grading. Only one client is built (temperature 0.7 is the value the
# grader chain actually ran with). The API key is read from the
# ZHIPUAI_API_KEY environment variable so no credential lives in the notebook;
# a missing variable raises KeyError here rather than failing at request time.
llm = ChatOpenAI(
    temperature=0.7,
    model="glm-4-0520",
    openai_api_key=os.environ["ZHIPUAI_API_KEY"],
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/",
)

# Chain input: {"question": ..., "document": ...}; output: GradeDocuments.
retrieval_grader = grade_prompt | llm | parser
# question = "agent memory"
# docs = retriever.invoke(question)
# doc_txt = texts[3].page_content
# print(retrieval_grader.invoke({"question": question, "document": doc_txt}))

In [3]:
### Generate
from langchain import PromptTemplate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
# Prompt
# prompt = hub.pull("rlm/rag-prompt")
# Prompt for the answer-generation step. It has two variables: {question} and
# {context} (the text of the retrieved snippets).
template = """
You are an expert in phenomics in agronomy, and you have a very rich knowledge of agronomy and phenomics.
Use the context snippets retrieved below to answer the agronomy question from a phenotypic point of view.
If you don't know the answer, say you don't know.

Question: {question} 

Context: {context} 

Answer:
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)
# LLM used for answer generation.
# The API key is read from the ZHIPUAI_API_KEY environment variable so that no
# credential is stored in the notebook; a missing variable raises KeyError here.
llm = ChatOpenAI(
    temperature=0.6,
    model="glm-4-0520",
    openai_api_key=os.environ["ZHIPUAI_API_KEY"],
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/",
)


# Post-processing
def format_docs(docs):
    """Return the page_content of every item in docs, joined by a blank line."""
    separator = "\n\n"
    contents = [entry.page_content for entry in docs]
    return separator.join(contents)

# Chain: prompt -> LLM -> plain string.
# Invoke with a dict holding "context" and "question"; the prompt reads those
# two keys directly from the input. No RunnablePassthrough mapping is placed in
# front of the prompt: a passthrough hands the *entire* input dict to each
# template variable, so {context} and {question} would both render as the
# whole dict instead of their own values.
rag_chain = prompt | llm | StrOutputParser()
# Run
# generation = rag_chain.invoke({"context": docs, "question": question})
# print(generation)

In [4]:
### Hallucination Grader
from typing import List
from typing import Literal, Optional, Tuple

# Data model: structured verdict for the hallucination (groundedness) grader.
# NOTE(review): PydanticOutputParser.get_format_instructions() appears to embed
# this model's JSON schema (including the class docstring and the field
# description) into the prompt — confirm before rewording either string.
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in generation answer."""

    # Required field (no default); only the literal strings "yes" and "no" validate.
    binary_score: Literal["yes", "no"] = Field(
        ...,
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )

# Hallucination grader: asks the LLM whether a generation is supported by the
# retrieved facts and parses the reply into GradeHallucinations.
parser2 = PydanticOutputParser(pydantic_object=GradeHallucinations)

# The API key is read from the ZHIPUAI_API_KEY environment variable so that no
# credential is stored in the notebook; a missing variable raises KeyError here.
llm = ChatOpenAI(
    temperature=0.6,
    model="glm-4-0520",
    openai_api_key=os.environ["ZHIPUAI_API_KEY"],
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/",
)

# Prompt
system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n 
     Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts.
     Answer the user query. Wrap the output in `json` tags\n{format_instructions}"""
hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"),
    ]
    # Format instructions must come from parser2 so that the schema shown to the
    # model is the same GradeHallucinations schema used to parse its reply.
).partial(format_instructions=parser2.get_format_instructions())

# Chain input: {"documents": ..., "generation": ...}; output: GradeHallucinations.
hallucination_grader = hallucination_prompt | llm | parser2
# hallucination_grader.invoke({"documents": docs, "generation": generation})

In [5]:
# Data model: structured verdict on whether a generation addresses the question.
# NOTE(review): when a model like this is given to PydanticOutputParser, its
# class docstring and field description appear to end up in the generated
# format instructions — confirm before rewording either string.
class GradeAnswer(BaseModel):
    """Binary score to assess answer addresses question."""

    # Required field (no default); only the literal strings "yes" and "no" validate.
    binary_score: Literal["yes", "no"] = Field(
        ...,
        description="Answer addresses the question, 'yes' or 'no'"
    )

# Answer grader: asks the LLM whether a generation resolves the user question
# and parses the reply into GradeAnswer.
# The parser is built on GradeAnswer (the schema defined for this grader), and
# the same parser provides the prompt's format instructions, so the schema the
# model is shown matches the one its reply is validated against.
parser3 = PydanticOutputParser(pydantic_object=GradeAnswer)

# The API key is read from the ZHIPUAI_API_KEY environment variable so that no
# credential is stored in the notebook; a missing variable raises KeyError here.
llm = ChatOpenAI(
    temperature=0.6,
    model="glm-4-0520",
    openai_api_key=os.environ["ZHIPUAI_API_KEY"],
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/",
)

# Prompt
system = """You are a grader assessing whether an answer addresses / resolves a question \n 
     Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question.\n
     Answer the user query. Wrap the output in `json` tags\n{format_instructions}"""
answer_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "User question: \n\n {question} \n\n LLM generation: {generation}"),
    ]
).partial(format_instructions=parser3.get_format_instructions())

# Chain input: {"question": ..., "generation": ...}; output: GradeAnswer.
answer_grader = answer_prompt | llm | parser3
# answer_grader.invoke({"question": question, "generation": generation})

In [6]:
### Question Re-writer

# LLM used to rewrite the question.
# The API key is read from the ZHIPUAI_API_KEY environment variable so that no
# credential is stored in the notebook; a missing variable raises KeyError here.
llm = ChatOpenAI(
    temperature=0.6,
    model="glm-4-0520",
    openai_api_key=os.environ["ZHIPUAI_API_KEY"],
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/",
)

# Prompt
system = """You are a question re-writer that converts an input question to a better version that is optimized \n 
     for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
re_write_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        (
            "human",
            "Here is the initial question: \n\n {question} \n Formulate an improved question.",
        ),
    ]
)

# Chain input: {"question": ...}; output: the rewritten question as a string.
question_rewriter = re_write_prompt | llm | StrOutputParser()
# question_rewriter.invoke({"question": question})

In [7]:
from typing import List

from typing_extensions import TypedDict


class GraphState(TypedDict):
    """
    State dictionary shared by the nodes of the workflow graph.

    Attributes:
        question: the question text read by the nodes
        generation: the answer text produced by the LLM
        documents: the documents currently held for the question
    """

    question: str
    generation: str
    # NOTE(review): the node functions read `.page_content` from the items stored
    # under "documents", so these look like document objects rather than plain
    # strings — confirm whether the annotation should be a list of documents.
    documents: List[str]

In [8]:
### Nodes


def retrieve(state):
    """
    Look up documents for the question held in the graph state.

    Args:
        state (dict): The current graph state; its "question" entry is read.

    Returns:
        dict: The retriever's result under "documents" and the unchanged
        question under "question".
    """
    print("---检索文档中---")
    query = state["question"]

    # Query the module-level retriever with the question text.
    hits = retriever.invoke(query)
    return dict(documents=hits, question=query)

In [9]:
def generate(state):
    """
    Produce an answer from the documents and question held in the graph state.

    Args:
        state (dict): The current graph state; "question" and "documents" are read.

    Returns:
        dict: The unchanged "documents" and "question" plus the chain's output
        under "generation".
    """
    print("---正在根据检索结果 生成回答---")
    query = state["question"]
    docs = state["documents"]

    # Concatenate the document texts, separated by blank lines, as the context.
    context_text = '\n\n'.join(doc.page_content for doc in docs)

    answer = rag_chain.invoke({"context": context_text, "question": query})
    return {"documents": docs, "question": query, "generation": answer}

In [10]:
def grade_documents(state):
    """
    Keep only the retrieved documents that the grader marks as relevant.

    Args:
        state (dict): The current graph state; "question" and "documents" are read.

    Returns:
        dict: The documents graded "yes" under "documents" (in their original
        order) and the unchanged question under "question".
    """

    print("---检查该问题是否与检索文档相关---")
    query = state["question"]
    candidates = state["documents"]

    kept = []
    for candidate in candidates:
        # One grader call per document; the verdict exposes `binary_score`.
        verdict = retrieval_grader.invoke(
            {"question": query, "document": candidate.page_content}
        )
        if verdict.binary_score != "yes":
            print("---GRADE: 该文档与结果不相关---")
            continue
        print("---GRADE: 该文档与结果相关---")
        kept.append(candidate)
    return {"documents": kept, "question": query}

In [11]:
def transform_query(state):
    """
    Replace the question in the graph state with a rewritten version.

    Args:
        state (dict): The current graph state; "question" and "documents" are read.

    Returns:
        dict: The unchanged "documents" and the rewriter's output under "question".
    """

    print("---将问题进行重写一个新的问题---")
    original_question = state["question"]
    docs = state["documents"]

    # Ask the rewriter chain for a new formulation of the question.
    rewritten = question_rewriter.invoke({"question": original_question})
    return dict(documents=docs, question=rewritten)

In [12]:
def decide_to_generate(state):
    """
    Choose the next node after document grading.

    Only the "documents" entry of the state is consulted; the question is not
    needed for this decision, so it is not read.

    Args:
        state (dict): The current graph state; must contain "documents".

    Returns:
        str: "transform_query" when the document list is empty (or otherwise
        falsy), "generate" when at least one document remains.
    """

    print("---ASSESS GRADED DOCUMENTS---")
    filtered_documents = state["documents"]

    if not filtered_documents:
        # Every document was filtered out by the relevance check, so the
        # question is rewritten before retrieving again.
        print(
            "---DECISION: 所有文档不相干，重写问题---"
        )
        return "transform_query"

    # Relevant documents remain, so an answer can be generated.
    print("---DECISION:文档相关，生成新的答案---")
    return "generate"

In [13]:
from pprint import pprint
def grade_generation_v_documents_and_question(state):
    """
    Decide what to do with a generation: accept it, regenerate, or rewrite the question.

    The generation is first checked against the documents (hallucination
    grader); only if that passes is it checked against the question (answer
    grader).

    Args:
        state (dict): The current graph state; "question", "documents" and
            "generation" are read.

    Returns:
        str: "not supported" when the hallucination grade is not "yes",
        "useful" when both grades are "yes", otherwise "not useful".
    """

    print("---检查是否存在幻觉---")
    question = state["question"]
    documents = state["documents"]
    generation = state["generation"]
    # Concatenate the document texts, separated by blank lines, as the facts.
    facts = '\n\n'.join(doc.page_content for doc in documents)

    grounding = hallucination_grader.invoke(
        {"documents": facts, "generation": generation}
    )
    if grounding.binary_score != "yes":
        # Plain print (not pprint) so this line is logged unquoted, like every
        # other status message in the workflow.
        print("---DECISION: 生成回答不以文档为基础，重写问题---")
        return "not supported"

    print("---DECISION: 生成的回答是基于文档的---")
    # Grounded: now check that the generation actually answers the question.
    print("---GRADE GENERATION vs QUESTION---")
    resolution = answer_grader.invoke({"question": question, "generation": generation})
    if resolution.binary_score == "yes":
        print("---DECISION: 生成回答解决得来问题---")
        return "useful"

    print("---DECISION: 生成回答不解决问题---")
    return "not useful"

In [17]:
from langgraph.graph import END, StateGraph, START

workflow = StateGraph(GraphState)

# Register each node function under the name the edges below refer to.
for node_name, node_fn in (
    ("retrieve", retrieve),
    ("grade_documents", grade_documents),
    ("generate", generate),
    ("transform_query", transform_query),
):
    workflow.add_node(node_name, node_fn)

# Fixed path: START -> retrieve -> grade_documents.
workflow.add_edge(START, "retrieve")
workflow.add_edge("retrieve", "grade_documents")

# After grading, either rewrite the question or generate an answer.
after_grading = {
    "transform_query": "transform_query",
    "generate": "generate",
}
workflow.add_conditional_edges("grade_documents", decide_to_generate, after_grading)

# A rewritten question always goes back through retrieval.
workflow.add_edge("transform_query", "retrieve")

# After generation, finish, regenerate, or rewrite the question.
# NOTE(review): "not supported" routes back to "generate" with the same state;
# nothing in this cell bounds the number of retries — confirm that the graph
# library's own limits are an acceptable stop condition.
after_generation = {
    "not supported": "generate",
    "useful": END,
    "not useful": "transform_query",
}
workflow.add_conditional_edges(
    "generate",
    grade_generation_v_documents_and_question,
    after_generation,
)

# Compile
app = workflow.compile()

In [18]:
from pprint import pprint

# Run the compiled graph on one question and log each node as it completes.
# Only the "question" key of the state is supplied as input.
inputs = {"question": "What are the characteristics of high-yielding soybeans phenotypically, such as growth rate? Answer as much as you can."}
# Each streamed item is iterated as a mapping of node name -> value.
for output in app.stream(inputs):
    for key, value in output.items():
        # Log the name of the node that produced this item.
        pprint(f"Node '{key}':")
        # Optional: print full state at each node
        # pprint.pprint(value["keys"], indent=2, width=80, depth=None)
    pprint("\n---\n")

# Final generation.
# `value` is the inner loop variable left over from the last streamed item, so
# this line relies on the stream having produced at least one item and on that
# last item containing a "generation" key.
pprint(value["generation"])

---检索文档中---
"Node 'retrieve':"
'\n---\n'
---检查该问题是否与检索文档相关---
---GRADE: 该文档与结果相关---
---GRADE: 该文档与结果不相关---
---GRADE: 该文档与结果相关---
---GRADE: 该文档与结果相关---
---GRADE: 该文档与结果不相关---
---GRADE: 该文档与结果不相关---
---GRADE: 该文档与结果相关---
---GRADE: 该文档与结果不相关---
---GRADE: 该文档与结果不相关---
---GRADE: 该文档与结果不相关---
---ASSESS GRADED DOCUMENTS---
---DECISION:文档相关，生成新的答案---
"Node 'grade_documents':"
'\n---\n'
---正在根据检索结果 生成回答---
---检查是否存在幻觉---
---DECISION: 生成的回答是基于文档的---
---GRADE GENERATION vs QUESTION---
---DECISION: 生成回答解决得来问题---
"Node 'generate':"
'\n---\n'
('High-yielding soybeans, from a phenotypic perspective, exhibit several '
 'characteristics that contribute to their superior yield potential. Based on '
 'the provided context, the following points can be noted:\n'
 '\n'
 '1. **Growth Rate**: High-yielding soybeans tend to have an optimal growth '
 'rate during both the vegetative and reproductive phases. The critical period '
 'for seed number determination is significantly influenced by crop growth '
 'rate

In [21]:
# Plain print of the generation held in `value`, the variable left behind by
# the streaming loop in the run cell; unlike pprint, print does not show the
# string as a quoted repr.
print(value["generation"])

High-yielding soybeans, from a phenotypic perspective, exhibit several characteristics that contribute to their superior yield potential. Based on the provided context, the following points can be noted:

1. **Growth Rate**: High-yielding soybeans tend to have an optimal growth rate during both the vegetative and reproductive phases. The critical period for seed number determination is significantly influenced by crop growth rate, duration, and dry matter accumulation (Monzon et al., 2021). Thus, a robust growth rate during these phases is likely a key characteristic of high-yielding soybeans.

2. **Stress Tolerance**: Given that stress, particularly during the reproductive phase, is a major limitation to high yield, high-yielding soybeans would be expected to exhibit better tolerance to various stresses such as drought, heat, or disease.

3. **Plant Architecture**: Newer high-yielding cultivars tend to have a shorter plant height, which is associated with reduced lodging (Specht and W