In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def replace_tab_with_space(texts):
    """Swap every tab character in each document's text for a single space.

    Args:
        texts: Iterable of objects exposing a writable string attribute
            ``page_content``.

    Returns:
        The very same ``texts`` object that was passed in; its elements are
        modified in place rather than copied.
    """
    for document in texts:
        cleaned = document.page_content.replace("\t", " ")
        document.page_content = cleaned
    return texts


def encode_pdf(path,token_size=300,token_overlap=50,
               embedding_model_name="qwen3-embedding:0.6b",
               collection_name="qwen3_embedding_1024"):
    """Load a PDF, split it into token-sized chunks and index them in Chroma.

    Args:
        path: Path of the PDF file handed to ``PyPDFLoader``.
        token_size: ``chunk_size`` passed to the tiktoken-based splitter.
        token_overlap: ``chunk_overlap`` passed to the tiktoken-based splitter.
        embedding_model_name: Ollama model name used to build the
            ``OllamaEmbeddings`` instance. Defaults to the model this
            notebook was written against.
        collection_name: Name of the Chroma collection the chunks are
            written to. Defaults to the name this notebook was written
            against.

    Returns:
        The ``Chroma`` vector store built from the cleaned chunks.
    """
    loader=PyPDFLoader(path)
    docs=loader.load()

    text_splitter=RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=token_size,chunk_overlap=token_overlap)
    texts=text_splitter.split_documents(docs)

    # Tabs are replaced with spaces before embedding (mutates the chunks in place).
    clean_texts=replace_tab_with_space(texts)

    embedding_model=OllamaEmbeddings(model=embedding_model_name)

    # NOTE(review): calling this twice in one kernel with the same collection_name
    # presumably adds the same chunks to that collection again — confirm, and pass a
    # different collection_name if duplicated retrieval results show up.
    vectorstore=Chroma.from_documents(embedding=embedding_model,documents=clean_texts,collection_name=collection_name )

    return vectorstore


def retriever_context_per_question(question,query_retriever,k=2):
    """Retrieve documents for a question and extract their text.

    Args:
        question: Query string handed to the retriever.
        query_retriever: Object with an ``invoke(question, k=...)`` method
            that returns an iterable of documents exposing ``page_content``.
        k: Value forwarded to the retriever as the ``k`` keyword argument.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        A ``(context, docs)`` tuple: ``context`` is the list of
        ``page_content`` values, ``docs`` is whatever the retriever returned.
    """
    docs=query_retriever.invoke(question,k=k)

    context=[doc.page_content for doc in docs]

    return context,docs

def show_context(context):
    """Print every context entry under a 1-based ``context<N>:`` heading.

    Args:
        context: Iterable of printable items (one per retrieved chunk).

    Each entry is written as the heading line, the entry itself, and then
    two empty lines as a visual separator.
    """
    for position, passage in enumerate(context, start=1):
        # Heading, entry and a lone "\n" separated by newlines yield the
        # heading line, the entry line and two blank lines.
        print(f"context{position}:", passage, "\n", sep="\n")



In [3]:
# Location of the PDF to index.
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot run on
# another machine without editing this line.
path=r"C:\Users\user\Desktop\暑假語言模型\RAG資料集\2401.15884v3.pdf"

# Load, chunk and embed the PDF into a Chroma vector store (default chunking parameters).
vectorstore=encode_pdf(path)

# Wrap the store in a retriever; no search options are passed here.
retriever=vectorstore.as_retriever()

# Query reused by the retrieval, grading, generation and highlighting cells.
question="What the CRAG proposed to improve "


In [4]:
# Retrieve for the question: `context` holds the chunk texts, `docs` the document objects.
context,docs=retriever_context_per_question(question,retriever)
# Print the retrieved chunk texts for inspection.
show_context(context)


context1:
implemented into RAG (Lewis et al., 2020) and
Self-RAG (Asai et al., 2024) for demonstrating its
adaptability to RAG-based approaches. Results on
four datasets of PopQA (Mallen et al., 2023), Biog-
raphy (Min et al., 2023), Pub Health (Zhang et al.,
2023a), and Arc-Challenge (Bhakthavatsalam et al.,
2021) show that CRAG can significantly improve
the performance of standard RAG and state-of-the-
art Self-RAG, demonstrating its generalizability
across both short- and long-form generation tasks.
To facilitate others to reproduce our results, we will
publish all source code later.
In summary, our contributions in this paper are
three-fold: 1) This paper studies the scenarios
where the retriever returns inaccurate results and,
to the best of our knowledge, makes the first
attempt to design corrective strategies for RAG to
improve its robustness. 2) A plug-and-play method
named CRAG is proposed to improve the ability of
automatic self-correction and efficient utilization
of retriev

##### 讓 LLM 自己判斷檢索到的文章和問題有沒有相關

In [5]:
from pydantic import BaseModel,Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama


# Output schema for the relevance grader: one string field that should be 'yes' or 'no'.
# NOTE(review): this class is handed to llm.with_structured_output, so the docstring and
# the Field description are presumably forwarded to the model as part of the schema —
# confirm before rewording either of them.
class GradDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # Declared as a plain str, so values other than 'yes'/'no' are not rejected by the
    # schema itself; `...` marks the field as required.
    binary_score : str = Field(
        ...,
        description="Documents are relevant to the question, 'yes' or 'no'"
    )

# Chat model used for grading; temperature=0 is requested for the grading call.
llm=ChatOllama(model="llama3.2:3b-instruct-q8_0",temperature=0)
# Same model, configured to return its answer in the GradDocuments schema.
structured_llm_grader=llm.with_structured_output(GradDocuments)

# Prompt: the system message defines the grading task, the human message carries the
# document text and the user question.
system = """You are a grader assessing relevance of a retrieved document to a user question. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

# structured_llm_grader already includes its own output parsing, so no parser is appended.
grade_chain=grade_prompt | structured_llm_grader



In [6]:
# Filter out the non-relevant docs: keep only those the grader marks as relevant.
docs_to_use=[]
for doc in docs :
    print(doc.page_content, "\n" ,"-"*50)
    res=grade_chain.invoke({"question":question,"document":doc.page_content})
    print(res,"\n")
    # The verdict comes from an LLM and the schema only requires a str, so "Yes" or
    # " yes" are possible; normalise before comparing instead of requiring the exact
    # string "yes". getattr also guards against a result without the field (e.g. None
    # if structured parsing produced nothing — depends on the structured-output method).
    verdict=getattr(res,"binary_score",None)
    if isinstance(verdict,str) and verdict.strip().lower() == "yes":
        docs_to_use.append(doc)


implemented into RAG (Lewis et al., 2020) and
Self-RAG (Asai et al., 2024) for demonstrating its
adaptability to RAG-based approaches. Results on
four datasets of PopQA (Mallen et al., 2023), Biog-
raphy (Min et al., 2023), Pub Health (Zhang et al.,
2023a), and Arc-Challenge (Bhakthavatsalam et al.,
2021) show that CRAG can significantly improve
the performance of standard RAG and state-of-the-
art Self-RAG, demonstrating its generalizability
across both short- and long-form generation tasks.
To facilitate others to reproduce our results, we will
publish all source code later.
In summary, our contributions in this paper are
three-fold: 1) This paper studies the scenarios
where the retriever returns inaccurate results and,
to the best of our knowledge, makes the first
attempt to design corrective strategies for RAG to
improve its robustness. 2) A plug-and-play method
named CRAG is proposed to improve the ability of
automatic self-correction and efficient utilization
of retrieved documen

##### 用 LLM 認為相關的文章來回答問題

In [7]:
from langchain_core.output_parsers import StrOutputParser
# System instructions for the answer-generation step.
# NOTE(review): the text says "based upon your knowledge" although the human message
# supplies retrieved documents — confirm this wording is intended.
system = """You are an assistant for question-answering tasks.
Answer the question based upon your knowledge.
Use three-to-six sentences maximum and keep the answer concise."""

# The <docs> and <question> tags delimit each input so the LLM can tell where it starts and ends.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved documents: \n\n <docs>{documents}</docs> \n\n "
         "User question: <question>{question}</question>"),
    ]
)

# Chat model for generation (same model name and temperature as the relevance grader's llm).
llm=ChatOllama(model="llama3.2:3b-instruct-q8_0",temperature=0)

#post-processing
def forma_docs(docs):
    """Render documents as numbered ``<docN>`` blocks joined by newlines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        One string in which each document appears between ``<docN>:`` and
        ``</docN>`` markers (N starts at 1); an empty string for no documents.
    """
    rendered = []
    for idx, document in enumerate(docs):
        rendered.append(
            f"<doc{idx+1}>: \n"
            f"content:{document.page_content} \n"
            f"</doc{idx+1}> \n"
        )
    return "\n".join(rendered)

# Pipeline: fill the prompt, call the chat model, then parse the reply with StrOutputParser.
final_chain=prompt | llm | StrOutputParser()

# Answer using only the documents that passed the relevance filter (docs_to_use).
generation=final_chain.invoke({"question":question,"documents":forma_docs(docs_to_use)})
print(generation)

CRAG (Corrective Retrieval Augmented Generation) is proposed to improve the robustness of generation in RAG-based approaches. Specifically, it aims to address the problem where retrieval goes wrong and exposes inaccurate knowledge to generative LMs. By estimating and triggering three knowledge retrieval actions discriminately, CRAG enhances the ability of automatic self-correction and efficient utilization of retrieved documents.


#### Check for Hallucinations

In [8]:
# Output schema for the hallucination grader: one string field that should be 'yes' or 'no'.
# NOTE(review): this class is handed to llm.with_structured_output, so the docstring and
# the Field description are presumably forwarded to the model — confirm before rewording.
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in 'generation' answer."""

    # Per the description, 'yes' means the answer IS grounded in the facts.
    binary_score: str = Field(
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )

# NOTE: this cell rebinds llm, structured_llm_grader, system and prompt, names that
# earlier cells also assign.
llm=ChatOllama(model="llama3.2:3b-instruct-q8_0",temperature=0)
# Same model, configured to return its answer in the GradeHallucinations schema.
structured_llm_grader=llm.with_structured_output(GradeHallucinations)

# System message: defines the grounding check and what 'Yes' means.
system="""
You are a grader assessing whether an LLM generation is grounded in a set of retrieved facts. \n
Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in the set of facts.
"""

# Human message: the formatted documents (as facts) followed by the generated answer.
prompt=ChatPromptTemplate.from_messages([
    ("system",system),
    ("human","Set of fact <fact>{documents}</fact> \n\n"
    " LLm generation :<generation>{generation}</generation>")
])

hallucination_grader_chain=prompt | structured_llm_grader

# Grade the generated answer against the same documents that were used to produce it.
respond=hallucination_grader_chain.invoke({"documents":forma_docs(docs_to_use),"generation":generation})
print(respond)

binary_score='yes'


##### Highlight used docs 標出回答用的是哪些資訊

In [9]:
from typing import List
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate


# Output schema for the highlighting step: four required lists of strings.
# NOTE(review): the class is given to PydanticOutputParser, whose format instructions are
# inserted into the prompt, so the Field descriptions are presumably shown to the model —
# confirm before rewording them.
class HighlightDocuments(BaseModel):
    """Return the specific part of a document used for answering the question."""

    # Identifiers of the documents that were used.
    id: List[str] = Field(
        ...,
        description="List of id of docs used to answers the question"
    )

    # Titles of the documents that were used.
    title: List[str] = Field(
        ...,
        description="List of titles used to answers the question"
    )

    # Sources of the documents that were used.
    source: List[str] = Field(
        ...,
        description="List of sources used to answers the question"
    )

    # Text segments taken from the used documents.
    segment: List[str] = Field(
        ...,
        description="List of direct segements from used documents that answers the question"
    )
# LLM for the highlighting step (a different model name from the one used by the graders).
llm=ChatOllama(model="qwen3:4b-instruct-2507-q8_0",temperature=0)

# Parser that turns the model's reply into a HighlightDocuments instance.
parser=PydanticOutputParser(pydantic_object=HighlightDocuments)

# Prompt template: task description, the three inputs, then the parser's format instructions.
template = """You are an advanced assistant for document search and retrieval. You are provided with the following:
1. A question.
2. A generated answer based on the question.
3. A set of documents that were referenced in generating the answer.

Your task is to identify and extract the exact inline segments from the provided documents that directly correspond to the content used to
generate the given answer. The extracted segments must be verbatim snippets from the documents, ensuring a word-for-word match with the text
in the provided documents.

Ensure that:
- (Important) Each segment is an exact match to a part of the document and is fully contained within the document text.
- The relevance of each segment to the generated answer is clear and directly supports the answer provided.
- (Important) If you didn't used the specific document don't mention it.

Used documents: <docs>{documents}</docs> \n\n User question: <question>{question}</question> \n\n Generated answer: <answer>{generation}</answer>

<format_instruction>
{format_instructions}
</format_instruction>
"""

# format_instructions is filled in once here; the other three variables are supplied at invoke time.
prompt=PromptTemplate(
    input_variables=["question","documents","generation"],
    template=template,
    partial_variables={"format_instructions":parser.get_format_instructions()},
)

highlight_chain=prompt | llm | parser

# Rebinds `respond` (previously the hallucination grade) to the parsed HighlightDocuments result.
respond=highlight_chain.invoke({
    "question":question,"documents":forma_docs(docs_to_use),"generation":generation
})



Pydantic 的特性是：當你建立一個這個模型的物件時，它會自動把所有定義的欄位當作物件屬性，並且提供驗證、類型檢查等功能。<br>
因此 parser 回傳的 `respond` 可以直接用屬性取值：<br>
respond.id       # 對應 id 欄位<br>
respond.title    # 對應 title 欄位<br>
respond.source   # 對應 source 欄位<br>
respond.segment  # 對應 segment 欄位<br>


In [11]:
# Print the highlight fields side by side, one record per position.
# The loop variable is named doc_id rather than id: loop variables at cell level stay in the
# kernel's global namespace, where `id` would shadow the builtin id() for every later cell.
# zip stops at the shortest of the four lists, so extra entries in a longer list are not printed.
for doc_id,title,source,segment in zip(respond.id,respond.title,respond.source,respond.segment):
    print(f"ID : {doc_id} \ntitle : {title} \nsource : {source} \nText segment : {segment}")

ID : doc1 
title : CRAG proposed to improve robustness in RAG-based approaches 
source : This paper studies the problem where RAG-based approaches are challenged if retrieval goes wrong, thereby exposing inaccurate and misleading knowledge to generative LMs. Corrective Retrieval Augmented Generation is proposed to improve the robustness of generation. Essentially, a lightweight retrieval evaluator is to estimate and trigger three knowledge retrieval actions discriminately. With the further leverage of web search and optimized knowledge utilization, CRAG has significantly improved the ability of automatic self-correction and efficient utilization of retrieved documents. 
Text segment : CRAG (Corrective Retrieval Augmented Generation) is proposed to improve the robustness of generation in RAG-based approaches. Specifically, it aims to address the problem where retrieval goes wrong and exposes inaccurate knowledge to generative LMs. By estimating and triggering three knowledge retrieval a