In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Location of the source PDF. This is a machine-specific absolute Windows path;
# adjust it to wherever the file lives before running the notebook.
path = r"C:\Users\user\Desktop\暑假語言模型\RAG資料集\Understanding_Climate_Change.pdf"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def replace_t_with_space(list_of_documents):
    """
    Replace every tab character in each document's content with a space.

    Args:
        list_of_documents: A list of document objects, each exposing a
            'page_content' attribute.

    Returns:
        The same object that was passed in; its documents are modified in place.
    """
    for document in list_of_documents:
        # Splitting on tabs and re-joining with spaces swaps each tab for one space.
        document.page_content = " ".join(document.page_content.split("\t"))
    return list_of_documents

def retrieve_context_per_question(question, chunks_query_retriever, k=2):
    """
    Retrieve the content of the chunks most relevant to a question.

    Args:
        question: The question to look up.
        chunks_query_retriever: Retriever object exposing an 'invoke' method
            that returns documents with a 'page_content' attribute.
        k: Value forwarded to the retriever as the 'k' keyword argument.
            Defaults to 2, the value that used to be hard-coded. Pass None to
            call the retriever without 'k' and rely on its own configuration.

    Returns:
        A list holding the 'page_content' of each retrieved document, in the
        order the retriever returned them.
    """
    # Only forward 'k' when the caller asked for a specific value.
    search_options = {} if k is None else {"k": k}
    docs = chunks_query_retriever.invoke(question, **search_options)

    # Keep the chunks separate (one list entry per document) rather than joining them.
    return [doc.page_content for doc in docs]



def show_context(context):
    """
    Print every item of a context list under a numbered heading.

    Args:
        context (list): The context items to display.

    Each item is preceded by a "Context N:" line (N starts at 1) and followed
    by blank lines.
    """
    for position, item in enumerate(context, start=1):
        # One call emits the heading, the item, and the trailing "\n" on separate lines.
        print(f"Context {position}:", item, "\n", sep="\n")

In [None]:


def encode_PDF(path, token_size=300, token_overlap=50, embedding_model_name="BAAI/bge-small-zh-v1.5"):
    """
    Load a PDF, split it into chunks, embed the chunks, and index them in Chroma.

    Args:
        path: Location of the PDF file, passed to PyPDFLoader.
        token_size: Chunk size handed to the tiktoken-based splitter as
            'chunk_size'.
        token_overlap: Overlap between consecutive chunks, handed to the
            splitter as 'chunk_overlap'.
        embedding_model_name: Hugging Face model name used to build the
            embeddings. Defaults to the value that used to be hard-coded.
            NOTE(review): the default name carries a 'zh' tag while the sample
            PDF and question in this notebook are English; an English model
            such as "BAAI/bge-small-en-v1.5" may retrieve better. Confirm
            before changing the default.

    Returns:
        The Chroma vector store built from the cleaned chunks.
    """
    # Load the PDF
    pages = PyPDFLoader(path).load()

    # Split the loaded documents into overlapping chunks
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=token_size,
        chunk_overlap=token_overlap,
    )
    chunks = splitter.split_documents(pages)

    # Clean the text (tabs -> spaces) before embedding
    cleaned_chunks = replace_t_with_space(chunks)

    # Embedding model
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

    # Vector store built from the cleaned chunks
    return Chroma.from_documents(documents=cleaned_chunks, embedding=embedding_model)

# Build the vector index for the PDF
vectorstore = encode_PDF(path)

# Retriever over that index, configured with k=2
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

# Chat model served through Ollama, with temperature set to 0
llm = ChatOllama(model="llama3.2:3b-instruct-q8_0", temperature=0)

# Question used by the retrieval and generation cells below
question = "What is the main cause of climate change?"



In [4]:

# Fetch the chunks retrieved for the question, then print them for inspection.
context = retrieve_context_per_question(
    question=question,
    chunks_query_retriever=retriever,
)
show_context(context=context)


Context 1:
Understanding Climate Change 
Chapter 1: Introduction to Climate Change 
Climate change refers to significant, long-term changes in the global climate. The term 
"global climate" encompasses the planet's overall weather patterns, including temperature, 
precipitation, and wind patterns, over an extended period. Over the past century, human 
activities, particularly the burning of fossil fuels and deforestation, have significantly 
contributed to climate change. 
Historical Context 
The Earth's climate has changed throughout history. Over the past 650,000 years, there have 
been seven cycles of glacial advance and retreat, with the abrupt end of the last ice age about 
11,700 years ago marking the beginning of the modern climate era and human civilization. 
Most of these climate changes are attributed to very small variations in Earth's orbit that 
change the amount of solar energy our planet receives. During the Holocene epoch, which 
began at the end of the last ice age, hu

In [5]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
# Prompt with two placeholders: the retrieved context and the user question.
template="""
Answer the following question based in the context:
{context}

Question:{question}

"""
prompt = ChatPromptTemplate.from_template(template)

# Pipeline: fill the prompt, send it to the chat model, reduce the reply to a plain string.
rag_chain = prompt | llm | StrOutputParser()

# `context` is a list of chunk texts. Join it so the prompt receives readable
# text rather than the list's Python repr (brackets, quotes, escaped newlines).
rag_chain.invoke({"context": "\n\n".join(context), "question": question})

'According to the text, human activities, particularly the burning of fossil fuels and deforestation, have significantly contributed to climate change over the past century.'