# RAG With 10 Documents

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import os

In [18]:
all_docs = []

In [20]:
data_dir = os.path.join(os.getcwd(), "Data2")

In [21]:
total_page = 0
for i in range(1, 11):
    file_path = os.path.join(data_dir, f"source{i}.pdf")
    if os.path.exists(file_path):
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        all_docs.extend(documents)
        print(f"Loaded source{i}.pdf - {len(documents)} pages")
        total_page += len(documents)
    else:
        print(f"source{i}.pdf not found")




Loaded source1.pdf - 11 pages
Loaded source2.pdf - 6 pages
Loaded source3.pdf - 20 pages
Loaded source4.pdf - 68 pages
Loaded source5.pdf - 11 pages
Loaded source6.pdf - 12 pages
Loaded source7.pdf - 19 pages
Loaded source8.pdf - 81 pages
Loaded source9.pdf - 28 pages
Loaded source10.pdf - 31 pages


In [22]:
total_page

287

In [23]:
my_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

In [24]:
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embedding_model=GoogleGenerativeAIEmbeddings(model="models/embedding-001")

my_docs_split = my_text_splitter.split_documents(all_docs)
myvectorstore = FAISS.from_documents(my_docs_split, embedding_model)

In [25]:
result = myvectorstore.similarity_search("Evolution of Long context in LLM?",k=5)

In [28]:
result[0].page_content

'in natural language processing, ” ACM Computing Surveys , vol. 55, no. 9,\npp. 1–35, 2023.'

In [29]:
from langchain.prompts import PromptTemplate


my_prompt_template = """
        Answer the question based on the context provided below. 
        If the context does not contain sufficient information, respond with: 
        "I do not have enough information about this."

        Context: {context}

        Question: {question}

        Answer:"""


my_prompt = PromptTemplate(
    template=my_prompt_template,
    input_variables=["context", "question"]
)



In [30]:
from langchain_core.output_parsers import StrOutputParser
parser = StrOutputParser()
my_retriever = myvectorstore.as_retriever()


In [32]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv

load_dotenv()

my_llm = ChatGroq(model="deepseek-r1-distill-llama-70b")

In [33]:
def format_docs(docs):
    return "\n\n".join([doc.page_content for doc in docs])

In [34]:
from langchain_core.runnables import RunnablePassthrough


my_rag_chain = (
    {"context":my_retriever | format_docs, "question":RunnablePassthrough()}
     | my_prompt
     | my_llm
     | parser
     
)


my_rag_chain.invoke("Tell me about Common prompt injection attacks?")

'<think>\nOkay, so the user is asking about common prompt injection attacks. I need to figure out how to answer this based on the context provided. Let me look through the context again. \n\nThe context includes a table of contents from the AWS Prescriptive Guidance document. It mentions that the document covers "Common attacks" on page 3. However, the provided context doesn\'t include the actual content from page 3. It only gives the table of contents and some introductory information.\n\nSince the context doesn\'t have the details about specific types of prompt injection attacks, I can\'t provide a detailed answer. The best I can do is acknowledge that while the document covers common attacks, the information isn\'t present here. So, I should respond that I don\'t have enough information on this topic.\n</think>\n\nI do not have enough information about this.'