In [3]:
from langchain_community.document_loaders import PyPDFLoader

# Path to the PDF to index, relative to the notebook's working directory.
file_path = "syllabus.pdf"
loader = PyPDFLoader(file_path)
# Eagerly read the whole PDF through the loader in a single call.
pages = loader.load()

In [4]:
## lazy load: consume the loader's iterator and collect every yielded document
pages = list(loader.lazy_load())
len(pages)

34

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured for chunks of size 500 with an overlap of 50 between
# neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

In [6]:
# Break the loaded pages into smaller, overlapping chunks for embedding.
split_pages = splitter.split_documents(pages)

In [7]:
# Number of chunks produced by the splitter.
len(split_pages)

81

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model handed to the vector store as its embedding function.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [11]:
# Size the FAISS index from the embedding model itself rather than hard-coding
# 384: the index dimension must equal the length of the vectors the model
# produces, and deriving it here keeps the two in sync if `model_name` changes.
embedding_dim = len(embeddings.embed_query("hello world"))

# Flat index that ranks vectors by L2 (Euclidean) distance.
index = faiss.IndexFlatL2(embedding_dim)

# Empty vector store: documents are kept in an in-memory docstore and the
# index-position -> docstore-id mapping starts out empty.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [12]:
# Add every chunk to the vector store; the cell output is the value returned by
# the call (a list of ID strings, as shown in the output below).
# NOTE(review): re-running this cell presumably inserts the same chunks again - confirm before re-executing.
vector_store.add_documents(documents=split_pages)

['15db9648-967e-4159-89ab-f764af1598ea',
 'aa81f96f-26ad-4f94-aeb8-7e0b100a7d2c',
 '58a02ff3-1d91-434e-b86a-c88c3cc0c4d9',
 '93d30a8f-c89a-4f4d-8458-5f1c5de331e2',
 '21a65fae-cfdb-4f39-9780-8f55770a0da7',
 '8baacd66-59e3-4e5b-8a18-6ecbddc54a18',
 '89e4a575-49d3-435f-8287-4ebe7805d513',
 '9493c4dd-5854-46c3-bf38-b6ed130ff1f5',
 'b7670a57-b447-4090-bc53-3ed242798f0b',
 '1ed53dec-cbae-46ff-85ff-4129193607f5',
 '30db158e-f9d4-4b02-97f9-f2810f68a404',
 '790290c4-6219-4cd1-be7e-e5d28919cfcd',
 'd3f2a326-10cb-461c-92ac-d692f3dfd3d7',
 'cca0e4d6-7430-41af-af8f-4140ce9e25ca',
 'dc39d08c-1e68-4ee0-9572-d93f4c19f7d1',
 '1f71ba54-cdd1-4ae7-95d6-e6892d930042',
 'eb6844dc-fd7b-4a36-b150-4fa9116fc3f9',
 '5e933cb7-fc36-40cc-9e75-d4b78a886816',
 'e5e891bd-b872-4d2e-9c0f-c3e4f0fe23d4',
 '328c3182-6fd4-4737-9753-60142d4f16b6',
 '781e0b05-9b8a-4cd1-a0f1-62a8f2651ef9',
 'c2e255a7-a47c-40d5-be60-582905d9127c',
 '5a66be3d-0790-4ee0-a002-a86f431245dd',
 'a3bc09cf-7811-4d06-81be-430abe3f98e8',
 '175bcae2-6190-

#### create retriever

In [16]:
# Expose the vector store as a retriever, passing k=5 as its search argument.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

In [18]:
retriever.invoke("Langchain")

## NOTE: the retriever is configured with k=5, so it returns 5 documents; if only 3 chunks really match the query, the 2 next-closest chunks are still added to reach 5.

[Document(id='01904229-1b17-4cb4-b380-40e38ddfb227', metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-01-30T20:27:03+00:00', 'title': 'Ultimate Data Science & GenAI Bootcamp', 'moddate': '2025-01-30T20:26:59+00:00', 'keywords': 'DAGdmhcqnYw,BAEmsmap8Lg,0', 'author': 'monal singh', 'containsaigeneratedcontent': 'Yes', 'source': 'syllabus.pdf', 'total_pages': 34, 'page': 31, 'page_label': '32'}, page_content='various use cases.\nIntroduction to Retrieval-Augmented\nGeneration (RAG)\nTopics\nOverview of Retrieval-Augmented\nGeneration (RAG)\nWhat is RAG?, Key Components of a\nRAG System, Why RAG is Important for\nAdvanced AI Systems\nUnderstanding the End-to-End RAG\nPipeline\nOverview of the RAG Workflow, Data\nRetrieval, Contextualization, and\nGeneration Phases, Challenges and\nOpportunities in RAG\nIntegrating LangChain in RAG Introduction to LangChain Framework,\nBuilding End-to-End RAG Pipelines with\nLangChain'),
 Document(id='181d6bb7-a65a-4e57-b173-c4b4de2

### create model

In [19]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model used as the generation step of the RAG chain.
# NOTE(review): presumably needs Google API credentials configured outside this notebook - confirm.
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [20]:
from langchain import hub
# Pull the shared RAG prompt from the hub; its template takes `context` and
# `question` as input variables.
prompt = hub.pull("rlm/rag-prompt")



In [21]:
# Inspect the message templates contained in the pulled prompt.
prompt.messages

[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]

In [22]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [38]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values, in order, separated by a blank line.
        An empty iterable yields an empty string.
    """
    contents = [item.page_content for item in docs]
    return "\n\n".join(contents)

In [30]:
# RAG pipeline, one stage per line:
#   1. build the prompt inputs - retrieved chunks formatted as `context`,
#      the caller's input passed through unchanged as `question`
#   2. fill the prompt template
#   3. call the chat model
#   4. parse the model output into a plain string
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | model
    | StrOutputParser()
)

In [34]:
# Run the full chain on a natural-language question; the result is a plain string.
rag_chain.invoke("tell me about langchain")

'LangChain is a framework used to build end-to-end Retrieval-Augmented Generation (RAG) pipelines.  It integrates with vector databases and LLMs to enhance language generation tasks.  The framework is used in implementing advanced RAG systems for various use cases.'

In [37]:
# Same chain, queried with a bare keyword instead of a full question.
rag_chain.invoke("langchain")

'LangChain is a framework used to build end-to-end Retrieval-Augmented Generation (RAG) pipelines.  It integrates with vector databases and LLMs to enhance language generation tasks.  The provided text details its use in building RAG systems.'