### Tools for building the RAG App

Now that we are familiar with the overall architecture, we can now go ahead and structure the tools that we'll use for the upcoming demonstration:

- OpenAI LLM (model - GPT 4o-mini): This will be our primary model for generating the responses
- LangChain: Langchain is a powerful framework for orchestrating different layers in the RAG app. We shall use this to build the retriever end-to-end and also connect with other tools for tasks such as
    - Chunking - RecursiveCharacterTextSplitter
    - Embedding Model - Openapi
    - Vector Search Model - FAISS



In [None]:
from langchain_community.document_loaders  import PyPDFLoader, PyMuPDFLoader , DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import os
from pathlib import Path
from dotenv import load_dotenv

# Importing ChatOpenAI from LangChain to interact with OpenAI's language models,
# such as GPT, for generating responses.
from langchain_openai import ChatOpenAI

# Importing ChatPromptTemplate to create structured prompts for the chatbot,
# ensuring consistent interactions with the AI model.
from langchain_core.prompts import ChatPromptTemplate

# Importing OpenAIEmbeddings to convert text data into numerical vector
# representations for similarity search and retrieval.
from langchain_openai import OpenAIEmbeddings

# Importing create_stuff_documents_chain to combine and process retrieved
# documents for meaningful AI-generated responses.
from langchain_classic.chains.combine_documents import create_stuff_documents_chain

# Importing create_retrieval_chain to build a chain that retrieves relevant
# documents from a vector store and generates AI responses.
from langchain_classic.chains import create_retrieval_chain
from langchain_core.output_parsers import StrOutputParser

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


# -------------------------------
# 1. Load PDF
# -------------------------------

In [2]:

def process_pdf_documents(pdf_directory):
    all_documents = []
    pdf_directory = Path(pdf_directory)
    pdf_files = list(pdf_directory.glob('**/*.pdf'))
    print(f"Found {len(pdf_files)} PDF files in the directory.")
    
    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file}")
        loader = PyMuPDFLoader(str(pdf_file))
        documents = loader.load()
        all_documents.extend(documents)
        print(f"\nTotal document loaded so far: {len(all_documents)}\n")
    return all_documents

#Process the PDF documents in the specified directory
processed_documents = process_pdf_documents("../data")


Found 0 PDF files in the directory.


# -------------------------------
# 2. Split documents into chunks
# -------------------------------

In [3]:
def split_documents_into_chunks(documents, chunk_size=1000, chunk_overlap=500):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_documents = text_splitter.split_documents(documents)
    print(f"Split..... {len(documents)} documents into.... {len(split_documents)} chunks")

    if split_documents:
        print(f"Sample chunk (first 500 characters):\n{split_documents[0].page_content[:500]}")
        print(f"\nLength of the sample chunk: {len(split_documents[0].metadata)} characters")
    return split_documents

In [4]:
chunks = split_documents_into_chunks(processed_documents)   

Split..... 0 documents into.... 0 chunks


# -------------------------------
# 3. Create embeddings
# -------------------------------

In [5]:
load_dotenv()  # loads .env into environment
assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY not found"
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)
print(embeddings.embed_query("hello world")[:5])

[-0.00676333112642169, -0.03919631987810135, 0.034175805747509, 0.02876211516559124, -0.02478501945734024]


# -------------------------------
# 4. Create or load FAISS index
# -------------------------------

In [None]:
print(f"Number of document chunks: {len(chunks)}")
print(f"Sample chunk text: {chunks[0].page_content[:200]}")
faiss_index_path = "faiss_index"

try:
    vectorstore = FAISS.load_local(faiss_index_path, embeddings)
except:
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(faiss_index_path)

IndexError: list index out of range

# -------------------------------
# 5. Create retriever
# -------------------------------

In [None]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# -------------------------------
# 6. Create RAG chain
# -------------------------------

In [None]:
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0
)

In [None]:

prompt = ChatPromptTemplate.from_template(
    """You are a helpful assistant. Summarize the following document content into a concise, readable summary:
Document Content:
{context}
Summary:
"""
)

# Step 1: create a map-reduce style chain for combining multiple docs
doc_chain = create_stuff_documents_chain(llm, prompt=prompt, output_parser=StrOutputParser())

# Step 2: create the retrieval chain with your retriever
qa_chain = create_retrieval_chain(retriever=retriever, combine_docs_chain=doc_chain)





# -------------------------------
# 7. Ask a question
# -------------------------------

In [None]:
query = "Summarize the document"
response = qa_chain.invoke({"input": query})
print("Summary:", response['answer'])
