### Load all the PDF files from a directory into document structure
##### Two approaches: 1) Manually iterate through files 2) Use DirectoryLoader

In [3]:
import os
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader

In [None]:
def process_all_pdfs(directory_path):

    pdf_documents = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(directory_path, filename)
            loader = PyMuPDFLoader(file_path)
            docs = loader.load()
            for each_doc in docs:
                each_doc.metadata['file_type'] = 'pdf'
            pdf_documents.extend(docs)
    
    return pdf_documents

def process_all_pdfs_directory_loader(directory_path):

    pdf_directory_loader = DirectoryLoader(
        directory_path,
        glob="**/*.pdf",
        loader_cls=PyMuPDFLoader
    )
    pdf_documents = pdf_directory_loader.load()

    for doc in pdf_documents:
        doc.metadata['file_type'] = 'pdf'

    return pdf_documents

In [None]:
# Process all PDFs in the specified directory in a iterative manner
# all_pdf_documents = process_all_pdfs("../data/pdf_files/")
# Process all PDFs in the specified directory using DirectoryLoader
all_pdf_documents = process_all_pdfs_directory_loader("../data/pdf_files/")
print(all_pdf_documents)

### Text Splitter to perform chunking on the loaded documents

In [None]:
from langchain_text_splitters  import RecursiveCharacterTextSplitter

def text_splitter(documents, chunk_size=1000, chunk_overlap=200):

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Total document loaders: {len(documents)} and Number of chunks after splitting: {len(split_docs)}")

    return split_docs

In [None]:
text_splitter_documents = text_splitter(all_pdf_documents)
print(text_splitter_documents)

### Embedding and VectorDB Storage

In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np
import chromadb
import uuid

In [8]:
class EmbeddingManager:
    """
    Creates embeddings for given documents using sentence_transformer with a specified embedding model.
    """
    def __init__(self, embedding_model_name="all-miniLM-L6-v2"):
        self.embedding_model_name = embedding_model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Loads the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.embedding_model_name}")
            self.model = SentenceTransformer(self.embedding_model_name)
            print("Model loaded successfully. Embedding dimensions:", self.model.get_sentence_embedding_dimension())
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def generate_embeddings(self, page_content_text):

        print(f"Generating embeddings for {len(page_content_text)} documents...")
        embeddings = self.model.encode(page_content_text, show_progress_bar=True)
        print(f"Embeddings generated successfully with shape {embeddings.shape}")
        
        return embeddings
    
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-miniLM-L6-v2
Model loaded successfully. Embedding dimensions: 384


<__main__.EmbeddingManager at 0x27895f64980>

In [6]:
class VectorStoreManager:
    """
    Manages the creation and storage of vector embeddings using FAISS.
    """
    def __init__(self, collection_name="pdf_documents", persist_directory="../data/vector_storage"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.db_client = None
        self._initialize_storage()

    def _initialize_storage(self):
        try:
            # Create a persistent directory for vector storage
            os.makedirs(self.persist_directory, exist_ok=True)
            # Initialize ChromaDB client
            self.db_client = chromadb.PersistentClient(path=self.persist_directory)
            # Create or get the collection
            self.collection = self.db_client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "Collection of PDF document embeddings for RAG application"}
            )
            print(f"Vector storage initialized at: {self.persist_directory}")
            print(f"Existing documents in the collection name {self.collection_name} is {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing storage directory: {e}")
            raise

    def add_documents_to_collection(self, documents, embeddings):
        """Adds documents and their embeddings to the vector store collection."""
        if len(documents) != len(embeddings):
            raise ValueError("The number of documents must match the number of embeddings.")
        
        print(f"Adding {len(documents)} documents to the vector store collection...")

        # Prepare data for insertion to the vector store collection
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for index, (doc, embedding) in enumerate(zip(documents, embeddings)):

            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{index}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)  # Copy existing metadata
            metadata['doc_index'] = index
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=documents_text,
                embeddings=embeddings_list
            )
            print(f"Successfully added {len(documents)} documents to the vector store collection {self.collection_name}.")
            print(f"Total documents in the collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise
vector_store_manager = VectorStoreManager()
vector_store_manager

Vector storage initialized at: ../data/vector_storage
Existing documents in the collection name pdf_documents is 33


<__main__.VectorStoreManager at 0x278957497f0>

In [None]:
# Generate embeddings for the split documents
page_content_text = [doc.page_content for doc in text_splitter_documents]
embeddings_data = embedding_manager.generate_embeddings(page_content_text)

# Store the embeddings to vector store
vector_store_manager.add_documents_to_collection(
    documents=text_splitter_documents,
    embeddings=embeddings_data
)

### Retriever Pipeline from Vector Storage DB

In [9]:
class RAGRetriever:
    """
    Handles query based retrieval frpm the vector store for RAG applications.
    """
    def __init__(self, vector_store_manager, embedding_manager):
        self.vector_store_manager = vector_store_manager
        self.embedding_manager = embedding_manager

    def retrieve(self, query, top_k=5, score_threshold=0.0):

        # Generate the embedding for the query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in Vector storage with the query embedding
        vector_results = self.vector_store_manager.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k
        )

        # Process the vector result
        processed_results = []

        if vector_results['documents'] and vector_results['documents'][0]:
            print(f"Top {top_k} retrieved documents for the query '{query}':")
            documents = vector_results['documents'][0]
            metadatas = vector_results['metadatas'][0]
            distances = vector_results['distances'][0]
            ids = vector_results['ids'][0]

            for index, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                # Convert distance to similarity score (ChromaDB uses cosine distance, lower is better)
                similarity_score = 1 - distance

                if similarity_score >= score_threshold:
                    processed_results.append({
                        'id': doc_id,
                        'content': document,
                        'metadata': metadata,
                        'similarity_score': similarity_score,
                        'distance': distance,
                        'rank': index + 1
                    })
            print(f"Retrieved {len(processed_results)} results after applying score threshold of {score_threshold}.")
        else:
            print(f"No documents retrieved for the query '{query}'.")

        return processed_results
    
rag_retriever = RAGRetriever(vector_store_manager, embedding_manager)
query = "Concerns about AI"
retrieved_results = rag_retriever.retrieve(query)
retrieved_results

Generating embeddings for 1 documents...


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.13it/s]

Embeddings generated successfully with shape (1, 384)
Top 5 retrieved documents for the query 'Concerns about AI':
Retrieved 5 results after applying score threshold of 0.0.





[{'id': 'doc_17ddeb75_11',
  'content': 'Intelligence is also viewed as a great tool for better cybersecurity. Many banks are\nusing AI as a means to identify unauthorized credit cards uses. From analyzing\ncomplex genetic data to perform the most delicate surgeries at the highest precision\nis also being worked on to integrate with AI. We all know about companies like\nTesla and Apple working to make flawless self-driving cars which is going to have\ngame changing impacts on the future of transportation.\nConcerns about AI\nOne of the most immediate concerns about Artificial Intelligence is the fear of\nlosing jobs. Artificial Intelligence enhancing automation is also causing huge job\nlosses around the world. According to a Forbes article, it is predicted that by 2025\nautomation will cause a loss of 85 million jobs. [6] Bigger fears regarding AI\nincludes the scenario whereas machines become smarter and smarter they going to\nend up being as opinionated and biased like some of the p

### Integration VectorDB Context pipeline with LLM output 

In [None]:
# Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq

groq_api_key = os.getenv("GROQ_API_KEY")

llm = ChatGroq(api_key=groq_api_key, model="openai/gpt-oss-20b", temperature=0.1, max_tokens=1024)

def simple_rag(query, retriever, llm, top_k=5):

    # Retrieve relevant documents
    retrieved_docs = retriever.retrieve(query, top_k=top_k)

    # Prepare context for the LLM
    # context = "\n\n".join([f"Document {doc['rank']} (Score: {doc['similarity_score']:.4f}):\n{doc['content']}" for doc in retrieved_docs])
    context = "\n\n".join([f"{doc['content']}" for doc in retrieved_docs]) if retrieved_docs else ""
    if not context:
        return "No relevant documents found to answer the query." 

    # Create the prompt
    prompt = f"Using the following context, answer the question in pointer wise:\n\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"

    # Get response from LLM
    response = llm.invoke([prompt.format(context=context, query=query)])

    return response

In [15]:
query = "What are the key concerns about AI mentioned in the documents?"
rag_response = simple_rag(query, rag_retriever, llm, top_k=5)
print(rag_response)

Generating embeddings for 1 documents...


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.36it/s]


Embeddings generated successfully with shape (1, 384)
Top 5 retrieved documents for the query 'What are the key concerns about AI mentioned in the documents?':
Retrieved 5 results after applying score threshold of 0.0.
content='**Key concerns about AI highlighted in the documents**\n\n- **Job displacement**  \n  - Automation is projected to eliminate up to 85\u202fmillion jobs by 2025.  \n  - Fear that AI will replace human workers across many sectors.\n\n- **Bias and opinion‑laden decision‑making**  \n  - AI systems can inherit and amplify the biases present in their training data.  \n  - Machines may become “opinionated” and unfair, mirroring the prejudices of their creators.\n\n- **Weaponization and misuse**  \n  - Autonomous weapons could target individuals based on pre‑programmed instructions.  \n  - Potential for governments, criminal groups, or rogue AI to deploy lethal autonomous systems.\n\n- **Myths and misinformation**  \n  - Over‑exaggerated fears of AI becoming a super‑int

### Enhanced RAG Pipeline Features

In [None]:
def rag_advanced(query, retriever, llm, top_k=5, score_threshold=0.2, return_context=True):
    """
    RAG Pipeline with extra features:
    - Returns answer, sources, confidence score
    """

    # Retrieve relevant documents
    retrieved_docs = retriever.retrieve(query, top_k=top_k, score_threshold=score_threshold)

    if not retrieved_docs:
        return "No relevant documents found to answer the query."

    # Prepare context for the LLM
    context = "\n\n".join([f"{doc['content']}" for doc in retrieved_docs]) if retrieved_docs else ""

    sources = [
        {
            'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
            'page': doc['metadata'].get('page', 'unknown'),
            'similarity_score': doc['similarity_score']
        }
        for doc in retrieved_docs
    ]
    confidence_scores = max([doc['similarity_score'] for doc in retrieved_docs])

    # Create the prompt
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""

    # Get response from LLM
    response = llm.invoke([prompt.format(context=context, query=query)])
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence_score': confidence_scores
    }
    if return_context:
        output['context'] = context

    return output

query = "What are the key concerns about AI mentioned in the documents?"
result = rag_advanced(query, rag_retriever, llm, top_k=5, score_threshold=0.2, return_context=True)
print('Answer:', result['answer'])
print('Sources:', result['sources'])
print('Confidence Score:', result['confidence_score'])


Generating embeddings for 1 documents...


Batches: 100%|██████████| 1/1 [00:00<00:00,  7.86it/s]

Embeddings generated successfully with shape (1, 384)
Top 5 retrieved documents for the query 'What are the key concerns about AI mentioned in the documents?':
Retrieved 1 results after applying score threshold of 0.2.





Answer: Key concerns about AI mentioned are:  
1. **Job displacement** – automation could eliminate millions of jobs (e.g., 85 million by 2025).  
2. **Bias and opinionation** – as AI systems learn from human data, they risk becoming biased or opinionated in ways that mirror their trainers.
Sources: [{'source': '..\\data\\pdf_files\\A_Brief_Introduction_To_AI.pdf', 'page': 5, 'similarity_score': 0.20004886388778687, 'metadata': {'trapped': '', 'file_type': 'pdf', 'doc_index': 11, 'keywords': '', 'creationDate': "D:20210505130455+01'00'", 'source': '..\\data\\pdf_files\\A_Brief_Introduction_To_AI.pdf', 'total_pages': 9, 'file_path': '..\\data\\pdf_files\\A_Brief_Introduction_To_AI.pdf', 'format': 'PDF 1.4', 'creator': '', 'creationdate': '2021-05-05T13:04:55+01:00', 'producer': 'Skia/PDF m92 Google Docs Renderer', 'title': 'A Brief Introduction to Artificial Intelligence', 'page': 5, 'modDate': "D:20210505130455+01'00'", 'author': '', 'moddate': '2021-05-05T13:04:55+01:00', 'content_len