### RAG PIPELINE - DATA INGESTION TO VECTORDB PIPELINE

In [13]:
import os
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [14]:
### Read all PDF files in a directory
def process_pdfs_in_directory(pdf_directory):
    """Process all PDF files in the directory and return text chunks."""
    all_documents = []
    pdf_dir=Path(pdf_directory)

    #Find all PDF files recursively
    pdf_files = list(pdf_dir.rglob("*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process.")

    for pdf_file in pdf_files:
        print(f"\nProcessing file: {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()
            
            #Add source information to metadata
            for doc in documents:
                doc.metadata["source-file"] = pdf_file.name
                doc.metadata['file-tyepe']='pdf'
            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages from {pdf_file.name}.")
        except Exception as e:
            print(f"Error loading {pdf_file.name}: {e}")    
        
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents    
        
#process_pdfs_in_directory
all_pdfs_documents = process_pdfs_in_directory("../data/pdf")        

Found 5 PDF files to process.

Processing file: attention.pdf
Loaded 1 pages from attention.pdf.

Processing file: embeddings.pdf
Loaded 27 pages from embeddings.pdf.

Processing file: objectdetection.pdf
Loaded 21 pages from objectdetection.pdf.

Processing file: proposal.pdf
Loaded 9 pages from proposal.pdf.

Processing file: research_attention.pdf
Loaded 7 pages from research_attention.pdf.

Total documents loaded: 65


In [15]:
all_pdfs_documents

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-12-17T05:34:50+00:00', 'source': '..\\data\\pdf\\attention.pdf', 'file_path': '..\\data\\pdf\\attention.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': '(anonymous)', 'author': '(anonymous)', 'subject': '(unspecified)', 'keywords': '', 'moddate': '2025-12-17T05:34:50+00:00', 'trapped': '', 'modDate': "D:20251217053450+00'00'", 'creationDate': "D:20251217053450+00'00'", 'page': 0, 'source-file': 'attention.pdf', 'file-tyepe': 'pdf'}, page_content='This document explains the attention mechanism used in modern deep learning models. Attention\nallows neural networks to dynamically focus on the most relevant parts of the input data. It\nsignificantly improves performance in tasks such as machine translation, text summarization, and\nquestion answering. Self-attention is the core idea behind Transformer models like BERT and GPT,\nenabling parallel processing an

In [16]:
### TEXT SPLITTING ###
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split into {len(documents)} documents into {len(split_docs)} chunks.")

    #show example of chunks
    if split_docs:
        print("\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs    

    

In [17]:
chunks=split_documents(all_pdfs_documents)
chunks

Split into 65 documents into 351 chunks.

Example chunk:
Content: This document explains the attention mechanism used in modern deep learning models. Attention
allows neural networks to dynamically focus on the most relevant parts of the input data. It
significantly...
Metadata: {'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-12-17T05:34:50+00:00', 'source': '..\\data\\pdf\\attention.pdf', 'file_path': '..\\data\\pdf\\attention.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': '(anonymous)', 'author': '(anonymous)', 'subject': '(unspecified)', 'keywords': '', 'moddate': '2025-12-17T05:34:50+00:00', 'trapped': '', 'modDate': "D:20251217053450+00'00'", 'creationDate': "D:20251217053450+00'00'", 'page': 0, 'source-file': 'attention.pdf', 'file-tyepe': 'pdf'}


[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-12-17T05:34:50+00:00', 'source': '..\\data\\pdf\\attention.pdf', 'file_path': '..\\data\\pdf\\attention.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': '(anonymous)', 'author': '(anonymous)', 'subject': '(unspecified)', 'keywords': '', 'moddate': '2025-12-17T05:34:50+00:00', 'trapped': '', 'modDate': "D:20251217053450+00'00'", 'creationDate': "D:20251217053450+00'00'", 'page': 0, 'source-file': 'attention.pdf', 'file-tyepe': 'pdf'}, page_content='This document explains the attention mechanism used in modern deep learning models. Attention\nallows neural networks to dynamically focus on the most relevant parts of the input data. It\nsignificantly improves performance in tasks such as machine translation, text summarization, and\nquestion answering. Self-attention is the core idea behind Transformer models like BERT and GPT,\nenabling parallel processing an

### embedding and vectorStoreDB


In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
class EmbeddingManager:

    """Handle embedding generation and storage using SentenceTransformers and ChromaDB."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Initialize the embeddingManager.
        Args:
         model_name: huggingface model name for SentenceEmbeddings  .
        """

        self.model_name = model_name
        self.model = None
        self.load_model()
    
    def load_model(self):
        """Load the SentenceTransformer model."""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print("Model loaded successfully. Embedding dimension:", self.model.get_sentence_embedding_dimension())
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise e
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.
        Args:
         texts: List of strings to embed. 
         Returns:
          Numpy array of embeddings with shape (len(texts), embedding_dimension).""" 
        if not self.model:
             raise ValueError("Model not loaded. Call load_model() first.")
        print(f"Generating embeddings for {len(texts)} texts.")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print("Embeddings generated with shape:", embeddings.shape)
        return embeddings
    # def get_embedding_dimension(self) -> int:          ### Auto calculaye either we can make a entire function  getting embeddings of this  ###
    #     """Get the dimension of the embeddings produced by the model.
    #     Returns:
    #      Embedding dimension as an integer."""
    #     if not self.model:
    #         raise ValueError("Model not loaded. Call load_model() first.")
    #     return self.model.get_sentence_embedding_dimension()

## Intialise  the embedding manager
embedding_manager = EmbeddingManager()  # Default model is "all-MiniLM-L6-v2"
embedding_manager


        

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x2056d8d6ba0>

### VECTORDB

In [7]:

import os
class VectorStore:
    """Manages document embeddings and similarity search using ChromaDB."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the VectorStore.

        Args:
         collection_name: Name of the ChromaDB collection.
         persist_directory: Directory to persist ChromaDB data.
         """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """
        Initialize ChromaDB client and collection.
        """
        try:
            # Create persist chromadb client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

           # Create or get collection
            self.collection = self.client.get_or_create_collection(
               name=self.collection_name,
               metadata={"description": "PDF Document Embeddings Collection for RAG"}
            )
            print(f"Vector store initialized with collection: {self.collection_name}")
            print(f"Existing documents in store: {self.collection.count()}")
        
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise e
        
    def add_documents(self, documents: List[Any], embeddings: np):  
        """
        Add documents and their embeddings to the vector store.

        Args:
         documents: List of document objects with 'page_content' and 'metadata'.
         embeddings: embeddings corresponding to the documents.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")  
        
        print(f"Adding {len(documents)} documents to the vector store.")
        
        # Prepare data for chromadb
        ids=[]
        metadatas=[]
        documents_texts=[]
        embedding_texts=[]

        for i,(doc,embedding) in enumerate(zip(documents,embeddings)):
            # Generate a unique ID for each document
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            #prepare metadata
            metadata=dict(doc.metadata)  # Copy existing metadata
            metadata['doc_index']=i
            metadata['content_length']=len(doc.page_content)
            metadatas.append(metadata)

            #Document text
            documents_texts.append(doc.page_content)

            #Embedding 
            embedding_texts.append(embedding.tolist())  # Convert numpy array to list

        #Add to  collection
        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=documents_texts,
                embeddings=embedding_texts
            )
            print(f"Successfully added {len(documents)} documents to the vector store.")
            print(f"Total documents in store after addition: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise e

Vectorstore=VectorStore()
Vectorstore        
           

Vector store initialized with collection: pdf_documents
Existing documents in store: 0


<__main__.VectorStore at 0x20551491e80>

In [18]:
chunks

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-12-17T05:34:50+00:00', 'source': '..\\data\\pdf\\attention.pdf', 'file_path': '..\\data\\pdf\\attention.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': '(anonymous)', 'author': '(anonymous)', 'subject': '(unspecified)', 'keywords': '', 'moddate': '2025-12-17T05:34:50+00:00', 'trapped': '', 'modDate': "D:20251217053450+00'00'", 'creationDate': "D:20251217053450+00'00'", 'page': 0, 'source-file': 'attention.pdf', 'file-tyepe': 'pdf'}, page_content='This document explains the attention mechanism used in modern deep learning models. Attention\nallows neural networks to dynamically focus on the most relevant parts of the input data. It\nsignificantly improves performance in tasks such as machine translation, text summarization, and\nquestion answering. Self-attention is the core idea behind Transformer models like BERT and GPT,\nenabling parallel processing an

In [20]:
### Convert chunks to texts for embedding generation ###
text=[doc.page_content for doc in chunks]
text

['This document explains the attention mechanism used in modern deep learning models. Attention\nallows neural networks to dynamically focus on the most relevant parts of the input data. It\nsignificantly improves performance in tasks such as machine translation, text summarization, and\nquestion answering. Self-attention is the core idea behind Transformer models like BERT and GPT,\nenabling parallel processing and better context understanding.',
 'Speech and Language Processing.\nDaniel Jurafsky & James H. Martin.\nCopyright ¬© 2025.\nAll\nrights reserved.\nDraft of August 24, 2025.\nCHAPTER\n5\nEmbeddings\nËçÉËÄÖÊâÄ‰ª•Âú®È±ºÔºåÂæóÈ±ºËÄåÂøòËçÉNets are for Ô¨Åsh;\nOnce you get the Ô¨Åsh, you can forget the net.\nË®ÄËÄÖÊâÄ‰ª•Âú®ÊÑèÔºåÂæóÊÑèËÄåÂøòË®ÄWords are for meaning;\nOnce you get the meaning, you can forget the words\nÂ∫ÑÂ≠ê(Zhuangzi), Chapter 26\nThe asphalt that Los Angeles is famous for occurs mainly on its freeways. But\nin the middle of the city is another patch of asphalt, t

In [23]:
### Generate embeddings for the chunks and add to vector store ###
embeddings=embedding_manager.generate_embeddings(text)

## Store in vector store ###
Vectorstore.add_documents(chunks,embeddings)

Generating embeddings for 351 texts.


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:26<00:00,  2.42s/it]


Embeddings generated with shape: (351, 384)
Adding 351 documents to the vector store.
Successfully added 351 documents to the vector store.
Total documents in store after addition: 351


In [26]:
class RAGRetriever:

    """Handles query based retrieval from the vector store."""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the RAGRetriever.

        Args:
         vector_store: Evector Store containing document embeddings.
         embedding_manager:Manager for generating query embeddings.
         
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
      

    def retrieve(self, query: str, top_k: int = 5,score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve top_k similar documents for the given query.

        Args:
        query: The input query string.
        top_k: Number of top similar documents to retrieve.
        score_threshold: Minimum similarity score to consider a document relevant.

        Returns:
         List of dictionaries containing retrieved document information.
        """
        print(f"Retrieving top {top_k} documents for query: '{query}'")
        print(f"Top_k: {top_k}, Score Threshold: {score_threshold} ")
        # Generate embedding for the query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # search in the vector store
        try:
            results = self.vector_store.collection.query(
               query_embeddings=[query_embedding.tolist()],
               n_results=top_k
        )

         # Process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents=results['documents'][0]
                metadatas=results['metadatas'][0]
                distances=results['distances'][0]
                ids=results['ids'][0]

                for i,(doc_id,document,metadata,distance) in enumerate(zip(ids,documents,metadatas,distances)):
                    similarity_score=1 - distance  # Convert distance to similarity score(chromadb uses cosine distance)
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "document": document,
                            "metadata": metadata,
                            "similarity_score": similarity_score,
                            "distance": distance,
                            "rank": i+1
                        })
                print(f"Retrieved {len(retrieved_docs)} documents after applying filters.")
            else:
                print("No documents found for the given query.")

            return retrieved_docs    
        except Exception as e:
                print(f"Error during retrieval: {e}")
                return []
        

rag_retriever = RAGRetriever(Vectorstore, embedding_manager) 


In [27]:
rag_retriever.retrieve("What is attention is all you need")

Retrieving top 5 documents for query: 'What is attention is all you need'
Top_k: 5, Score Threshold: 0.0 
Generating embeddings for 1 texts.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 28.64it/s]

Embeddings generated with shape: (1, 384)
Retrieved 5 documents after applying filters.





[{'id': 'doc_aa2e9195_350',
  'document': 'Awareness. \n \nSummary \nThe importance of the role of attention is recognized not only by psychologists \nbut also by academicians and people involved in knowledge management. \nAttention is needed to effectively deal with the huge amount of sensory \ninformation. The more we have sustained attention; it is likely that we could \nsucceed in our life.',
  'metadata': {'keywords': '',
   'moddate': '2018-06-28T14:14:51+05:30',
   'producer': 'Microsoft¬Æ Word 2010',
   'page': 6,
   'creator': 'Microsoft¬Æ Word 2010',
   'source': '..\\data\\pdf\\research_attention.pdf',
   'total_pages': 7,
   'trapped': '',
   'modDate': "D:20180628141451+05'30'",
   'doc_index': 350,
   'subject': '',
   'creationDate': "D:20180628141451+05'30'",
   'creationdate': '2018-06-28T14:14:51+05:30',
   'content_length': 347,
   'file-tyepe': 'pdf',
   'file_path': '..\\data\\pdf\\research_attention.pdf',
   'format': 'PDF 1.5',
   'author': 'Navin',
   'source-fi

In [28]:
rag_retriever.retrieve("CNN based deep feature extraction.")

Retrieving top 5 documents for query: 'CNN based deep feature extraction.'
Top_k: 5, Score Threshold: 0.0 
Generating embeddings for 1 texts.


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  3.13it/s]

Embeddings generated with shape: (1, 384)
Retrieved 3 documents after applying filters.





[{'id': 'doc_cd48f227_131',
  'document': 'loss) via the stochastic gradient descent (SGD) method. The\ntypical VGG16 has totally 13 convolutional (conv) layers, 3\nfully connected layers, 3 max-pooling layers and a softmax\nclassiÔ¨Åcation layer. The conv feature maps are produced by\nconvoluting 3*3 Ô¨Ålter windows, and feature map resolutions\nare reduced with 2 stride max-pooling layers. An arbitrary test\nimage of the same size as training samples can be processed\nwith the trained network. Re-scaling or cropping operations\nmay be needed if different sizes are provided [6].\nThe advantages of CNN against traditional methods can be\nsummarised as follows.\n‚Ä¢ Hierarchical feature representation, which is the multi-\nlevel representations from pixel to high-level semantic fea-\ntures learned by a hierarchical multi-stage structure [15],\n[53], can be learned from data automatically and hidden\nfactors of input data can be disentangled through multi-level\nnonlinear mappings.\n‚Ä¢ 