###Data Ingestioon to vector DB -RAG PIPELINE

In [8]:
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [9]:
##read all the pdfs
def process_all_pdfs(pdf_directory):
    """"Process all the pdfs in the directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)

    #find all pdfs recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            for doc in documents:
                page_num = doc.metadata.get("page", 0) + 1
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf' 
                doc.metadata['page_label'] = f"Page {page_num}"

            all_documents.extend(documents)
            print(f"loaded {len(documents)} pages")

        except Exception as e:
            print(f" Error, failed to load: {e}")

    print(f"total documents loaded: {len(all_documents)}")
    return all_documents
#process all the pdfs
all_pdf_documents = process_all_pdfs("../data")


found 3 PDF files to process

Processing: bheeni resume.pdf
loaded 1 pages

Processing: DAA U-4.pdf
loaded 132 pages

Processing: suyash_res (9) (1).pdf
loaded 1 pages
total documents loaded: 134


In [10]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-12-09T06:06:33+00:00', 'source': '..\\data\\pdf\\bheeni resume.pdf', 'file_path': '..\\data\\pdf\\bheeni resume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-12-09T06:06:33+00:00', 'trapped': '', 'modDate': 'D:20251209060633Z', 'creationDate': 'D:20251209060633Z', 'page': 0, 'source_file': 'bheeni resume.pdf', 'file_type': 'pdf', 'page_label': 'Page 1'}, page_content='Bheeni Agarwal\n\x83 +91-7068082628\n—\n# bheeniagarwal07@gmail.com\n—\nï LinkedIn\n—\n§ GitHub\n—\nLeetCode\n—\nPortfolio\nProfessional Summary\nB.Tech CSE Data Science student with strong foundation in Machine Learning, Statistical Analysis, and Data-driven Problem\nSolving. Proficient in Python, Java, and ML frameworks with hands-on experience building predictive models and data\nvisualization projects. Completed 3+ certified data science pro

In [11]:
###chunking

def split_documents(documents, chunk_size = 300, chunk_overlap = 200):
    """split documents into smaller chunks for better rag performance"""

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators=["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
##show exaple of a chunk
    if split_docs:
        print(f"\nexample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadta: {split_docs[0].metadata}")

    return split_docs

In [12]:
chunks = split_documents(all_pdf_documents)
chunks

Split 134 documents into 59 chunks

example chunk:
Content: Bheeni Agarwal
 +91-7068082628
—
# bheeniagarwal07@gmail.com
—
ï LinkedIn
—
§ GitHub
—
LeetCode
—
Portfolio
Professional Summary
B.Tech CSE Data Science student with strong foundation in Machine Lear...
Metadta: {'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-12-09T06:06:33+00:00', 'source': '..\\data\\pdf\\bheeni resume.pdf', 'file_path': '..\\data\\pdf\\bheeni resume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-12-09T06:06:33+00:00', 'trapped': '', 'modDate': 'D:20251209060633Z', 'creationDate': 'D:20251209060633Z', 'page': 0, 'source_file': 'bheeni resume.pdf', 'file_type': 'pdf', 'page_label': 'Page 1'}


[Document(metadata={'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-12-09T06:06:33+00:00', 'source': '..\\data\\pdf\\bheeni resume.pdf', 'file_path': '..\\data\\pdf\\bheeni resume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-12-09T06:06:33+00:00', 'trapped': '', 'modDate': 'D:20251209060633Z', 'creationDate': 'D:20251209060633Z', 'page': 0, 'source_file': 'bheeni resume.pdf', 'file_type': 'pdf', 'page_label': 'Page 1'}, page_content='Bheeni Agarwal\n\x83 +91-7068082628\n—\n# bheeniagarwal07@gmail.com\n—\nï LinkedIn\n—\n§ GitHub\n—\nLeetCode\n—\nPortfolio\nProfessional Summary\nB.Tech CSE Data Science student with strong foundation in Machine Learning, Statistical Analysis, and Data-driven Problem'),
 Document(metadata={'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-12-09T06:06:33+00:00', 'source': '..\\data\\pdf\\bheeni resume.pdf', 'fi

### embedding and vector store db

In [13]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformers"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        
        Initialize the embedding manager

        Args:
            model_name: it is a beginner friendly model available on hugging face platform, which is basically the github for AI devs

        """

        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the sentence transformer model"""

        try:
            print(f"loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:

        """
        Generate embeddingsfor a list of texts

        Args:
            texts: List of text stringsto embed

        Returns:
        numpy array of embeddings with shape (len(ttexts), embedding_dim)
        """

        if not self.model:
            raise ValueError("model not loaded")
        
        print(f"generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
embedding_manager=EmbeddingManager()
embedding_manager

loading embedding model: all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 329.08it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model loaded successfully. embedding dimension: 384


<__main__.EmbeddingManager at 0x177e99da120>

In [15]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore
    

    

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 236


<__main__.VectorStore at 0x177eb241550>

In [16]:
chunks

[Document(metadata={'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-12-09T06:06:33+00:00', 'source': '..\\data\\pdf\\bheeni resume.pdf', 'file_path': '..\\data\\pdf\\bheeni resume.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-12-09T06:06:33+00:00', 'trapped': '', 'modDate': 'D:20251209060633Z', 'creationDate': 'D:20251209060633Z', 'page': 0, 'source_file': 'bheeni resume.pdf', 'file_type': 'pdf', 'page_label': 'Page 1'}, page_content='Bheeni Agarwal\n\x83 +91-7068082628\n—\n# bheeniagarwal07@gmail.com\n—\nï LinkedIn\n—\n§ GitHub\n—\nLeetCode\n—\nPortfolio\nProfessional Summary\nB.Tech CSE Data Science student with strong foundation in Machine Learning, Statistical Analysis, and Data-driven Problem'),
 Document(metadata={'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-12-09T06:06:33+00:00', 'source': '..\\data\\pdf\\bheeni resume.pdf', 'fi

In [17]:
print(type(vectorstore))

<class '__main__.VectorStore'>


In [18]:
###  convertig text to embeddingss
texts=[doc.page_content for doc in chunks]

## Generate the Embeddings

embeddings=embedding_manager.generate_embeddings(texts)

##store int he vector dtaabase
# 'chunks' should be a list of Document objects
vectorstore.add_documents(chunks, embeddings)

generating embeddings for 59 texts...


Batches: 100%|██████████| 2/2 [00:00<00:00,  2.15it/s]


Generated embeddings with shape: (59, 384)
Adding 59 documents to vector store...
Successfully added 59 documents to vector store
Total documents in collection: 295


### Retriever Pipeline From VectorStore

In [19]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vectorstore: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score , for checking the answer (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)



In [20]:
rag_retriever.retrieve("ssist in organizing technical workshops mock interviews, and resume building sessions for career preparation Facilitate communication between recruiters and students improving placement proce")

Retrieving documents for query: 'ssist in organizing technical workshops mock interviews, and resume building sessions for career preparation Facilitate communication between recruiters and students improving placement proce'
Top K: 5, Score threshold: 0.0
generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 40.47it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_3f75ad75_30',
  'content': '• Assist in organizing technical workshops, mock interviews, and resume building sessions for career preparation\n• Facilitate communication between recruiters and students improving placement process efficiency by 30%\nTeam Footprints, AKGEC\nMay 2025 – Present\nCreative Artist and Designer',
  'metadata': {'creationdate': '2025-12-09T06:06:33+00:00',
   'doc_index': 30,
   'trapped': '',
   'page': 0,
   'page_label': 'Page 1',
   'author': '',
   'title': '',
   'format': 'PDF 1.5',
   'content_length': 288,
   'moddate': '2025-12-09T06:06:33+00:00',
   'creationDate': 'D:20251209060633Z',
   'modDate': 'D:20251209060633Z',
   'source_file': 'bheeni resume.pdf',
   'file_type': 'pdf',
   'creator': 'LaTeX with hyperref',
   'total_pages': 1,
   'subject': '',
   'source': '..\\data\\pdf\\bheeni resume.pdf',
   'file_path': '..\\data\\pdf\\bheeni resume.pdf',
   'producer': 'pdfTeX-1.40.27',
   'keywords': ''},
  'similarity_score': 0.34317195

#RAG PIPELINE- VECTOR DB TO LLM OUTPUT GENERATION

In [21]:
import os
from dotenv import load_dotenv
load_dotenv()

print(os.getenv("GROQ_API_KEY"))

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


In [22]:
# Updated modern imports
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage

In [23]:
class GroqLLM:
    def __init__(self, model_name: str = "gemma2-9b-it", api_key: str =None):
        """
        Initialize Groq LLM
        
        Args:
            model_name: Groq model name (qwen2-72b-instruct, llama3-70b-8192, etc.)
            api_key: Groq API key (or set GROQ_API_KEY environment variable)
        """
        self.model_name = model_name
        self.api_key = api_key or os.environ.get("GROQ_API_KEY")
        
        if not self.api_key:
            raise ValueError("Groq API key is required. Set GROQ_API_KEY environment variable or pass api_key parameter.")
        
        self.llm = ChatGroq(
            groq_api_key=self.api_key,
            model_name=self.model_name,
            temperature=0.1,
            max_tokens=1024
        )
        
        print(f"Initialized Groq LLM with model: {self.model_name}")

    def generate_response(self, query: str, context: str, max_length: int = 500) -> str:
        """
        Generate response using retrieved context
        
        Args:
            query: User question
            context: Retrieved document context
            max_length: Maximum response length
            
        Returns:
            Generated response string
        """
        
        # Create prompt template
        prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""You are a helpful AI assistant. Use the following context to answer the question accurately and concisely.

Context:
{context}

Question: {question}

Answer: Provide a clear and informative answer based on the context above. If the context doesn't contain enough information to answer the question, say so."""
        )
        
        # Format the prompt
        formatted_prompt = prompt_template.format(context=context, question=query)
        
        try:
            # Generate response
            messages = [HumanMessage(content=formatted_prompt)]
            response = self.llm.invoke(messages)
            return response.content
            
        except Exception as e:
            return f"Error generating response: {str(e)}"
        
    def generate_response_simple(self, query: str, context: str) -> str:
        """
        Simple response generation without complex prompting
        
        Args:
            query: User question
            context: Retrieved context
            
        Returns:
            Generated response
        """
        simple_prompt = f"""Based on this context: {context}

Question: {query}

Answer:"""
        
        try:
            messages = [HumanMessage(content=simple_prompt)]
            response = self.llm.invoke(messages)
            return response.content
        except Exception as e:
            return f"Error: {str(e)}"
    


In [24]:
# Initialize Groq LLM (you'll need to set GROQ_API_KEY environment variable)
try:
    groq_llm = GroqLLM(api_key=os.getenv("GROQ_API_KEY"))
    print("Groq LLM initialized successfully!")
except ValueError as e:
    print(f"Warning: {e}")
    print("Please set your GROQ_API_KEY environment variable to use the LLM.")
    groq_llm = None

Initialized Groq LLM with model: gemma2-9b-it
Groq LLM initialized successfully!


In [25]:
### get the context from the retriever and pass it to the LLM

rag_retriever.retrieve("Unified Multi-task Learning Framework")

Retrieving documents for query: 'Unified Multi-task Learning Framework'
Top K: 5, Score threshold: 0.0
generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 21.56it/s]

Generated embeddings with shape: (1, 384)
Retrieved 0 documents (after filtering)





[]

### Integration Vectordb Context pipeline With LLM output

In [28]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize the Groq LLM (set your GROQ_API_KEY in environment)
groq_api_key = os.getenv("GROQ_API_KEY")

llm=ChatGroq(groq_api_key=groq_api_key,model_name="gemma2-9b-it",temperature=0.1,max_tokens=1024)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query,retriever,llm,top_k=3):
    ## retriever the context
    results=retriever.retrieve(query,top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    
    ## generate the answwer using GROQ LLM
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""
    
    response=llm.invoke([prompt.format(context=context,query=query)])
    return response.content

In [34]:
answer=rag_simple("Project",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'Project'
Top K: 3, Score threshold: 0.0
generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 17.18it/s]

Generated embeddings with shape: (1, 384)
Retrieved 0 documents (after filtering)
No relevant context found to answer the question.





### Enhanced RAG Pipeline Features

In [35]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("Hard Negative Mining Technqiues", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'Hard Negative Mining Technqiues'
Top K: 3, Score threshold: 0.1
generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.85it/s]

Generated embeddings with shape: (1, 384)
Retrieved 0 documents (after filtering)
Answer: No relevant context found.
Sources: []
Confidence: 0.0
Context Preview: 



