## Data Loading

In [20]:
import os
from langchain.document_loaders import TextLoader, PyMuPDFLoader, CSVLoader, BSHTMLLoader,UnstructuredXMLLoader, PythonLoader

def process_all_docs(data_directory):
    """Load every supported file under ``data_directory`` into documents.

    The directory tree is walked recursively. A file is handled by the loader
    class registered for its lower-cased extension; files with any other
    extension are recorded as skipped. A loader failure is recorded and
    printed, and ingestion continues with the next file.

    Args:
        data_directory: Root folder to scan.

    Returns:
        A flat list with the documents returned by every successful
        ``loader.load()`` call, in traversal order.
    """
    loaders = {
        ".txt": TextLoader,
        ".pdf": PyMuPDFLoader,
        ".csv": CSVLoader,
        ".html": BSHTMLLoader,
        ".xml": UnstructuredXMLLoader,
        ".py": PythonLoader,
    }
    summary = []        # one (filename, ext, status, doc_count) tuple per file seen
    all_documents = []

    # os.walk yields nothing for a missing path, which would otherwise look
    # exactly like an empty data folder. Say so explicitly.
    if not os.path.isdir(data_directory):
        print(f"✗ Data directory not found: {data_directory}")

    # Walk directory recursively to handle nested folders
    for root, dirnames, files in os.walk(data_directory):
        # Sorting in place fixes the order in which os.walk descends, and
        # sorting the files fixes the order within a folder, so the resulting
        # document order is the same on every run and every filesystem.
        dirnames.sort()
        for filename in sorted(files):
            ext = os.path.splitext(filename)[1].lower()
            loader_cls = loaders.get(ext)
            if not loader_cls:
                summary.append((filename, ext, "SKIPPED (not supported)", 0))
                continue
            file_path = os.path.join(root, filename)
            try:
                loader = loader_cls(file_path)
                docs = loader.load()
                all_documents.extend(docs)
                summary.append((filename, ext, "OK", len(docs)))
                print(f"✓ {filename}: {len(docs)} docs")
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole run.
                summary.append((filename, ext, f"ERROR ({e})", 0))
                print(f"✗ {filename}: {e}")

    # Print summary table
    print("\n--- Ingestion Summary ---")
    print(f"{'File':50} {'Ext':5} {'Status':25} {'Docs'}")
    for s in summary:
        print(f"{s[0]:50} {s[1]:5} {s[2]:25} {s[3]}")
    print(f"\nTotal loaded documents: {len(all_documents)}")
    return all_documents


In [21]:
# Chunking 
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Break documents into overlapping chunks for retrieval.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Tried in order: paragraph, line, word, then single characters.
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = chunker.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not pieces:
        return pieces

    # Preview the first chunk as a quick sanity check.
    sample = pieces[0]
    print("\nExample chunk:")
    print(f"Content: {sample.page_content[:200]}...")
    print(f"Metadata: {sample.metadata}")
    return pieces

In [22]:
# Ingest every supported file under the data folder, then split the loaded
# documents into chunks using the splitter's default size and overlap.
all_documents = process_all_docs("../data/all_files")
chunks=split_documents(all_documents)

✓ Pride and Prejudice Author Jane Austen.pdf: 516 docs

--- Ingestion Summary ---
File                                               Ext   Status                    Docs
Pride and Prejudice Author Jane Austen.pdf         .pdf  OK                        516

Total loaded documents: 516
Split 516 documents into 989 chunks

Example chunk:
Content: PRIDE AND 
PREJUDICE
Jane Austen
InfoBooks.org...
Metadata: {'producer': '3-Heights™ PDF Optimization Shell 6.3.1.5 (http://www.pdf-tools.com)', 'creator': 'Adobe Acrobat Pro DC 20.9.20063', 'creationdate': '2022-07-01T09:23:19-04:00', 'source': '../data/all_files\\Pride and Prejudice Author Jane Austen.pdf', 'file_path': '../data/all_files\\Pride and Prejudice Author Jane Austen.pdf', 'total_pages': 516, 'format': 'PDF 1.7', 'title': 'pride and prejudice', 'author': 'jane austen', 'subject': '', 'keywords': 'pride and prejudice by jane austen, pride, prejudice, jane austen, pride and prejudice, jane', 'moddate': '2024-03-13T17:08:16-03:00', 'tr

## Embedding Data into VectorDB

In [23]:
# Embedding and VectorDB
import hashlib
import os
import uuid
from typing import List,Dict,Any,Tuple

import chromadb 
import numpy as np
from chromadb.config import Settings
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

### EmbeddingManager

In [24]:
class EmbeddingManager:
    """Wraps a SentenceTransformer model used to embed text."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Remember the model name and load the model immediately.

        Args:
            model_name: Name handed to ``SentenceTransformer``.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the model; print and re-raise any failure."""
        try:
            print(f"Loading Embedding Model:{self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            dimension = self.model.get_sentence_embedding_dimension()
            print(f"Model Loaded Succesfully , Embedding Dimensions :{dimension}")
        except Exception as e:
            print(f"Error:{e}")
            raise

    def generate_embeddings(self, texts: list) -> np.ndarray: 
        """Encode a list of texts with the loaded model.

        Args:
            texts: The strings to embed.

        Returns:
            Whatever ``model.encode`` returns for ``texts``.

        Raises:
            ValueError: If no model is loaded.
        """
        if self.model is None:
            raise ValueError("ModelNotLoaded")

        print(f"Generating Embeddings for {len(texts)} text(s)...")
        vectors = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated Embedding Model with shape {vectors.shape}")
        return vectors

# Instantiate the embedding manager with its default model ("all-MiniLM-L6-v2");
# the constructor loads the model immediately.
embedding_manager = EmbeddingManager()
# Bare expression: echoes the instance as the cell output.
embedding_manager

Loading Embedding Model:all-MiniLM-L6-v2
Model Loaded Succesfully , Embedding Dimensions :384


<__main__.EmbeddingManager at 0x298805c9160>

### Vector Store

In [25]:
class VectorStore:
    """Manages document embeddings in a persistent ChromaDB collection."""

    def __init__(self,collection_name: str = "pdf_documents",persist_directory: str = "../data/vector_store"):
        """Open (or create) the collection stored under ``persist_directory``.

        Args:
            collection_name: Name of the ChromaDB collection.
            persist_directory: Folder for the on-disk store; created if missing.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
    
    def _initialize_store(self):
        """Initialize the ChromaDB client and collection; re-raise on failure."""
        try:
            # Create Persistent Directory
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            # Get or create collection
            print("Initializing ChromaDB Client...")
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"PDF Document Collection"}
            )
            print(f"Collection '{self.collection_name}' is ready.")
        except Exception as e:
            print(f"Error initializing ChromaDB: {e}")
            raise

    @staticmethod
    def _content_id(page_content: str, metadata: dict) -> str:
        """Derive a stable id from a chunk's text and its source metadata."""
        fingerprint = hashlib.sha256()
        ordered_items = sorted(metadata.items(), key=lambda item: str(item[0]))
        fingerprint.update(repr(ordered_items).encode("utf-8", errors="replace"))
        fingerprint.update(b"\x00")  # separator between metadata and text
        fingerprint.update(page_content.encode("utf-8", errors="replace"))
        return f"doc_{fingerprint.hexdigest()[:32]}"

    def add_documents(self, documents: "List[Document]", embeddings: np.ndarray):
        """Store documents together with their precomputed embeddings.

        Ids are derived from each document's content and metadata and the rows
        are written with ``upsert``. Storing the same documents again therefore
        overwrites the existing rows instead of appending duplicates, which is
        what random ids combined with ``add`` did on every re-run.

        Args:
            documents: Documents to store; ``page_content`` and ``metadata``
                are read from each one.
            embeddings: One embedding per document, in the same order.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error raised by the collection is printed and
                re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match the number of embeddings")

        # Nothing to write; avoids calling the collection with empty lists.
        if len(documents) == 0:
            print("No documents to add to vector store")
            return

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        seen_ids = {}  # base id -> how often it occurred in this batch

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Deterministic id. Identical chunks inside one batch get an
            # occurrence suffix so ids stay unique within the call.
            base_id = self._content_id(doc.page_content, doc.metadata)
            occurrence = seen_ids.get(base_id, 0)
            seen_ids[base_id] = occurrence + 1
            ids.append(base_id if occurrence == 0 else f"{base_id}_{occurrence}")

            # Prepare metadata (copied so the caller's document is not mutated)
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(np.asarray(embedding).tolist())

        # Insert new rows, overwrite rows whose id already exists
        try:
            self.collection.upsert(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

# Open (or create) the default "pdf_documents" collection; the bare name on
# the last line echoes the instance as the cell output.
vectorstore=VectorStore()
vectorstore

Initializing ChromaDB Client...
Collection 'pdf_documents' is ready.


<__main__.VectorStore at 0x298805c9550>

In [26]:
# Collect the raw text of every chunk
texts=[doc.page_content for doc in chunks]

# Generate the embeddings

embeddings=embedding_manager.generate_embeddings(texts)

# Store the chunks and their embeddings in the vector database
vectorstore.add_documents(chunks,embeddings)

Generating Embeddings for 989 text(s)...


Batches: 100%|██████████| 31/31 [00:21<00:00,  1.44it/s]


Generated Embedding Model with shape (989, 384)
Successfully added 989 documents to vector store
Total documents in collection: 11868


## Retriever pipeline from Vector Store

In [27]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store."""

    def __init__(self, vector_store: "VectorStore", embedding_manager: "EmbeddingManager"):
        """Initialize the retriever.

        Args:
            vector_store: Vector store containing the document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    @staticmethod
    def _cosine_similarity(query_vector, doc_vector) -> float:
        """Cosine similarity of two vectors; 0.0 if either has zero length."""
        q = np.asarray(query_vector, dtype=float)
        d = np.asarray(doc_vector, dtype=float)
        denominator = np.linalg.norm(q) * np.linalg.norm(d)
        if denominator == 0.0:
            return 0.0
        return float(np.dot(q, d) / denominator)

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query

        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum cosine similarity a result must reach

        Returns:
            List of dictionaries with the keys ``id``, ``content``,
            ``metadata``, ``similarity_score``, ``distance`` (the raw distance
            reported by the store) and ``rank`` (1-based position in the
            store's result order). Returns an empty list if the store query
            fails.
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[np.asarray(query_embedding).tolist()],
                n_results=top_k,
                # Ask for the stored vectors too, so the similarity can be
                # computed here instead of being guessed from the distance.
                include=["documents", "metadatas", "distances", "embeddings"],
            )

            # Process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                stored = results.get('embeddings')
                doc_vectors = stored[0] if stored is not None and len(stored) > 0 else None
                if doc_vectors is None or len(doc_vectors) != len(ids):
                    # Store did not return usable vectors: use the distance fallback below.
                    doc_vectors = [None] * len(ids)

                rows = zip(ids, documents, metadatas, distances, doc_vectors)
                for i, (doc_id, document, metadata, distance, doc_vector) in enumerate(rows):
                    if doc_vector is not None:
                        # "1 - distance" is a cosine similarity only when the
                        # index uses cosine distance, and the collection is not
                        # configured for that. Computing it from the vectors
                        # gives a correct score under any index metric.
                        similarity_score = self._cosine_similarity(query_embedding, doc_vector)
                    else:
                        similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")

            return retrieved_docs

        except Exception as e:
            # Best effort: report the failure and hand back no results.
            print(f"Error during retrieval: {e}")
            return []

# Wire the retriever to the vector store and embedding manager created above.
rag_retriever=RAGRetriever(vectorstore,embedding_manager)



In [28]:
rag_retriever

<__main__.RAGRetriever at 0x298ca6ea7b0>

In [29]:
# rag_retriever.retrieve("what is intra-task forgetting")

## LLM with RAG integration

In [30]:
## Simple RAG LLM 
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

# Read the key from the environment (populated by load_dotenv above) and fail
# fast with a readable message instead of handing None to the client.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to the environment or to a .env file "
        "before creating the LLM client."
    )

# Low temperature keeps answers close to the retrieved text; max_tokens caps the reply length.
llm = ChatGroq(groq_api_key=groq_api_key,model_name="gemma2-9b-it",temperature=0.1,max_tokens=1024)

In [31]:
## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query,retriever,llm,top_k=3):
    """Answer ``query`` from retrieved context using ``llm``.

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query, top_k=...)`` returns a list
            of dicts, each with a ``'content'`` entry.
        llm: Object whose ``invoke(...)`` returns a response with ``.content``.
        top_k: Number of results requested from the retriever.

    Returns:
        The LLM's answer text, or a fixed message when no context was found.
    """
    # Fetch supporting passages and stitch them into one context string
    hits = retriever.retrieve(query,top_k=top_k)
    passages = [hit['content'] for hit in hits] if hits else []
    context = "\n\n".join(passages)

    # Without context there is nothing to ground an answer on
    if not context:
        return "No relevant context found to answer the question."

    # Instruction template; {context} and {query} are filled in below
    template = """Carefully read the provided context in {context} and answer the question: {query}

    If the answer is found, quote the relevant text directly and reference its location (chapter, page, or section if given).

    If helpful, include 1–2 surrounding sentences.

    If the answer must be combined from different parts, use only context quotes.

    """

    filled_prompt = template.format(context=context,query=query)
    reply = llm.invoke([filled_prompt])
    return reply.content

In [32]:
# Ask a sample question end to end: retrieve context, then generate the answer.
answer=rag_simple("Who is Mrs. Long?",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'Who is Mrs. Long?'
Top K: 3, Score threshold: 0.0
Generating Embeddings for 1 text(s)...


Batches: 100%|██████████| 1/1 [00:00<00:00, 16.95it/s]

Generated Embedding Model with shape (1, 384)
Retrieved 3 documents (after filtering)





Mrs. Long is a friend of Mrs. Bennet who is described as "as good a creature as ever lived." 

Here's the relevant text:

"I do think Mrs. Long is as good a creature as ever lived—and her nieces are very pretty behaved girls, and not at all handsome: I like them prodigiously.” 



