### Data Ingestion

#### Document Structure

In [62]:
from langchain_core.documents import Document

In [63]:
doc = Document(
    page_content="this is the main text content I am using to create RAG",
    metadata = {
        "source": "example.txt",
        "pages": 1,
        "author": "Sushmita",
        "date_created": "2026-02-16"
    }
)

In [64]:
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Sushmita', 'date_created': '2026-02-16'}, page_content='this is the main text content I am using to create RAG')

#### Create a simple txt file

In [65]:
import os
os.makedirs("../data/text_files", exist_ok=True)

In [66]:
sample_texts={
   "../data/text_files/python.txt" : '''Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.

It is one of the most popular programming languages in the world and is widely used in:

Web development

Data science

Artificial intelligence

Automation

Cybersecurity

Software development

Game development'''
}

for filepath, content in sample_texts.items():
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(content)
        
print("Sample file created")

Sample file created


### TextLoader

In [67]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/python.txt", encoding="utf-8")
document = loader.load()

In [68]:
print(document)

[Document(metadata={'source': '../data/text_files/python.txt'}, page_content='Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.\n\nIt is one of the most popular programming languages in the world and is widely used in:\n\nWeb development\n\nData science\n\nArtificial intelligence\n\nAutomation\n\nCybersecurity\n\nSoftware development\n\nGame development')]


### Directory Loader

In [69]:
from langchain_community.document_loaders import DirectoryLoader

# load all the text files from the directory

dir_loader = DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt", # Pattern to match the files
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=False
    
)

documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\python.txt'}, page_content='Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.\n\nIt is one of the most popular programming languages in the world and is widely used in:\n\nWeb development\n\nData science\n\nArtificial intelligence\n\nAutomation\n\nCybersecurity\n\nSoftware development\n\nGame development')]

In [70]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

dir_loader = DirectoryLoader(
    "../data/pdf_files",
    glob = "**/*.pdf",
    loader_cls = PyMuPDFLoader,
    show_progress = False
)

pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Skia/PDF m127', 'creator': 'Chromium', 'creationdate': '2026-02-16T10:46:27+00:00', 'source': '..\\data\\pdf_files\\PMS Symptoms Guide.pdf', 'file_path': '..\\data\\pdf_files\\PMS Symptoms Guide.pdf', 'total_pages': 7, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2026-02-16T10:46:27+00:00', 'trapped': '', 'modDate': "D:20260216104627+00'00'", 'creationDate': "D:20260216104627+00'00'", 'page': 0}, page_content='Premenstrual Syndrome (PMS) is a combination of physical,\nemotional, and behavioral symptoms that many women experience\nduring the luteal phase of their menstrual cycle, typically occurring\none to two weeks before menstruation begins[1]. It is estimated that\nas many as 3 out of every 4 menstruating women have experienced\nsome form of PMS[1]. The worldwide prevalence of PMS among\nwomen of reproductive age is approximately 47.8%, with about 20%\nexperiencing symptoms severe enough to disrupt daily 

In [71]:
type(pdf_documents[0])

langchain_core.documents.base.Document

In [72]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader

### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 2 PDF files to process

Processing: PMS Symptoms Guide.pdf
  ✓ Loaded 7 pages

Processing: SushmitaMalakar_CV.pdf
  ✓ Loaded 2 pages

Total documents loaded: 9


### Text splitting into chunks

In [91]:

### Text splitting get into chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter


def split_documents(documents,chunk_size=500,chunk_overlap=100):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [92]:
chunks = split_documents(all_pdf_documents)
chunks

Split 9 documents into 31 chunks

Example chunk:
Content: Prem enstrual Syndrom e (PM S) is a com bination of physical,
em otional, and behavioral sym ptom s that m any wom en experience
during the luteal phase of their m enstrual cycle, typically occurring
...
Metadata: {'producer': 'Skia/PDF m127', 'creator': 'Chromium', 'creationdate': '2026-02-16T10:46:27+00:00', 'moddate': '2026-02-16T10:46:27+00:00', 'source': '..\\data\\pdf_files\\PMS Symptoms Guide.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1', 'source_file': 'PMS Symptoms Guide.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Skia/PDF m127', 'creator': 'Chromium', 'creationdate': '2026-02-16T10:46:27+00:00', 'moddate': '2026-02-16T10:46:27+00:00', 'source': '..\\data\\pdf_files\\PMS Symptoms Guide.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1', 'source_file': 'PMS Symptoms Guide.pdf', 'file_type': 'pdf'}, page_content='Prem enstrual Syndrom e (PM S) is a com bination of physical,\nem otional, and behavioral sym ptom s that m any wom en experience\nduring the luteal phase of their m enstrual cycle, typically occurring\none to two weeks before m enstruation begins[1]. It is estim ated that\nas m any as 3 out of every 4 m enstruating wom en have experienced\nsom e form  of PM S[1]. The worldwide prevalence of PM S am ong\nwom en of reproductive age is approxim ately 47.8%, with about 20%'),
 Document(metadata={'producer': 'Skia/PDF m127', 'creator': 'Chromium', 'creationdate': '2026-02-16T10:46:27+00:00', 'moddate': '2026-02-16T10:46:27+00:00', 'source': '..\\data\\pdf_fi

### Embedding and vectorStoreDB

In [93]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [94]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args: 
            model_name: HuggingFace model name for sentence Embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
        
    
    def generate_embeddings(self, texts: List[str]) -> np.array:
        """
        Generate embeddings for a list of texts
        
        Args: 
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts),embedding_dim)
        """
        
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts..")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
    def get_embedding_dimension(self) -> int:
        """Get the embedding dimension of the model"""
        if not self.model:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()
    

# Initalize embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 738.34it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x2bcb29ad9d0>

### Vector Store

In [95]:
import os
import uuid
from typing import List, Any

import numpy as np
import chromadb


class VectorStore:
    """Manage document embeddings in a ChromaDB vector store"""
    
    def __init__(
        self,
        collection_name: str = "pdf_documents",
        persist_directory: str = "../data/vector_store",
    ):
        """
        Initialize the vector store.

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection."""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"},
            )

            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store.

        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare the data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text,
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise


vectorstore = VectorStore()
vectorstore


Vector store initialized. Collection: pdf_documents
Existing documents in collection: 12


<__main__.VectorStore at 0x2bcb2382c50>

In [96]:
# class VectorStore:
#     """Manage document embeddings in a ChromaDB vector store"""
    
#     def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
#         """
#         Initalize the vector store
        
#         Args:
#         collection_name: Name of the ChromaDB Collection
#         persist_directory: Directory to persist the vector store
#         """
        
#         self.collection_name = collection_name
#         self.persist_directory = persist_directory
#         self.client = None
#         self.collection = None
#         self._initialize_store()
        

#     def _initialize_store(self):
#         """Initialize ChromaDB client and collection"""
#         try: 
#             #create persistent ChromaDB client
#             os.makedirs(self.persist_directory, exist_ok=True)
#             self.client = chromadb.PersistentClient(path=self.persist_directory)
            
#             # Get or create collection
#             self.collection = self.client.get_or_create_collection(
#                 name = self.collection_name,
#                 metadata={"description": "PDF document embeddings for RAG"}
#             )
            
#             print(f"Vector store initialized. Collection: {self.collection_name}")
#             print(f"Existing documents in colleciton: {self.collection.count()}")
            
#         except Exception as e:
#             print(f"Error initializing vector store: {e}")


#     def add_documents(self, documents: List[Any], embeddings: np.ndarray):
#         """
#         Add documents and their embeddings to the vector store
        
#         Args:
#             documents: list of LangChain documents
#             embeddings: Corresponding embeddings for the documents
#         """
        
#         if len(documents) != len(embeddings):
#             raise ValueError("Number of documents must match number of embeddings")
        
#         print(f"Adding {len(documents)} documents to vetor store...")
            
            
#         # Prepare the data for ChromaDB
            
#         ids = []
#         metadatas = []
#         documents_text = []
#         embeddings_list = []
        
#         for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
#             # Generate unique ID
#             doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
#             ids.append(doc_id)

#             # Prepare metadata
#             metadata = dict(doc.metadata)
#             metadata['doc_index']
#             metadata['content_length'] = len(doc.page_content)
#             metadatas.append(metadata)
            
#             # Document content
#             documents_text.append(doc.page_content)
            
#             # Embedding
#             embeddings_list.append(embedding.tolist()) 
            
#             # Add to collection
#         try:
#             self.colleciton.add(
#                 ids=ids,
#                 embeddings=embeddings_list,
#                 metadatas = metadata,
#                 documents=documents_text
#             )
#             print(f"Successfully added {len(documents)} documents to vector store")
#             print(f"Total documents in colleciton: {self.collection.count()}")
            
#         except Exception as e:
#             print(f"Error adding documents to vector store: {e}")
#             raise
        
        
# vectorstore = VectorStore()
# vectorstore

In [97]:
chunks

[Document(metadata={'producer': 'Skia/PDF m127', 'creator': 'Chromium', 'creationdate': '2026-02-16T10:46:27+00:00', 'moddate': '2026-02-16T10:46:27+00:00', 'source': '..\\data\\pdf_files\\PMS Symptoms Guide.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1', 'source_file': 'PMS Symptoms Guide.pdf', 'file_type': 'pdf'}, page_content='Prem enstrual Syndrom e (PM S) is a com bination of physical,\nem otional, and behavioral sym ptom s that m any wom en experience\nduring the luteal phase of their m enstrual cycle, typically occurring\none to two weeks before m enstruation begins[1]. It is estim ated that\nas m any as 3 out of every 4 m enstruating wom en have experienced\nsom e form  of PM S[1]. The worldwide prevalence of PM S am ong\nwom en of reproductive age is approxim ately 47.8%, with about 20%'),
 Document(metadata={'producer': 'Skia/PDF m127', 'creator': 'Chromium', 'creationdate': '2026-02-16T10:46:27+00:00', 'moddate': '2026-02-16T10:46:27+00:00', 'source': '..\\data\\pdf_fi

In [98]:
# convert the text to embeddings

texts = [doc.page_content for doc in chunks]
texts

['Prem enstrual Syndrom e (PM S) is a com bination of physical,\nem otional, and behavioral sym ptom s that m any wom en experience\nduring the luteal phase of their m enstrual cycle, typically occurring\none to two weeks before m enstruation begins[1]. It is estim ated that\nas m any as 3 out of every 4 m enstruating wom en have experienced\nsom e form  of PM S[1]. The worldwide prevalence of PM S am ong\nwom en of reproductive age is approxim ately 47.8%, with about 20%',
 'wom en of reproductive age is approxim ately 47.8%, with about 20%\nexperiencing sym ptom s severe enough to disrupt daily activities[2].\nPM S sym ptom s are highly variable and can range from  barely\nnoticeable to severe. The physical and em otional changes associated\nwith PM S tend to recur in a predictable pattern with each m enstrual\ncycle and generally disappear within four days after m enstruation\nbegins[1].\nPM S m anifests through three prim ary categories of sym ptom s:',
 'begins[1].\nPM S m anifest

In [99]:
# Generate the embeddings
embbeddings = embedding_manager.generate_embeddings(texts)

# Store in the vector database
vectorstore.add_documents(chunks, embbeddings)

Generating embeddings for 31 texts..


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.34it/s]

Generated embeddings with shape: (31, 384)
Adding 31 documents to vector store...
Successfully added 31 documents to vector store
Total documents in collection: 43





In [100]:
embbeddings

array([[ 0.0370128 , -0.04051747,  0.01605419, ..., -0.01399649,
         0.01553751,  0.03588168],
       [ 0.03447962, -0.06473775, -0.01812496, ..., -0.00894109,
        -0.02101575,  0.00146279],
       [ 0.05352686, -0.05824004,  0.04750064, ...,  0.04877291,
        -0.03266094,  0.01818302],
       ...,
       [-0.03747059, -0.01563412, -0.03154563, ..., -0.09105401,
        -0.05071722,  0.00386766],
       [-0.03717997, -0.05967765, -0.02222466, ...,  0.05950106,
         0.00529717, -0.05120308],
       [-0.03035072,  0.03633566, -0.00208108, ..., -0.03320826,
        -0.08560763,  0.01523414]], shape=(31, 384), dtype=float32)

### Retriever Pipeline from VectorStore

In [101]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 6, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [102]:
rag_retriever

<__main__.RAGRetriever at 0x2bcb28b9b10>

In [103]:
rag_retriever.retrieve("explain the symptoms of PMS?")

Retrieving documents for query: 'explain the symptoms of PMS?'
Top K: 6, Score threshold: 0.0
Generating embeddings for 1 texts..


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 26.77it/s]

Generated embeddings with shape: (1, 384)
Retrieved 6 documents (after filtering)





[{'id': 'doc_b9a299fd_2',
  'content': 'begins[1].\nPM S m anifests through three prim ary categories of sym ptom s:\nem otional and behavioral, physical, and cognitive sym ptom s.\nThe em otional and behavioral m anifestations of PM S can\nsigni\x00cantly im pact daily functioning and interpersonal\nrelationships[3][4].\nPrem enstrual Syndrom e (PM S):\nA Com prehensive Guide to\nSym ptom s\nIntroduction\nTypes of PM S Sym ptom s\nEm otional and Behavioral Sym ptom s\nTension or anxiety•\nDepressed m ood and feelings of sadness•\nCrying spells•',
  'metadata': {'total_pages': 7,
   'producer': 'Skia/PDF m127',
   'creationdate': '2026-02-16T10:46:27+00:00',
   'creator': 'Chromium',
   'source': '..\\data\\pdf_files\\PMS Symptoms Guide.pdf',
   'page': 0,
   'doc_index': 2,
   'page_label': '1',
   'content_length': 494,
   'moddate': '2026-02-16T10:46:27+00:00',
   'file_type': 'pdf',
   'source_file': 'PMS Symptoms Guide.pdf'},
  'similarity_score': 0.16529369354248047,
  'distance'