### Data Ingestion

### document datastructure

In [1]:
from langchain_core.documents import Document

In [2]:
doc = Document(
    page_content = "this is the main text content I amusing to create RAG",
    metadata = {
        "source" : "example.txt",
        "pages" : 1,
        "author" : "Tanmayi Bhavsar",
        "date_created" : "2026-01-27"
    }
)

doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Tanmayi Bhavsar', 'date_created': '2026-01-27'}, page_content='this is the main text content I amusing to create RAG')

In [3]:
## create a simple txt file
import os
os.makedirs("../data/text_files",exist_ok=True)

In [4]:
sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print(" Sample text files created!")

 Sample text files created!


In [5]:
### TextLoader
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/python_intro.txt",encoding="utf-8")
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [7]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

## Load all the text files from the directory
dir_loader = DirectoryLoader(
    "../data/text_files",
    glob = "**/*.txt", ##Pattern to match files
    loader_cls = TextLoader, ##Loader class to use
    loader_kwargs = {'encoding':'utf-8'},
    show_progress = False
)

documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n    \n    \n    '),
 Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the m

In [None]:
### PDF loader
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader

## load pdf files from directory
dir_loader = DirectoryLoader(
    "../data/Pdfs",
glob = "**/*.pdf",
loader_cls = PyMuPDFLoader,
show_progress = False
)

pdf_documents = dir_loader.load()
pdf_documents


[Document(metadata={'producer': '', 'creator': 'WPS Writer', 'creationdate': '2025-07-08T16:04:32+05:30', 'source': '..\\data\\Pdfs\\Practical Machine Learning.pdf', 'file_path': '..\\data\\Pdfs\\Practical Machine Learning.pdf', 'total_pages': 7, 'format': 'PDF 1.7', 'title': '', 'author': 'binay', 'subject': '', 'keywords': '', 'moddate': '2025-07-08T16:04:32+05:30', 'trapped': '', 'modDate': "D:20250708160432+05'30'", 'creationDate': "D:20250708160432+05'30'", 'page': 0}, page_content='ACTS, Pune\nPG-DBDA\nPage 1 of 5\nSuggested Teaching Guidelines for\nPractical Machine Learning\nPG-DBDA August 2025\nDuration: 60 hours Theory and 80 hours Lab\nObjective: Practicing Machine Learning Algorithms\nPrerequisites: Good knowledge of Python Programming and Statistics\nEvaluation method:\nTheory exam– 40% weightage\nLab exam – 40% weightage\nInternal exam– 20% weightage\nList of Books / Other training material\nTextbook:\nMachine Learning, Saikat Dutt / Pearson\nReference Book:\nMachine Lear

In [10]:
type(pdf_documents[0])

langchain_core.documents.base.Document

In [11]:
### CSV file
from langchain_community.document_loaders import CSVLoader

loader = CSVLoader('../data/CSV_files/covid-19_dataset.csv')
doc = loader.load()
print(doc)

[Document(metadata={'source': '../data/CSV_files/covid-19_dataset.csv', 'row': 0}, page_content='DeadCounter: 1\nCaseCounter: 72\nRecoveredCounter: 188\nCriticalCounter: -9\nDate: 05-09-2021'), Document(metadata={'source': '../data/CSV_files/covid-19_dataset.csv', 'row': 1}, page_content='DeadCounter: 1\nCaseCounter: 103\nRecoveredCounter: 187\nCriticalCounter: -5\nDate: 04-09-2021'), Document(metadata={'source': '../data/CSV_files/covid-19_dataset.csv', 'row': 2}, page_content='DeadCounter: 1\nCaseCounter: 95\nRecoveredCounter: 113\nCriticalCounter: -11\nDate: 03-09-2021'), Document(metadata={'source': '../data/CSV_files/covid-19_dataset.csv', 'row': 3}, page_content='DeadCounter: 1\nCaseCounter: 111\nRecoveredCounter: 150\nCriticalCounter: -9\nDate: 02-09-2021'), Document(metadata={'source': '../data/CSV_files/covid-19_dataset.csv', 'row': 4}, page_content='DeadCounter: 1\nCaseCounter: 101\nRecoveredCounter: 252\nCriticalCounter: -6\nDate: 01-09-2021'), Document(metadata={'source': '

### embedding and VectorStoreDB

In [14]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    def __init__(self,model_name:str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args:
        model_name : HuggingFace Model name for sentence embedding
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model:{self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded sucessfully.Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model{self.model_name}:{e}")
            raise 

    def generate_embeddings(self,texts:List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts
        
        Args:
        texts: List of text strings to embed
        
        Returns:
        numpy array of embeddings with shape(len(texts),embedding_dim)"""

        if not self.model:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape:{embeddings.shape}")
        return embeddings  

    def get_embedding_dimension(self) -> int:
        """Get the embedding dimension of the model"""
        if not self.model:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()  

## initialize the embedding manager
embedding_manager = EmbeddingManager()
embedding_manager 

Loading embedding model:all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 291.79it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model loaded sucessfully.Embedding dimension: 384


<__main__.EmbeddingManager at 0x20fdfd03370>

### VectorStore

In [16]:
class VectorStore:
    """Manages document embedding in a ChromaDB vector store"""

    def __init__(self,collection_name:str = "pdf_documents",persist_directory:str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
        collection_name : Name of the ChromaDB collection
        persist_directory : Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore        

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x20fdfd00160>