## Data Ingestion


In [2]:
### Document Structure

from langchain_core.documents import Document

In [3]:
doc = Document(
    page_content="this is the main text content I am using to create RAG",
    metadata = {
        "source": "example.txt",
        "pages": 1,
        "author":"predator",
        "date_created":"2025-12-24"
    }
)

doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'predator', 'date_created': '2025-12-24'}, page_content='this is the main text content I am using to create RAG')

In [4]:
##create a simple directory

import os
os.makedirs("../data/txt_files",exist_ok=True)

In [5]:
sample_texts = {
    "../data/txt_files/python_intro.txt":"""Python Programming Introduction
    
Python is a high-level, interpreted, general-purpose programming language designed to be simple, readable, and powerful.

Key features :

üß† Easy to learn & read ‚Äì clear, English-like syntax

‚ö° Interpreted ‚Äì runs code line by line, no compilation

üåç Cross-platform ‚Äì works on Windows, macOS, Linux

üìö Large standard library ‚Äì built-in tools for many tasks

üîå Rich ecosystem ‚Äì thousands of third-party libraries

üß© Object-oriented & functional ‚Äì supports multiple paradigms

üöÄ Versatile ‚Äì used in web, data science, AI, automation, and more

    """,

     "../data/txt_files/ml_intro.txt":""" Machine Learning Introduction

    Machine Learning (ML) is a branch of artificial intelligence that enables computers to learn from data and improve performance without being explicitly programmed.

Key features

üìä Data-driven ‚Äì learns patterns from data

ü§ñ Self-improving ‚Äì performance improves with more data

üß† Predictive ‚Äì makes predictions or decisions

üîÅ Automated learning ‚Äì reduces manual rule-based coding

üìà Scalable ‚Äì works with large and complex datasets

üåê Wide applications ‚Äì used in vision, speech, recommendation, fraud detection

"""
}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)    

print("Sample text file created")

Sample text file created


In [6]:
###TextLoader

from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/txt_files/python_intro.txt",encoding="utf-8")
print(loader.load())

[Document(metadata={'source': '../data/txt_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted, general-purpose programming language designed to be simple, readable, and powerful.\n\nKey features :\n\nüß† Easy to learn & read ‚Äì clear, English-like syntax\n\n‚ö° Interpreted ‚Äì runs code line by line, no compilation\n\nüåç Cross-platform ‚Äì works on Windows, macOS, Linux\n\nüìö Large standard library ‚Äì built-in tools for many tasks\n\nüîå Rich ecosystem ‚Äì thousands of third-party libraries\n\nüß© Object-oriented & functional ‚Äì supports multiple paradigms\n\nüöÄ Versatile ‚Äì used in web, data science, AI, automation, and more\n\n    ')]


In [7]:
###Directory Loader

from langchain_community.document_loaders import DirectoryLoader

## Load all text files from the directory

dir_loader = DirectoryLoader(
    "../data/txt_files",
     glob="**/*.txt", ## pattern to match file
     loader_cls = TextLoader, ## loader class to use
     loader_kwargs={'encoding':'utf-8'},
     show_progress=False
)

documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\txt_files\\ml_intro.txt'}, page_content=' Machine Learning Introduction\n\n    Machine Learning (ML) is a branch of artificial intelligence that enables computers to learn from data and improve performance without being explicitly programmed.\n\nKey features\n\nüìä Data-driven ‚Äì learns patterns from data\n\nü§ñ Self-improving ‚Äì performance improves with more data\n\nüß† Predictive ‚Äì makes predictions or decisions\n\nüîÅ Automated learning ‚Äì reduces manual rule-based coding\n\nüìà Scalable ‚Äì works with large and complex datasets\n\nüåê Wide applications ‚Äì used in vision, speech, recommendation, fraud detection\n\n'),
 Document(metadata={'source': '..\\data\\txt_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted, general-purpose programming language designed to be simple, readable, and powerful.\n\nKey features :\n\nüß† Easy to learn & read ‚Äì clear, English-like synt

In [8]:
from langchain_community.document_loaders import PyPDFLoader , PyMuPDFLoader

dir_loader = DirectoryLoader(
    "../data/pdfs",
     glob="**/*.pdf", ## pattern to match file
    loader_cls= PyMuPDFLoader,
     show_progress=False
)

pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2025-02-16T16:20:31+05:30', 'source': '..\\data\\pdfs\\6-7Os.pdf', 'file_path': '..\\data\\pdfs\\6-7Os.pdf', 'total_pages': 3, 'format': 'PDF 1.7', 'title': 'OS PRACTICAL \x13 1 (1) - Google Docs', 'author': 'Vinayak Patel', 'subject': '', 'keywords': '', 'moddate': '2025-02-16T16:20:31+05:30', 'trapped': '', 'modDate': "D:20250216162031+05'30'", 'creationDate': "D:20250216162031+05'30'", 'page': 0}, page_content=''),
 Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2025-02-16T16:20:31+05:30', 'source': '..\\data\\pdfs\\6-7Os.pdf', 'file_path': '..\\data\\pdfs\\6-7Os.pdf', 'total_pages': 3, 'format': 'PDF 1.7', 'title': 'OS PRACTICAL \x13 1 (1) - Google Docs', 'author': 'Vinayak Patel', 'subject': '', 'keywords': '', 'moddate': '2025-02-16T16:20:31+05:30', 'trapped': '', 'modDate': "D:20250216162031+05'30'", 'creationDate': "D:20250216162031+05'30'", 'pag

##Embedding and VectorStoreDB


In [9]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self,model_name: str = "all-MiniLM-L6-v2"):
        """
            Initialize the embedding manager

            Args:
                model_name : Hugging face Model name for sentence embeddings
        """

        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the sentence Transformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model =    SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension : {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self,text: List[str])->np.ndarray:
        """
        Generate embedding for a list of texts

        Args:
            texts:LIst of text strings to embed

        Return :
            numpy array of embedding with shape (len(texts),embedding_dim)
        """

        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape : {embeddings.shape}")
        return embeddings
    



    ### Initialize the embedding manager

embedding_manager= EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension : 384


<__main__.EmbeddingManager at 0x25feea0c1a0>

## Vector StoreDB

In [11]:
class VectorStore:
    """ manages document embeddings in a chromaDB vector store """

    def __init__(self,collection_name: str= "pdf_documents", persist_directory: str= "../data/vector_store"):
        """  
        Initialize the vector store

        Args:
            collection_name: Name of chromaDB collection
            persistent_directory: Directory to persist the vector store

        """

        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """  
        Initialize chromaDB client and collection
        """

        try:
            # Create persistent chromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection

            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={"Description": "Pdf document embedding for RAG"}
            )
            print(f"Vector Store Initialized. Collections: {self.collection_name}")
            print(f"Existing documents in collections   : {self.collection.count()}")

        except Exception as e:
            print(f"Error Initializing Vector Store: {e}")
            raise

vectorstore = VectorStore()
vectorstore

Vector Store Initialized. Collections: pdf_documents
Existing documents in collections   : 0


<__main__.VectorStore at 0x25ff0d2e900>