### Data Ingestion

#### Document Structure

In [16]:
from langchain_core.documents import Document

In [17]:
doc = Document(
    page_content="this is the main text content I am using to create RAG",
    metadata = {
        "source": "example.txt",
        "pages": 1,
        "author": "Sushmita",
        "date_created": "2026-02-16"
    }
)

In [18]:
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Sushmita', 'date_created': '2026-02-16'}, page_content='this is the main text content I am using to create RAG')

#### Create a simple txt file

In [19]:
import os
os.makedirs("../data/text_files", exist_ok=True)

In [20]:
sample_texts={
   "../data/text_files/python.txt" : '''Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.

It is one of the most popular programming languages in the world and is widely used in:

Web development

Data science

Artificial intelligence

Automation

Cybersecurity

Software development

Game development'''
}

for filepath, content in sample_texts.items():
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(content)
        
print("Sample file created")

Sample file created


### TextLoader

In [21]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/python.txt", encoding="utf-8")
document = loader.load()

In [22]:
print(document)

[Document(metadata={'source': '../data/text_files/python.txt'}, page_content='Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.\n\nIt is one of the most popular programming languages in the world and is widely used in:\n\nWeb development\n\nData science\n\nArtificial intelligence\n\nAutomation\n\nCybersecurity\n\nSoftware development\n\nGame development')]


### Directory Loader

In [23]:
from langchain_community.document_loaders import DirectoryLoader

# load all the text files from the directory

dir_loader = DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt", # Pattern to match the files
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=False
    
)

documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\python.txt'}, page_content='Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.\n\nIt is one of the most popular programming languages in the world and is widely used in:\n\nWeb development\n\nData science\n\nArtificial intelligence\n\nAutomation\n\nCybersecurity\n\nSoftware development\n\nGame development')]

In [24]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

dir_loader = DirectoryLoader(
    "../data/pdf_files",
    glob = "**/*.pdf",
    loader_cls = PyMuPDFLoader,
    show_progress = False
)

pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-01-28T19:29:27+05:45', 'source': '..\\data\\pdf_files\\SushmitaMalakar_CV.pdf', 'file_path': '..\\data\\pdf_files\\SushmitaMalakar_CV.pdf', 'total_pages': 2, 'format': 'PDF 1.7', 'title': '', 'author': 'LENOVO', 'subject': '', 'keywords': '', 'moddate': '2026-01-28T19:29:27+05:45', 'trapped': '', 'modDate': "D:20260128192927+05'45'", 'creationDate': "D:20260128192927+05'45'", 'page': 0}, page_content='SUSHMITA MALAKAR \nDATA SCIENCE ENTHUSIAST \n9818085057 | sushmalakar10@gmail.com | Satungal, Kathmandu \nwww.linkedin.com/in/sushmita-malakar-a3a5a9247 \nwww.github.com/sushmitamalakar10 \n \n \n \n \nABOUT ME \nI am passionate and motivated in Data Science. I have completed hands-on projects using Python and \nbasic machine learning techniques. I am confident in data cleaning, exploration and visualization. I am \neager to apply my skills and continue learning through real-wo

In [25]:
type(pdf_documents[0])

langchain_core.documents.base.Document

In [26]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader

### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 1 PDF files to process

Processing: SushmitaMalakar_CV.pdf
  ✓ Loaded 2 pages

Total documents loaded: 2


### Text splitting into chunks

In [27]:

### Text splitting get into chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter


def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [28]:
chunks = split_documents(all_pdf_documents)
chunks

Split 2 documents into 4 chunks

Example chunk:
Content: SUSHMITA MALAKAR  
DATA SCIENCE ENTHUSIAST  
9818085057 | sushmalakar10@gmail.com | Satungal, Kathmandu 
www.linkedin.com/in/sushmita-malakar-a3a5a9247 
www.github.com/sushmitamalakar10   
 
 
ABOUT M...
Metadata: {'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-01-28T19:29:27+05:45', 'author': 'LENOVO', 'moddate': '2026-01-28T19:29:27+05:45', 'source': '..\\data\\pdf_files\\SushmitaMalakar_CV.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'SushmitaMalakar_CV.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-01-28T19:29:27+05:45', 'author': 'LENOVO', 'moddate': '2026-01-28T19:29:27+05:45', 'source': '..\\data\\pdf_files\\SushmitaMalakar_CV.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'SushmitaMalakar_CV.pdf', 'file_type': 'pdf'}, page_content='SUSHMITA MALAKAR  \nDATA SCIENCE ENTHUSIAST  \n9818085057 | sushmalakar10@gmail.com | Satungal, Kathmandu \nwww.linkedin.com/in/sushmita-malakar-a3a5a9247 \nwww.github.com/sushmitamalakar10   \n \n \nABOUT ME \nI am passionate and motivated in Data Science. I have completed hands -on projects using Python and \nbasic machine learning techniques. I am confident in data cleaning, exploration and visualization. I am \neager to apply my skills and continue learning through real-world experience. I am especially interested in \ngaining practical knowledge by working on meaningful projects in a collaborative environment. 

### Embedding and vectorStoreDB

In [29]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args: 
            model_name: HuggingFace model name for sentence Embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
        
    
    def generate_embeddings(self, texts: List[str]) -> np.array:
        """
        Generate embeddings for a list of texts
        
        Args: 
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts),embedding_dim)
        """
        
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts..")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
    def get_embedding_dimension(self) -> int:
        """Get the embedding dimension of the model"""
        if not self.model:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()
    

# Initalize embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 112.62it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x2bcb17d4310>

### Vector Store

In [36]:
import os
import uuid
from typing import List, Any

import numpy as np
import chromadb


class VectorStore:
    """Manage document embeddings in a ChromaDB vector store"""
    
    def __init__(
        self,
        collection_name: str = "pdf_documents",
        persist_directory: str = "../data/vector_store",
    ):
        """
        Initialize the vector store.

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection."""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"},
            )

            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store.

        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare the data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text,
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise


vectorstore = VectorStore()
vectorstore


Vector store initialized. Collection: pdf_documents
Existing documents in collection: 4


<__main__.VectorStore at 0x2bcb2353050>

In [37]:
# class VectorStore:
#     """Manage document embeddings in a ChromaDB vector store"""
    
#     def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
#         """
#         Initalize the vector store
        
#         Args:
#         collection_name: Name of the ChromaDB Collection
#         persist_directory: Directory to persist the vector store
#         """
        
#         self.collection_name = collection_name
#         self.persist_directory = persist_directory
#         self.client = None
#         self.collection = None
#         self._initialize_store()
        

#     def _initialize_store(self):
#         """Initialize ChromaDB client and collection"""
#         try: 
#             #create persistent ChromaDB client
#             os.makedirs(self.persist_directory, exist_ok=True)
#             self.client = chromadb.PersistentClient(path=self.persist_directory)
            
#             # Get or create collection
#             self.collection = self.client.get_or_create_collection(
#                 name = self.collection_name,
#                 metadata={"description": "PDF document embeddings for RAG"}
#             )
            
#             print(f"Vector store initialized. Collection: {self.collection_name}")
#             print(f"Existing documents in colleciton: {self.collection.count()}")
            
#         except Exception as e:
#             print(f"Error initializing vector store: {e}")


#     def add_documents(self, documents: List[Any], embeddings: np.ndarray):
#         """
#         Add documents and their embeddings to the vector store
        
#         Args:
#             documents: list of LangChain documents
#             embeddings: Corresponding embeddings for the documents
#         """
        
#         if len(documents) != len(embeddings):
#             raise ValueError("Number of documents must match number of embeddings")
        
#         print(f"Adding {len(documents)} documents to vetor store...")
            
            
#         # Prepare the data for ChromaDB
            
#         ids = []
#         metadatas = []
#         documents_text = []
#         embeddings_list = []
        
#         for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
#             # Generate unique ID
#             doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
#             ids.append(doc_id)

#             # Prepare metadata
#             metadata = dict(doc.metadata)
#             metadata['doc_index']
#             metadata['content_length'] = len(doc.page_content)
#             metadatas.append(metadata)
            
#             # Document content
#             documents_text.append(doc.page_content)
            
#             # Embedding
#             embeddings_list.append(embedding.tolist()) 
            
#             # Add to collection
#         try:
#             self.colleciton.add(
#                 ids=ids,
#                 embeddings=embeddings_list,
#                 metadatas = metadata,
#                 documents=documents_text
#             )
#             print(f"Successfully added {len(documents)} documents to vector store")
#             print(f"Total documents in colleciton: {self.collection.count()}")
            
#         except Exception as e:
#             print(f"Error adding documents to vector store: {e}")
#             raise
        
        
# vectorstore = VectorStore()
# vectorstore

In [38]:
chunks

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-01-28T19:29:27+05:45', 'author': 'LENOVO', 'moddate': '2026-01-28T19:29:27+05:45', 'source': '..\\data\\pdf_files\\SushmitaMalakar_CV.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'SushmitaMalakar_CV.pdf', 'file_type': 'pdf'}, page_content='SUSHMITA MALAKAR  \nDATA SCIENCE ENTHUSIAST  \n9818085057 | sushmalakar10@gmail.com | Satungal, Kathmandu \nwww.linkedin.com/in/sushmita-malakar-a3a5a9247 \nwww.github.com/sushmitamalakar10   \n \n \nABOUT ME \nI am passionate and motivated in Data Science. I have completed hands -on projects using Python and \nbasic machine learning techniques. I am confident in data cleaning, exploration and visualization. I am \neager to apply my skills and continue learning through real-world experience. I am especially interested in \ngaining practical knowledge by working on meaningful projects in a collaborative environment. 

In [39]:
# convert the text to embeddings

texts = [doc.page_content for doc in chunks]
texts

['SUSHMITA MALAKAR  \nDATA SCIENCE ENTHUSIAST  \n9818085057 | sushmalakar10@gmail.com | Satungal, Kathmandu \nwww.linkedin.com/in/sushmita-malakar-a3a5a9247 \nwww.github.com/sushmitamalakar10   \n \n \nABOUT ME \nI am passionate and motivated in Data Science. I have completed hands -on projects using Python and \nbasic machine learning techniques. I am confident in data cleaning, exploration and visualization. I am \neager to apply my skills and continue learning through real-world experience. I am especially interested in \ngaining practical knowledge by working on meaningful projects in a collaborative environment. \n \nSKILLS \nLanguages & Tools Python, SQL, Pandas, NumPy, Matplotlib, Seaborn, Scikit -learn, Jupyter Notebook, \nGoogle Colab, GitHub, Flask \nData Skills Data Cleaning, EDA, Regression, Classification, Data Visualization, Data Analysis \nSoft Skills  \n \nPROJECTS \nMenstrual Cycle Predictor',
 'Google Colab, GitHub, Flask \nData Skills Data Cleaning, EDA, Regression, 

In [40]:
# Generate the embeddings
embbeddings = embedding_manager.generate_embeddings(texts)

# Store in the vector database
vectorstore.add_documents(chunks, embbeddings)

Generating embeddings for 4 texts..


Batches: 100%|██████████| 1/1 [00:00<00:00,  6.07it/s]

Generated embeddings with shape: (4, 384)
Adding 4 documents to vector store...
Successfully added 4 documents to vector store
Total documents in collection: 8





In [41]:
embbeddings

array([[-0.01196015, -0.06975466, -0.00867063, ...,  0.03594098,
        -0.06827877,  0.00370967],
       [-0.13038181, -0.10577191, -0.00522372, ..., -0.07579198,
        -0.04352811,  0.02119745],
       [-0.09743939, -0.05643107, -0.03003179, ..., -0.07283021,
        -0.0385066 , -0.00064882],
       [-0.03035072,  0.03633566, -0.00208108, ..., -0.03320826,
        -0.08560763,  0.01523414]], shape=(4, 384), dtype=float32)