### Data Ingestion

#### Document Structure

In [1]:
from langchain_core.documents import Document

In [2]:
doc = Document(
    page_content="this is the main text content I am using to create RAG",
    metadata = {
        "source": "example.txt",
        "pages": 1,
        "author": "Sushmita",
        "date_created": "2026-02-16"
    }
)

In [3]:
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Sushmita', 'date_created': '2026-02-16'}, page_content='this is the main text content I am using to create RAG')

#### Create a simple txt file

In [4]:
import os
os.makedirs("../data/text_files", exist_ok=True)

In [5]:
sample_texts={
   "../data/text_files/python.txt" : '''Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.

It is one of the most popular programming languages in the world and is widely used in:

Web development

Data science

Artificial intelligence

Automation

Cybersecurity

Software development

Game development'''
}

for filepath, content in sample_texts.items():
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(content)
        
print("Sample file created")

Sample file created


### TextLoader

In [6]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/python.txt", encoding="utf-8")
document = loader.load()

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
print(document)

[Document(metadata={'source': '../data/text_files/python.txt'}, page_content='Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.\n\nIt is one of the most popular programming languages in the world and is widely used in:\n\nWeb development\n\nData science\n\nArtificial intelligence\n\nAutomation\n\nCybersecurity\n\nSoftware development\n\nGame development')]


### Directory Loader

In [8]:
from langchain_community.document_loaders import DirectoryLoader

# load all the text files from the directory

dir_loader = DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt", # Pattern to match the files
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=False
    
)

documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\python.txt'}, page_content='Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.\n\nIt is one of the most popular programming languages in the world and is widely used in:\n\nWeb development\n\nData science\n\nArtificial intelligence\n\nAutomation\n\nCybersecurity\n\nSoftware development\n\nGame development')]

In [9]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

dir_loader = DirectoryLoader(
    "../data/pdf_files",
    glob = "**/*.pdf",
    loader_cls = PyMuPDFLoader,
    show_progress = False
)

pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2026-01-28T19:29:27+05:45', 'source': '..\\data\\pdf_files\\SushmitaMalakar_CV.pdf', 'file_path': '..\\data\\pdf_files\\SushmitaMalakar_CV.pdf', 'total_pages': 2, 'format': 'PDF 1.7', 'title': '', 'author': 'LENOVO', 'subject': '', 'keywords': '', 'moddate': '2026-01-28T19:29:27+05:45', 'trapped': '', 'modDate': "D:20260128192927+05'45'", 'creationDate': "D:20260128192927+05'45'", 'page': 0}, page_content='SUSHMITA MALAKAR \nDATA SCIENCE ENTHUSIAST \n9818085057 | sushmalakar10@gmail.com | Satungal, Kathmandu \nwww.linkedin.com/in/sushmita-malakar-a3a5a9247 \nwww.github.com/sushmitamalakar10 \n \n \n \n \nABOUT ME \nI am passionate and motivated in Data Science. I have completed hands-on projects using Python and \nbasic machine learning techniques. I am confident in data cleaning, exploration and visualization. I am \neager to apply my skills and continue learning through real-wo

In [10]:
type(pdf_documents[0])

langchain_core.documents.base.Document

### Embedding and vectorStoreDB

In [13]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args: 
            model_name: HuggingFace model name for sentence Embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
        
    
    def generate_embeddings(self, texts: List[str]) -> np.array:
        """
        Generate embeddings for a list of texts
        
        Args: 
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts),embedding_dim)
        """
        
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts..")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
    def get_embedding_dimension(self) -> int:
        """Get the embedding dimension of the model"""
        if not self.model:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()
    

# Initalize embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
