### Loading a text file (basic)

In [None]:
## load a single text file
from langchain_community.document_loaders import TextLoader

# Read one UTF-8 encoded text file; `load()` returns a list of loaded documents.
loader = TextLoader("rag_data/leave_policy.txt", encoding="utf-8")
documents = loader.load()

# Report the count first, then peek at the first document.
print(f" Loaded {len(documents)} file")
first_document = documents[0]
print(f"{first_document.metadata}")
print(f"First 50 chars...{first_document.page_content[:50]}")

In [None]:
## load a directory 
## load all text file inside the rag_data directory

from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every .txt file under rag_data (recursively), reading each one as UTF-8.
dir_loader = DirectoryLoader(
    "rag_data",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)

loaded_files = dir_loader.load()
print(f"loaded files lengh: {len(loaded_files)}")

# Show the metadata and a 10-character preview of each loaded file.
# (The loop index was never used, so iterate over the documents directly.)
for doc in loaded_files:
    print(f" meta data: {doc.metadata}")
    print(f" \n page content: {doc.page_content[:10]} ....")

In [None]:
## Text splitting techniques

from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter, TokenTextSplitter
# NOTE: relies on `documents` produced by the single-file TextLoader cell.
# Later cells rebind `documents` to a plain list of strings, so run this cell
# before those (or after re-running the loader cell).
text_content = documents[0].page_content


def show_chunks(title, chunks):
    """Print a banner, the number of chunks produced, and the first chunk.

    Args:
        title: Text placed inside the banner line.
        chunks: List of chunk strings returned by a splitter's `split_text`.
    """
    print(f"******{title}**************")
    # The value printed is how many chunks were produced, not a chunk size.
    print(f"chunk count {len(chunks)}")
    if chunks:
        print(f"first chunk:: {chunks[0]}")
    else:
        # Empty input text yields no chunks; avoid an IndexError on chunks[0].
        print("first chunk:: <no chunks produced>")


### Character text splitter
char_splitter = CharacterTextSplitter(
    separator="\n",    # split on new lines
    chunk_size=200,    # max chunk size in characters
    chunk_overlap=20,  # overlap between consecutive chunks
)
chunks_list = char_splitter.split_text(text_content)
show_chunks("charactor text splitting", chunks_list)


### Recursive text splitter
# Tries the separators in order: paragraphs, lines, words, then single characters.
recursive_chr_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=200,
    chunk_overlap=20,
)
chunks_list = recursive_chr_splitter.split_text(text_content)
show_chunks("Recursive text splitting", chunks_list)


### Token based text splitter
# Here chunk_size / chunk_overlap are counted in tokens, not characters.
token_chr_splitter = TokenTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
)
chunks_list = token_chr_splitter.split_text(text_content)
show_chunks("Token text splitting", chunks_list)


### PDF file loader 

Loading a PDF works much like loading a text file, but PDFs bring extra problems: a PDF is not only text — it may contain images, diagrams, and stray whitespace — so the best practice is to clean up the extracted text before splitting it.

The code below loads a PDF file with `PyPDFLoader`, cleans each page, and splits the text with a recursive character splitter.

In [None]:
## PDF splitting strategies
from typing import List
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document # Document type

print ("### pdf loading using pydfLoader")

class PDFprocessor:
    """Load a PDF, normalise the whitespace of each page, and split it into chunks.

    Args:
        chunk_size: Maximum chunk size passed to the recursive splitter.
        chunk_overlap: Overlap between consecutive chunks.
        separators: Separators handed to RecursiveCharacterTextSplitter.
            Defaults to a single space. The default is an immutable tuple and
            a fresh list is built per instance, so instances never share (and
            cannot accidentally mutate) one default list.
    """

    def __init__(self, chunk_size=200, chunk_overlap=100, separators=(" ",)):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Copy into a new list; an explicit None is passed through unchanged.
            separators=list(separators) if separators is not None else None,
        )

    def _clean_text(self, text: str) -> str:
        """Collapse every run of whitespace (spaces, tabs, newlines) into one space."""
        return " ".join(text.split())

    def process_pdf(self, pdf_path) -> "List[Document]":
        """Load `pdf_path`, clean each page, and return the chunks of all pages.

        Every chunk carries the page's own metadata plus a 1-based "page"
        number, the "total_pages" count, and a "chunk_method" marker.
        """
        loader = PyPDFLoader(pdf_path)
        pdf_pages = loader.load()
        total_pages = len(pdf_pages)  # computed once instead of once per page

        processed_chunks = []
        # Clean each page before splitting it.
        for page_number, page in enumerate(pdf_pages, start=1):
            cleaned_text = self._clean_text(page.page_content)
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[
                    {
                        **page.metadata,
                        # Listed after **page.metadata so these keys override
                        # any same-named keys coming from the loader.
                        "page": page_number,
                        "total_pages": total_pages,
                        "chunk_method": "pdf_processor",
                    }
                ],
            )
            processed_chunks.extend(chunks)

        return processed_chunks

# Run the processor on the sample PDF; any failure is reported, not raised.
try:
    preprocess = PDFprocessor()
    output_chunks = preprocess.process_pdf("rag_data/work_timing_policy.pdf")
    chunk_total = len(output_chunks)
    print(f"chunks {chunk_total} created")
    # Show the text of the first chunk as a sanity check.
    print(output_chunks[0].page_content)
except Exception as exc:
    print(f"error:: {exc}")

### Word document processing

In [None]:
from langchain_community.document_loaders import Docx2txtLoader, UnstructuredWordDocumentLoader

print("Using Docx2txtLoader")

# Load the Word document and preview it; any failure is reported, not raised.
try:
    docx_loader = Docx2txtLoader("rag_data/frontend.docx")
    docs = docx_loader.load()
    print(f"Loaded {len(docs)} documents")
    # Preview the first 100 characters of the first document.
    content_preview = docs[0].page_content[:100]
    print(f"page content {content_preview}...")
except Exception as exc:
    print(f"error:: {exc}")


### CSV and Excel files - structured data

In [None]:
from langchain_community.document_loaders import CSVLoader

# Parse the CSV as comma-delimited, double-quote-quoted UTF-8 text.
csv_loader = CSVLoader(
    file_path="rag_data/dividend.csv",
    encoding="utf-8",
    csv_args={
        "delimiter": ",",
        "quotechar": '"',
    },
)

csv_docs = csv_loader.load()
print(f"loaded {len(csv_docs)} rows")  # each row becomes one document

if csv_docs:
    # Show the actual first row (index 0), and label the metadata line as
    # metadata rather than repeating "content".
    first_row = csv_docs[0]
    print("\n First row:")
    print(f"\n content: {first_row.page_content}")
    print(f"\n metadata: {first_row.metadata}")
else:
    # Guard against an IndexError on an empty CSV.
    print("\n CSV file contained no rows")

### JSON parsing and processing

We often receive JSON results from an API, so we should know how to process them.

In [None]:

from langchain_community.document_loaders import JSONLoader

json_loader = JSONLoader(
    file_path="rag_data/employee.json",
    jq_schema=".[]",     # use jq to extract items (if your file contains a list)
    text_content=False,  # NOTE(review): presumably the items are objects, not plain strings - confirm
)

# Named `json_docs` rather than `json` so the result does not shadow the
# standard-library `json` module name in the notebook namespace.
json_docs = json_loader.load()

print(f"data loaded {json_docs[0].page_content}")


### Database loader

In [None]:
## create a simple SQLite database and populate it with test data

import sqlite3
import os

# Database file path
db_file = "rag_data/company.db"

# If DB exists, delete it to start fresh (keeps this cell re-runnable).
if os.path.exists(db_file):
    os.remove(db_file)
    print("Existing database removed.")

# Dummy employee data: (id, name, role, department, salary)
employees = [
    (1, "Alice", "Developer", "Engineering", 90000),
    (2, "Bob", "Manager", "Sales", 105000),
    (3, "Charlie", "Analyst", "Finance", 80000),
    (4, "Diana", "Designer", "UI/UX", 85000),
]

# Dummy project data: (id, name, status, budget, lead_id)
projects = [
    (1, "Project Alpha", "Active", 250000, 2),
    (2, "Project Beta", "Planning", 150000, 1),
    (3, "Project Gamma", "Completed", 300000, 3),
]

# Create new SQLite connection. The try/finally guarantees the connection is
# closed even when one of the statements below fails.
conn = sqlite3.connect(db_file)
try:
    cursor = conn.cursor()

    # Create tables
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS employees (
        id INTEGER PRIMARY KEY,
        name TEXT,
        role TEXT,
        department TEXT,
        salary REAL
    )
    ''')

    cursor.execute('''
    CREATE TABLE IF NOT EXISTS projects (
        id INTEGER PRIMARY KEY,
        name TEXT,
        status TEXT,
        budget REAL,
        lead_id INTEGER,
        FOREIGN KEY (lead_id) REFERENCES employees(id)
    )
    ''')

    # Parameterised bulk inserts
    cursor.executemany('''
    INSERT INTO employees (id, name, role, department, salary)
    VALUES (?, ?, ?, ?, ?)
    ''', employees)

    cursor.executemany('''
    INSERT INTO projects (id, name, status, budget, lead_id)
    VALUES (?, ?, ?, ?, ?)
    ''', projects)

    # Sanity check: print the first employee row.
    cursor.execute("select * from employees")
    print(cursor.fetchall()[0])

    # Persist the inserts.
    conn.commit()
finally:
    conn.close()

print("Database created and populated successfully.")

In [None]:
import sqlite3
from typing import List
from langchain_core.documents import Document
db_file = "rag_data/company.db"
def SQL_to_documents(db_path: str) -> List[Document]:
    """
    Read all tables in an SQLite database and convert them into LangChain Documents.

    Each Document represents one table: its page content is a "Table: <name>"
    header followed by one "col=value, ..." line per row, and its metadata holds
    the table name, the column names, and the row count.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        One Document per table listed in sqlite_master.
    """
    documents = []
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get all table names
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = [row[0] for row in cursor.fetchall()]

        for table in tables:
            # Identifiers cannot be bound as SQL parameters, so quote the name
            # (doubling embedded quotes). This keeps the query valid for table
            # names containing spaces, keywords, or quote characters.
            quoted_table = '"' + table.replace('"', '""') + '"'
            cursor.execute(f"SELECT * FROM {quoted_table};")
            rows = cursor.fetchall()
            columns = [desc[0] for desc in cursor.description]

            # Combine all rows into text, one line per row
            table_data = "\n".join(
                ", ".join(f"{col}={val}" for col, val in zip(columns, row))
                for row in rows
            )

            # Create a Document for the table
            documents.append(
                Document(
                    page_content=f"Table: {table}\n{table_data}",
                    metadata={
                        "table_name": table,
                        "columns": columns,
                        "row_count": len(rows),
                    },
                )
            )
    finally:
        # Always release the connection, even if a query above raised.
        conn.close()

    return documents

# Convert every table of the demo database and show the resulting documents.
print("DATABASE DATA")
table_documents = SQL_to_documents(db_file)
print(table_documents)

### Embeddings

- In short, we take text and convert it into embeddings (a numerical representation).
- A similarity search (for example, cosine similarity) then gives a score for how closely two pieces of text match.

In [None]:
## simple example for embeddings
import numpy as np 
# simple 3D example  (real embeddings have 100+ Dimensions)
# Let's take movies search, with 3 axes (Action, Comedy, Suspense)
# Each axis value represents the intensity/score of Action, Comedy, Suspense

word_embeddings = {
    "Iron_Man": [0.8, 0.3, 0.2],
    "Hulk": [0.9, 0.3, 0.2],
    "Shawshank_Redemption": [0.2, 0.3, 0.9]
}

# Similarity check
# Imported under an alias so it cannot be confused with (or shadowed by) the
# hand-written `cosine_similarity(vec1, vec2)` function defined in a later cell,
# which takes two 1-D vectors instead of two 2-D arrays.
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity

# Extract vectors
vec1 = word_embeddings["Iron_Man"]
vec2 = word_embeddings["Hulk"]
vec3 = word_embeddings["Shawshank_Redemption"]

# Compute pairwise cosine similarities.
# Each vector is wrapped in a list to form a one-row 2-D input, and [0][0]
# picks the single score out of the returned matrix.
sim_iron_hulk = sk_cosine_similarity([vec1], [vec2])[0][0]
sim_iron_shawshank = sk_cosine_similarity([vec1], [vec3])[0][0]
sim_hulk_shawshank = sk_cosine_similarity([vec2], [vec3])[0][0]

# Print results
print(f"Similarity (Iron_Man vs Hulk): {sim_iron_hulk:.3f}")
print(f"Similarity (Iron_Man vs Shawshank_Redemption): {sim_iron_shawshank:.3f}")
print(f"Similarity (Hulk vs Shawshank_Redemption): {sim_hulk_shawshank:.3f}")

# We represent each movie as a 3D vector:
#   Axis 1 -> Action
#   Axis 2 -> Comedy
#   Axis 3 -> Suspense
# Cosine similarity measures how directionally similar two vectors are
# (values between -1 and 1).
# A value close to 1 means the vectors match; a value close to -1 means they
# point in opposite directions (no match).


In [None]:
#let use real models to convert the text into embeddings

from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

text = "hello, i am learning embeedings"
embed = embeddings.embed_query(text)  # embed_query embeds a single string
# embeds = embeddings.embed_documents([..])  # embed_documents embeds a list of strings
print(f" Text: {text}")
print(f"Embedding length: {len(embed)}")  # fixed length: this model produces 384-dimensional vectors
# Show only a short preview instead of dumping every value of the vector.
print(f"First 5 values: {embed[:5]}")


In [None]:
# Function for cosine-similarity search
def cosine_similarity(vec1, vec2):
    """
    Calculate the cosine similarity between two vectors.

    Cosine similarity measures the angle between two vectors in a multi-dimensional space.
    - A value close to 1 → vectors point in the same direction (very similar)
    - A value close to 0 → vectors are orthogonal (not related)
    - A value close to -1 → vectors point in opposite directions (opposite meanings)

    Args:
        vec1: First vector (1-D sequence of numbers).
        vec2: Second vector, same length as `vec1`.

    Returns:
        The cosine of the angle between the vectors, or 0.0 when either vector
        has zero length (the angle is undefined there, and dividing would
        produce NaN plus a runtime warning).
    """
    # Step 1: dot product - how much the two vectors point in the same direction.
    dot_product = np.dot(vec1, vec2)

    # Step 2: L2 norm (magnitude) of each vector, i.e. the square root of the
    # sum of its squared elements.
    norm_a = np.linalg.norm(vec1)
    norm_b = np.linalg.norm(vec2)

    # Step 3: guard against a zero-length vector before dividing.
    denominator = norm_a * norm_b
    if denominator == 0:
        return 0.0

    # Step 4: cos(θ) = (A · B) / (||A|| * ||B||)
    return dot_product / denominator



In [None]:
# Example documents and query for the semantic-search demo.
# NOTE: this rebinds the notebook-level name `documents` to a plain list of strings.

documents = [
    "LangChain is a framework used to build applications powered by large language models.",
    "Python is a high-level programming language widely used for automation and data processing.",
    "Machine learning is a field of artificial intelligence that focuses on training models from data.",
    "Embeddings convert text into numerical vectors so semantic similarity can be measured.",
    "The weather is sunny today with clear skies and warm temperatures."
]

# The query to match against the documents above.
question = "Is it raining today?"

# Embedding model used by the semantic_search function defined below.
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
)

def semantic_search(query, documents, top_k=3):
    """Rank `documents` against `query` by cosine similarity.

    Uses the notebook-level `embeddings` model and `cosine_similarity` function.

    Args:
        query: The search text.
        documents: List of document strings to rank.
        top_k: How many of the best matches to return.

    Returns:
        A list of up to `top_k` `(score, document)` tuples, highest score first.
    """
    # Embed the query (one vector) and the documents (one vector each).
    query_vector = embeddings.embed_query(query)
    document_vectors = embeddings.embed_documents(documents)

    # Pair every similarity score with the document text it belongs to.
    scored = [
        (cosine_similarity(query_vector, doc_vector), documents[position])
        for position, doc_vector in enumerate(document_vectors)
    ]

    # Highest similarity first, then keep the top_k best matches.
    ranked = sorted(scored, reverse=True)
    return ranked[:top_k]
# Show the query and its two best matches.
print(f"Question: {question}")
# Python f-strings interpolate with {...}; the leading "$" (JavaScript template
# syntax) was being printed literally, so it is removed.
print(f"matching results: {semantic_search(question, documents, 2)}")


### Store the embeddings into a vector store

In [None]:
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# your raw texts
documents = [
    "LangChain is a framework used to build applications powered by large language models.",
    "Python is a high-level programming language widely used for automation and data processing.",
    "Machine learning is a field of artificial intelligence that focuses on training models from data.",
    "Embeddings convert text into numerical vectors so semantic similarity can be measured.",
    "The weather is sunny today with clear skies and warm temperatures."
]

# create DocumentList
doc_list = [Document(page_content=text) for text in documents]

# Stable, deterministic ids: the collection is persisted on disk, so without
# explicit ids every re-run of this cell would add another copy of the same
# documents and searches would start returning duplicates. With fixed ids a
# re-run overwrites the existing entries instead.
# (Copies already stored by earlier runs without ids are not removed.)
doc_ids = [f"info-doc-{position}" for position in range(len(doc_list))]

# choose an embedding model (simple + local)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# store in Chroma
vector_store = Chroma.from_documents(
    documents=doc_list,
    embedding=embedding_model,
    ids=doc_ids,
    collection_name="info_docs",
    persist_directory="./chrom_db"
)

print("Stored successfully!")
question = "what is large language models"
print("SEarch question:"+ question)
print("*******RESULT*********")
# Top 3 matches, first without and then with their distance scores.
print(vector_store.similarity_search(question, 3))
print(vector_store.similarity_search_with_score(question, 3))


### Vector Database

In [None]:
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

load_dotenv()

def save_into_vector(doc_list, index_name="leave-policy-index"):
    """Embed `doc_list` and store it in a Pinecone index, creating the index if needed.

    Args:
        doc_list: Documents to embed and upload.
        index_name: Name of the Pinecone index (comparable to a database name).

    Returns:
        The PineconeVectorStore connected to the index.

    Raises:
        KeyError: If the PINECONE_API_KEY environment variable is not set.
        TimeoutError: If a newly created index is not ready within two minutes.
    """
    # Local imports: this cell's import list does not include these modules,
    # so the function must not depend on another cell having imported them.
    import os
    import time

    # 1. Create embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # 2. Initialize Pinecone client
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

    # 3. Create the Pinecone index if it doesn't exist
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=384,  # MiniLM-L6-v2 embedding dimension
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )

        # A freshly created index is not usable immediately; poll until it
        # reports ready, giving up after a bounded wait.
        deadline = time.monotonic() + 120
        while not pc.describe_index(index_name).status["ready"]:
            if time.monotonic() > deadline:
                raise TimeoutError(
                    f"Pinecone index '{index_name}' was not ready after 120 seconds"
                )
            time.sleep(1)

    # 4. Create vector store (embeds the docs and stores them in Pinecone).
    #    The store connects to the index by name, so no separate index handle
    #    is needed here.
    vector_store = PineconeVectorStore.from_documents(
        documents=doc_list,
        embedding=embeddings,
        index_name=index_name
    )

    print("Documents stored in Pinecone")
    return vector_store




# `Document` is not among this cell's imports; import it here so the cell does
# not depend on an earlier cell having been run.
from langchain_core.documents import Document

# your raw texts
documents = [
    "LangChain is a framework used to build applications powered by large language models.",
    "Python is a high-level programming language widely used for automation and data processing.",
    "Machine learning is a field of artificial intelligence that focuses on training models from data.",
    "Embeddings convert text into numerical vectors so semantic similarity can be measured.",
    "The weather is sunny today with clear skies and warm temperatures."
]

# create DocumentList
doc_list = [Document(page_content=text) for text in documents]

# Upload the documents, then query the index for the two closest matches.
vector_DB = save_into_vector(doc_list)

output = vector_DB.similarity_search("LLM framework", k=2)

print(output)

### Semantic chunking