In [1]:
import os
import langchain
from typing import List, Dict, Any
import pandas as pd

In [4]:
from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)


  from .autonotebook import tqdm as notebook_tqdm


## Understanding Document Structure in LangChain

In [5]:
## Build a minimal Document: the text payload plus free-form metadata
doc = Document(
    metadata=dict(
        source="example.txt",
        page=1,
        author="Vikas Jangid",
        date_created="2025-11-18",
        custom_field="any_value",
    ),
    page_content="This is the main text content that will be embedded and searched.",
)

In [6]:
# Print the whole Document: both page_content and metadata appear in the output.
print(doc)

page_content='This is the main text content that will be embedded and searched.' metadata={'source': 'example.txt', 'page': 1, 'author': 'Vikas Jangid', 'date_created': '2025-11-18', 'custom_field': 'any_value'}


In [7]:
# Print only the text payload of the Document.
print(doc.page_content)

This is the main text content that will be embedded and searched.


In [8]:
# Print only the metadata dictionary attached to the Document.
print(doc.metadata)

{'source': 'example.txt', 'page': 1, 'author': 'Vikas Jangid', 'date_created': '2025-11-18', 'custom_field': 'any_value'}


In [9]:
# Display the class of the object created above.
type(doc)

langchain_core.documents.base.Document

## Text Files (.txt)

In [15]:
## Make sure the folder for the sample .txt files exists
## (exist_ok=True: no error when it is already there)

import os

os.makedirs(name="data/text_files", exist_ok=True)

In [13]:
# Mapping of target file path -> file content for the sample text files.
# The content is kept verbatim: it starts with a newline followed by a
# four-space indent and ends with a newline plus four spaces.
sample_text = dict()
sample_text["data/text_files/ai_future.txt"] = """
    Artificial Intelligence is transforming the world faster than any previous technology.
Every week, new tools emerge that automate complex tasks once done by humans.
Many people fear that AI will replace millions of jobs across industries.
But the truth is more balanced — AI is eliminating some roles while creating entirely new ones.
Companies today need AI specialists, prompt engineers, data analysts, automation experts, and AI ethicists.
The demand for AI-augmented skills is growing rapidly.
Employees who learn how to use AI tools are suddenly 5–10x more productive.
Instead of competing with AI, people are learning to collaborate with it.
Industries like healthcare, finance, marketing, and education are already seeing huge benefits.
AI is not just about replacement — it's about empowerment.
The future belongs to those who can adapt, reskill, and work alongside intelligent systems.
Countries investing in AI talent today will lead tomorrow’s economy.
And individuals who embrace AI early will unlock incredible career opportunities.
This is not the end of jobs — it’s the beginning of a new kind of workforce.
    """

In [17]:
# Write each sample entry to disk as UTF-8, overwriting any existing file.
for target_path in sample_text:
    with open(target_path, mode="w", encoding="utf-8") as handle:
        handle.write(sample_text[target_path])

print("Sample text files created.")

Sample text files created.


### Text Loader - Read Single File

In [18]:
from langchain_community.document_loaders import TextLoader

# Read one UTF-8 text file; load() returns a list of Document objects.
loader = TextLoader(file_path="data/text_files/ai_future.txt", encoding="utf-8")
documents = loader.load()

# Show the container type first, then its contents.
print(type(documents), documents, sep="\n")

<class 'list'>
[Document(metadata={'source': 'data/text_files/ai_future.txt'}, page_content="\n    Artificial Intelligence is transforming the world faster than any previous technology.\nEvery week, new tools emerge that automate complex tasks once done by humans.\nMany people fear that AI will replace millions of jobs across industries.\nBut the truth is more balanced — AI is eliminating some roles while creating entirely new ones.\nCompanies today need AI specialists, prompt engineers, data analysts, automation experts, and AI ethicists.\nThe demand for AI-augmented skills is growing rapidly.\nEmployees who learn how to use AI tools are suddenly 5–10x more productive.\nInstead of competing with AI, people are learning to collaborate with it.\nIndustries like healthcare, finance, marketing, and education are already seeing huge benefits.\nAI is not just about replacement — it's about empowerment.\nThe future belongs to those who can adapt, reskill, and work alongside intelligent syste

### Directory Loader - Multiple Text Files

In [19]:
from langchain_community.document_loaders import DirectoryLoader

# Load every .txt file under data/text_files (the "**" glob also matches
# sub-folders), reading each one with TextLoader as UTF-8 and showing a
# progress bar while loading.
loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)

documents = loader.load()

# Use a dedicated loop variable: naming it `doc` would overwrite the example
# Document created earlier in the notebook with the last loaded file.
for index, loaded_doc in enumerate(documents, start=1):
    print(f"--- Document {index} ---")
    print(loaded_doc)
    print("\n")

100%|██████████| 3/3 [00:00<00:00, 88.21it/s]

--- Document 1 ---
page_content='
    Artificial Intelligence is transforming the world faster than any previous technology.
Every week, new tools emerge that automate complex tasks once done by humans.
Many people fear that AI will replace millions of jobs across industries.
But the truth is more balanced — AI is eliminating some roles while creating entirely new ones.
Companies today need AI specialists, prompt engineers, data analysts, automation experts, and AI ethicists.
The demand for AI-augmented skills is growing rapidly.
Employees who learn how to use AI tools are suddenly 5–10x more productive.
Instead of competing with AI, people are learning to collaborate with it.
Industries like healthcare, finance, marketing, and education are already seeing huge benefits.
AI is not just about replacement — it's about empowerment.
The future belongs to those who can adapt, reskill, and work alongside intelligent systems.
Countries investing in AI talent today will lead tomorrow’s economy




## **Text Splitter**

In [20]:
# Same splitter imports as in the import cell near the top of the notebook,
# repeated here at the start of the splitting section.
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)

# Dump the loaded documents that the splitter examples below draw their text from.
print(documents)

[Document(metadata={'source': 'data\\text_files\\ai_future.txt'}, page_content="\n    Artificial Intelligence is transforming the world faster than any previous technology.\nEvery week, new tools emerge that automate complex tasks once done by humans.\nMany people fear that AI will replace millions of jobs across industries.\nBut the truth is more balanced — AI is eliminating some roles while creating entirely new ones.\nCompanies today need AI specialists, prompt engineers, data analysts, automation experts, and AI ethicists.\nThe demand for AI-augmented skills is growing rapidly.\nEmployees who learn how to use AI tools are suddenly 5–10x more productive.\nInstead of competing with AI, people are learning to collaborate with it.\nIndustries like healthcare, finance, marketing, and education are already seeing huge benefits.\nAI is not just about replacement — it's about empowerment.\nThe future belongs to those who can adapt, reskill, and work alongside intelligent systems.\nCountrie

## **Method(1) - Character Text Splitter**

In [21]:
# Pick the sample document by its source path rather than by position:
# the directory load above can return several files, and their order depends
# on how the file system lists them.
text = next(
    candidate.page_content
    for candidate in documents
    if candidate.metadata["source"].endswith("ai_future.txt")
)
text

"\n    Artificial Intelligence is transforming the world faster than any previous technology.\nEvery week, new tools emerge that automate complex tasks once done by humans.\nMany people fear that AI will replace millions of jobs across industries.\nBut the truth is more balanced — AI is eliminating some roles while creating entirely new ones.\nCompanies today need AI specialists, prompt engineers, data analysts, automation experts, and AI ethicists.\nThe demand for AI-augmented skills is growing rapidly.\nEmployees who learn how to use AI tools are suddenly 5–10x more productive.\nInstead of competing with AI, people are learning to collaborate with it.\nIndustries like healthcare, finance, marketing, and education are already seeing huge benefits.\nAI is not just about replacement — it's about empowerment.\nThe future belongs to those who can adapt, reskill, and work alongside intelligent systems.\nCountries investing in AI talent today will lead tomorrow’s economy.\nAnd individuals w

In [23]:
# Split on single newlines only; chunk size and overlap are measured in
# characters because the length function is `len`.
char_splitter = CharacterTextSplitter(
    length_function=len,
    chunk_overlap=20,
    chunk_size=200,
    separator="\n",
)
char_chunks = char_splitter.split_text(text)

# Report how many chunks were produced, then the chunks themselves.
print(f"Number of Chunks: {len(char_chunks)}", char_chunks, sep="\n")

Number of Chunks: 7
['Artificial Intelligence is transforming the world faster than any previous technology.\nEvery week, new tools emerge that automate complex tasks once done by humans.', 'Many people fear that AI will replace millions of jobs across industries.\nBut the truth is more balanced — AI is eliminating some roles while creating entirely new ones.', 'Companies today need AI specialists, prompt engineers, data analysts, automation experts, and AI ethicists.\nThe demand for AI-augmented skills is growing rapidly.', 'Employees who learn how to use AI tools are suddenly 5–10x more productive.\nInstead of competing with AI, people are learning to collaborate with it.', "Industries like healthcare, finance, marketing, and education are already seeing huge benefits.\nAI is not just about replacement — it's about empowerment.", 'The future belongs to those who can adapt, reskill, and work alongside intelligent systems.\nCountries investing in AI talent today will lead tomorrow’s ec

In [26]:
# Preview the first four chunks with a dashed rule between them.
# Slicing (instead of indexing 0..3 one by one) prints the same output when
# four or more chunks exist and does not raise IndexError when there are fewer.
print(*char_chunks[:4], sep="\n-----------------\n")

Artificial Intelligence is transforming the world faster than any previous technology.
Every week, new tools emerge that automate complex tasks once done by humans.
-----------------
Many people fear that AI will replace millions of jobs across industries.
But the truth is more balanced — AI is eliminating some roles while creating entirely new ones.
-----------------
Companies today need AI specialists, prompt engineers, data analysts, automation experts, and AI ethicists.
The demand for AI-augmented skills is growing rapidly.
-----------------
Employees who learn how to use AI tools are suddenly 5–10x more productive.
Instead of competing with AI, people are learning to collaborate with it.


## **Method(2) - Recursive Character Text Splitter**

In [27]:
# Separator hierarchy, tried in order: blank lines first, then single
# newlines, then spaces, and finally individual characters.
recursive_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=20,
    chunk_size=200,
    separators=["\n\n", "\n", " ", ""],
)

In [28]:
# Split the sample text with the recursive splitter and report the result.
recursive_chunks = recursive_splitter.split_text(text)
print(f"Number of Chunks: {len(recursive_chunks)}", recursive_chunks, sep="\n")

Number of Chunks: 7
['Artificial Intelligence is transforming the world faster than any previous technology.\nEvery week, new tools emerge that automate complex tasks once done by humans.', 'Many people fear that AI will replace millions of jobs across industries.\nBut the truth is more balanced — AI is eliminating some roles while creating entirely new ones.', 'Companies today need AI specialists, prompt engineers, data analysts, automation experts, and AI ethicists.\nThe demand for AI-augmented skills is growing rapidly.', 'Employees who learn how to use AI tools are suddenly 5–10x more productive.\nInstead of competing with AI, people are learning to collaborate with it.', "Industries like healthcare, finance, marketing, and education are already seeing huge benefits.\nAI is not just about replacement — it's about empowerment.", 'The future belongs to those who can adapt, reskill, and work alongside intelligent systems.\nCountries investing in AI talent today will lead tomorrow’s ec

In [37]:
# Variant for comparison: allow splitting on spaces only. Without newline
# separators the chunks are cut at word boundaries and can span line breaks.
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=[" "],  # `separators` takes a list of strings, not a bare string
    chunk_size=200,
    chunk_overlap=20,
    length_function=len
)

In [38]:
# Re-run the split with the currently defined recursive splitter and report the result.
recursive_chunks = recursive_splitter.split_text(text)
print(f"Number of Chunks: {len(recursive_chunks)}", recursive_chunks, sep="\n")

Number of Chunks: 7
['Artificial Intelligence is transforming the world faster than any previous technology.\nEvery week, new tools emerge that automate complex tasks once done by humans.\nMany people fear that AI will', 'fear that AI will replace millions of jobs across industries.\nBut the truth is more balanced — AI is eliminating some roles while creating entirely new ones.\nCompanies today need AI specialists,', 'AI specialists, prompt engineers, data analysts, automation experts, and AI ethicists.\nThe demand for AI-augmented skills is growing rapidly.\nEmployees who learn how to use AI tools are suddenly', 'tools are suddenly 5–10x more productive.\nInstead of competing with AI, people are learning to collaborate with it.\nIndustries like healthcare, finance, marketing, and education are already seeing', "are already seeing huge benefits.\nAI is not just about replacement — it's about empowerment.\nThe future belongs to those who can adapt, reskill, and work alongside intelligen

## **Method(3) - Token-based Splitter**

In [40]:
# Token-based splitting: chunk_size and chunk_overlap are given to
# TokenTextSplitter, which sizes chunks by tokens rather than characters.
token_splitter = TokenTextSplitter(chunk_overlap=10, chunk_size=40)
token_chunks = token_splitter.split_text(text)

# Report how many chunks were produced, then the chunks themselves.
print(f"Number of Chunks: {len(token_chunks)}", token_chunks, sep="\n")

Number of Chunks: 8
['\n    Artificial Intelligence is transforming the world faster than any previous technology.\nEvery week, new tools emerge that automate complex tasks once done by humans.\nMany people fear that AI will replace', ' humans.\nMany people fear that AI will replace millions of jobs across industries.\nBut the truth is more balanced — AI is eliminating some roles while creating entirely new ones.\nCompanies today need AI', ' creating entirely new ones.\nCompanies today need AI specialists, prompt engineers, data analysts, automation experts, and AI ethicists.\nThe demand for AI-augmented skills is growing rapidly.', ' AI-augmented skills is growing rapidly.\nEmployees who learn how to use AI tools are suddenly 5–10x more productive.\nInstead of competing with AI, people are learning to', 'Instead of competing with AI, people are learning to collaborate with it.\nIndustries like healthcare, finance, marketing, and education are already seeing huge benefits.\nAI is not j