In [1]:
#Task 9: Chunking
#Implement different types of document loaders and chunk the data using different chunking strategies.

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter
)
from langchain_core.documents import Document

# LOAD DOCUMENT (inline sample data)
def load_document():
    text = """
    Artificial Intelligence (AI) is transforming industries across the globe.
    From healthcare to finance, AI-driven solutions are improving efficiency,
    accuracy, and decision-making. One of the most impactful applications of AI
    is Natural Language Processing (NLP), which enables machines to understand
    and generate human language. Tools like chatbots, virtual assistants, and
    automated translation services rely heavily on NLP.

    Another critical area is computer vision, where AI systems can interpret
    and analyze visual data. This technology powers facial recognition,
    autonomous vehicles, and medical imaging diagnostics. As AI continues to
    evolve, ethical considerations such as bias, transparency, and privacy
    remain central to its development.

    Developers and engineers often use frameworks like TensorFlow, PyTorch,
    and LangChain to build AI applications. LangChain, in particular, is
    designed to simplify the integration of language models into workflows,
    making it easier to create applications that leverage large-scale text
    processing and reasoning.
    """
    documents = [Document(page_content=text)]
    return documents

# CHARACTER-BASED CHUNKING
def character_chunking(documents):
    print("\n========== CHARACTER CHUNKING ==========")
    splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)
    chunks = splitter.split_documents(documents)
    for i, chunk in enumerate(chunks, start=1):
        print(f"\nChunk {i}:\n{chunk.page_content}")

# RECURSIVE CHARACTER CHUNKING (BEST PRACTICE)
def recursive_chunking(documents):
    print("\n========== RECURSIVE CHUNKING ==========")
    splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
    chunks = splitter.split_documents(documents)
    for i, chunk in enumerate(chunks, start=1):
        print(f"\nChunk {i}:\n{chunk.page_content}")

# TOKEN-BASED CHUNKING
def token_chunking(documents):
    print("\n========== TOKEN CHUNKING ==========")
    splitter = TokenTextSplitter(chunk_size=30, chunk_overlap=5)
    chunks = splitter.split_documents(documents)
    for i, chunk in enumerate(chunks, start=1):
        print(f"\nChunk {i}:\n{chunk.page_content}")

# Main Execution
if __name__ == "__main__":
    docs = load_document()
    character_chunking(docs)
    recursive_chunking(docs)
    token_chunking(docs)

Created a chunk of size 449, which is longer than the specified 100
Created a chunk of size 339, which is longer than the specified 100




Chunk 1:
Artificial Intelligence (AI) is transforming industries across the globe.
    From healthcare to finance, AI-driven solutions are improving efficiency,
    accuracy, and decision-making. One of the most impactful applications of AI
    is Natural Language Processing (NLP), which enables machines to understand
    and generate human language. Tools like chatbots, virtual assistants, and
    automated translation services rely heavily on NLP.

Chunk 2:
Another critical area is computer vision, where AI systems can interpret
    and analyze visual data. This technology powers facial recognition,
    autonomous vehicles, and medical imaging diagnostics. As AI continues to
    evolve, ethical considerations such as bias, transparency, and privacy
    remain central to its development.

Chunk 3:
Developers and engineers often use frameworks like TensorFlow, PyTorch,
    and LangChain to build AI applications. LangChain, in particular, is
    designed to simplify the integration of