### Introduction to Data Ingestion 

In [1]:
import os
from typing import List, Dict, Any
import pandas as pd

In [2]:
# Core LangChain document type plus the three text splitters used in this notebook.
from langchain_core.documents import Document

# Import the splitters from the standalone `langchain_text_splitters` package
# (the same package the text-splitting section of this notebook imports from)
# instead of the legacy `langchain.text_splitter` path.
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter,
)
print("Set Up Complete!")

Set Up Complete!


### Understanding Document Structure in LangChain

In [3]:
# Build a minimal Document: `page_content` carries the text, `metadata` carries
# arbitrary key/value information that travels with it.
doc = Document(
    metadata=dict(
        source="example.txt",
        page=1,
        author="Tejaswi Shetty",
        date_created="2025-08-14",
        custom_field="any_value",
    ),
    page_content="This is the main text content that will be embedded and searched.",
)

# Show both parts of the document under a short heading.
print("Document Structure")
print(
    f"Content:{doc.page_content}",
    f"Metadata:{doc.metadata}",
    sep="\n",
)

Document Structure
Content:This is the main text content that will be embedded and searched.
Metadata:{'source': 'example.txt', 'page': 1, 'author': 'Tejaswi Shetty', 'date_created': '2025-08-14', 'custom_field': 'any_value'}


In [4]:
type(doc)

langchain_core.documents.base.Document

### Text Files (.txt) - The Simple Case

In [5]:
# Make sure the folder that will hold the sample .txt files exists.
# `exist_ok=True` keeps this cell safe to re-run.
import os

os.makedirs(name="data/text_files", exist_ok=True)

In [6]:
# Sample corpus: maps each output path to the exact text written into that file.
sample_texts = {
    "data/text_files/python_intro.txt": """Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",

    "data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
""",
}

for file_path, content in sample_texts.items():
    # Create the parent folder on demand so this cell does not depend on an
    # earlier cell having already created it (and stays safe to re-run).
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

print("Sample file created successfully!")

Sample file created successfully!


### TextLoader - Read a Single File

In [7]:
# TextLoader comes from `langchain_community`; the older
# `langchain.document_loaders` import that used to precede it was a redundant
# duplicate of the same name and has been dropped.
from langchain_community.document_loaders import TextLoader

# Load one UTF-8 text file; `load()` hands back a list of Document objects.
loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
documents = loader.load()
print(type(documents))
print(documents)

# Summarise what was loaded: count, a 100-character preview, and the metadata.
print(f"Loaded {len(documents)} document")
print(f"Content preview:{documents[0].page_content[:100]}...")
print(f"Metadata: {documents[0].metadata}")

<class 'list'>
[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]
Loaded 1 document
Content preview:Python Programming Introduction

Python is a high-level, interpreted programming language known for ...
Metadata: {'source': 'data/text_files/python_intro.txt'}


#### DirectoryLoader - multiple text files

In [8]:
from langchain_community.document_loaders import DirectoryLoader

### load all the text files from the directory
dir_loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",  # Pattern to match files
    loader_cls=TextLoader,  # loader class to use for each matched file
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)

# Later cells index `documents[0]`, so pin the order: sort by source path
# instead of relying on whatever order the directory scan happens to yield
# (NOTE(review): file-system listing order is presumably not guaranteed to be
# the same on every platform — sorting makes the notebook reproducible).
documents = sorted(dir_loader.load(), key=lambda loaded: loaded.metadata["source"])


# Report how many files were loaded, then the source path and size of each.
print(f"Loaded {len(documents)} documents")
for i, doc in enumerate(documents):
    print(f"\n Documents {i+1}")
    print(f" Source: {doc.metadata['source']}")
    print(f" Length: {len(doc.page_content)} characters")

100%|██████████| 2/2 [00:00<00:00, 180.22it/s]

Loaded 2 documents

 Documents 1
 Source: data\text_files\machine_learning.txt
 Length: 569 characters

 Documents 2
 Source: data\text_files\python_intro.txt
 Length: 489 characters





In [15]:
# Text splitters compared in the cells below.
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

# Print the full text of the first loaded document; this is the text the
# splitter cells below cut into chunks.
print(documents[0].page_content)


Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems



In [26]:
### Method 1 - Character Text Splitter
# Splits on a single separator ("\n" here). Chunk size and overlap are measured
# with `len`, i.e. in characters.

text = documents[0].page_content

print("Character text splitter")

char_splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=20, length_function=len)

# Cut the document text into chunks, then report the count and a short preview.
char_chunks = char_splitter.split_text(text)
print(f"Created {len(char_chunks)} chunks")
print(f"First chunk: {char_chunks[0][:100]}...")

Character text splitter
Created 4 chunks
First chunk: Machine Learning Basics
Machine learning is a subset of artificial intelligence that enables systems...


In [27]:
# Show the first three character-splitter chunks, separated by a dashed rule.
print(char_chunks[0], "-----", char_chunks[1], "-----", char_chunks[2], sep="\n")

Machine Learning Basics
Machine learning is a subset of artificial intelligence that enables systems to learn and improve
-----
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.
Types of Machine Learning:
-----
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties


In [37]:
### Method 2 - Recursive Character Text Splitter
# Same input text and size settings as Method 1, but with an ordered list of
# separators (paragraph break, newline, space, empty string) instead of one.
text = documents[0].page_content

print("Recursive Character text splitter")

recursive_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", " ", ""], chunk_size=200, chunk_overlap=20, length_function=len)

# Cut the document text into chunks, then report the count and a short preview.
recursive_chunks = recursive_splitter.split_text(text)
print(f"Created {len(recursive_chunks)} chunks")
print(f"First chunk: {recursive_chunks[0][:100]}...")

Recursive Character text splitter
Created 6 chunks
First chunk: Machine Learning Basics...


In [38]:
# Show the first three recursive-splitter chunks, separated by a dashed rule.
print(recursive_chunks[0], "-----", recursive_chunks[1], "-----", recursive_chunks[2], sep="\n")

Machine Learning Basics
-----
Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
-----
that can access data and use it to learn for themselves.


In [46]:
### Create text without natural break points
# One paragraph with no newlines, so a space is the only separator offered
# to the splitter.

simple_text = "This is sentence one and it is quite long. This is sentence two and it is also quite long. This is sentence three which is even longer than the others. This is sentence four. This is sentence five. This is sentence six."

splitter = RecursiveCharacterTextSplitter(
    separators=[" "],
    chunk_size=80,
    chunk_overlap=20,
    length_function=len,  # sizes are measured in characters
)

chunks = splitter.split_text(simple_text)

print(f"\n Simple text example - {len(chunks)} chunks\n")

# Print each pair of neighbouring chunks so the text they share (the overlap)
# is easy to spot. Each chunk is wrapped in matching single quotes so leading
# or trailing whitespace is visible.
for i in range(len(chunks) - 1):
    print(f"Chunks {i+1}: '{chunks[i]}'")
    print(f"Chunks {i+2}: '{chunks[i+1]}'")
    print()


 Simple text example - 4 chunks

Chunks 1: 'This is sentence one and it is quite long. This is sentence two and it is also
Chunks 2: 'two and it is also quite long. This is sentence three which is even longer than

Chunks 2: 'two and it is also quite long. This is sentence three which is even longer than
Chunks 3: 'is even longer than the others. This is sentence four. This is sentence five.

Chunks 3: 'is even longer than the others. This is sentence four. This is sentence five.
Chunks 4: 'is sentence five. This is sentence six.



In [47]:
# Method 3: Token-based splitting
# NOTE(review): for this splitter chunk_size / chunk_overlap are presumably
# counted in tokens rather than characters — confirm against the
# TokenTextSplitter documentation.

# "\n" (not "/n") so the heading is preceded by a blank line instead of a literal "/n".
print("\n TOKEN TEXT SPLITTER")
token_splitter = TokenTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
)

# `text` is the document text assigned in the earlier splitter cells.
token_chunks = token_splitter.split_text(text)
print(f"Created {len(token_chunks)} chunks")
print(f"First chunk: {token_chunks[0][:100]} ...")

/n TOKEN TEXT SPLITTER
Created 3 chunks
First chunk: Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables system ..
