### Data Ingestion

#### Document Structure

In [1]:
from langchain_core.documents import Document

In [3]:
# A Document couples a piece of text (page_content) with a dict of
# descriptive metadata about where that text came from.
doc = Document(
    metadata=dict(
        source="example.txt",
        pages=1,
        author="Sushmita",
        date_created="2026-02-16",
    ),
    page_content="this is the main text content I am using to create RAG",
)

In [4]:
# Echo the Document so its metadata and page_content can be inspected.
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Sushmita', 'date_created': '2026-02-16'}, page_content='this is the main text content I am using to create RAG')

#### Create a simple text file

In [8]:
import os
# Ensure the folder for the sample text files exists; exist_ok=True makes
# re-running this cell a no-op instead of an error.
os.makedirs(name="../data/text_files", exist_ok=True)

In [11]:
# Map each target file path to the text that should be written there.
sample_texts = {
    "../data/text_files/python.txt": '''Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.

It is one of the most popular programming languages in the world and is widely used in:

Web development

Data science

Artificial intelligence

Automation

Cybersecurity

Software development

Game development''',
}

# Write every sample to disk as UTF-8; mode "w" overwrites an existing file,
# so the cell can be re-run safely.
for filepath, content in sample_texts.items():
    with open(filepath, mode="w", encoding="utf-8") as text_file:
        text_file.write(content)

print("Sample file created")

Sample file created


### TextLoader

In [15]:
from langchain_community.document_loaders import TextLoader

# Point a TextLoader at the sample file, reading it as UTF-8 (the same
# encoding it was written with), then load it.
loader = TextLoader(
    file_path="../data/text_files/python.txt",
    encoding="utf-8",
)
document = loader.load()

In [16]:
# load() returned a list, so this prints a one-element list holding the
# Document built from python.txt (source path in metadata, text in page_content).
print(document)

[Document(metadata={'source': '../data/text_files/python.txt'}, page_content='Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.\n\nIt is one of the most popular programming languages in the world and is widely used in:\n\nWeb development\n\nData science\n\nArtificial intelligence\n\nAutomation\n\nCybersecurity\n\nSoftware development\n\nGame development')]


### Directory Loader

In [20]:
from langchain_community.document_loaders import DirectoryLoader

# Load every .txt file found under the text_files folder, delegating the
# actual reading of each file to TextLoader.
dir_loader = DirectoryLoader(
    path="../data/text_files",
    glob="**/*.txt",  # the "**/" prefix lets the pattern match in subfolders too
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},  # forwarded to each TextLoader
    show_progress=False,
)

documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\python.txt'}, page_content='Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean syntax and developer productivity.\n\nIt is one of the most popular programming languages in the world and is widely used in:\n\nWeb development\n\nData science\n\nArtificial intelligence\n\nAutomation\n\nCybersecurity\n\nSoftware development\n\nGame development')]