In [1]:
from langchain_community.document_loaders import TextLoader

In [2]:
# Read the sample text file into a list of LangChain Document objects.
documents = TextLoader(file_path="../data/sample.txt").load()

In [3]:
# Display the raw text of the first loaded document.
documents[0].page_content

'Terms And Conditions\nThese Terms of Use ("Terms") constitute an enforceable contract between you and Euron ("Euron", "we", or "our"), a subsidiary of Engage Sphere Technology Private Limited. By accessing or using our website, mobile applications, and related services (collectively, "Services"), you agree to be bound by these Terms. Please review them carefully as they contain important information about your legal rights, remedies, and obligations.\n\nTable of Contents\nAccounts\nCommunications\nContent Enrollment and Access\nPayments and Refunds\nDigital Product Access & Shipping Policy\nContent and Behavior Rules\nEuron\'s Rights to Content You Post\nUsing Euron at Your Own Risk\nEuron\'s Rights\nSubscription Terms\nMiscellaneous Legal Terms\nDispute Resolution\nUpdating These Terms\nHow to Contact Us\n1. Accounts\nYou need an account for most activities on our platform. Keep your password somewhere safe because you\'re responsible for all activity associated with your account. If

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [7]:
# Recursive character splitter. The separators are listed from coarsest
# (paragraph break) to finest (empty string, i.e. split anywhere).
spliter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "?", "!", " ", ""],
    chunk_size=50,  # upper bound on chunk length
    chunk_overlap=10,  # text shared between neighbouring chunks
)

# Split every loaded document; each chunk keeps the metadata of its source document.
chunks = spliter.split_documents(documents)

# Preview only the first few chunks instead of dumping the whole list.
chunks[:5]

[Document(metadata={'source': '../data/sample.txt'}, page_content='Terms And Conditions'),
 Document(metadata={'source': '../data/sample.txt'}, page_content='These Terms of Use ("Terms") constitute an'),
 Document(metadata={'source': '../data/sample.txt'}, page_content='an enforceable contract between you and Euron'),
 Document(metadata={'source': '../data/sample.txt'}, page_content='and Euron ("Euron", "we", or "our"), a subsidiary'),
 Document(metadata={'source': '../data/sample.txt'}, page_content='of Engage Sphere Technology Private Limited')]

In [8]:
from langchain_text_splitters import CharacterTextSplitter

In [9]:
# Single-separator splitter: the text is cut on spaces only, so newlines
# stay inside the resulting chunks.
char_spliter = CharacterTextSplitter(
    separator=" ",
    chunk_size=50,
    chunk_overlap=10,
)

# Display the chunks produced for the loaded documents.
char_spliter.split_documents(documents)

[Document(metadata={'source': '../data/sample.txt'}, page_content='Terms And Conditions\nThese Terms of Use ("Terms")'),
 Document(metadata={'source': '../data/sample.txt'}, page_content='("Terms") constitute an enforceable contract'),
 Document(metadata={'source': '../data/sample.txt'}, page_content='contract between you and Euron ("Euron", "we", or'),
 Document(metadata={'source': '../data/sample.txt'}, page_content='"we", or "our"), a subsidiary of Engage Sphere'),
 Document(metadata={'source': '../data/sample.txt'}, page_content='Sphere Technology Private Limited. By accessing or'),
 Document(metadata={'source': '../data/sample.txt'}, page_content='or using our website, mobile applications, and'),
 Document(metadata={'source': '../data/sample.txt'}, page_content='and related services (collectively, "Services"),'),
 Document(metadata={'source': '../data/sample.txt'}, page_content='you agree to be bound by these Terms. Please'),
 Document(metadata={'source': '../data/sample.txt'}, pa

In [10]:
from langchain_text_splitters import TokenTextSplitter

# Token-based splitter: chunk_size and chunk_overlap are counted in tokens,
# not characters, so chunks are much longer than 50 characters.
token_text_splitter = TokenTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
)

In [11]:
# Split the loaded documents with the token-based splitter and display the chunks.
token_text_splitter.split_documents(documents)

[Document(metadata={'source': '../data/sample.txt'}, page_content='Terms And Conditions\nThese Terms of Use ("Terms") constitute an enforceable contract between you and Euron ("Euron", "we", or "our"), a subsidiary of Engage Sphere Technology Private Limited. By accessing or using our website'),
 Document(metadata={'source': '../data/sample.txt'}, page_content=' Technology Private Limited. By accessing or using our website, mobile applications, and related services (collectively, "Services"), you agree to be bound by these Terms. Please review them carefully as they contain important information about your legal rights, remedies, and'),
 Document(metadata={'source': '../data/sample.txt'}, page_content=' important information about your legal rights, remedies, and obligations.\n\nTable of Contents\nAccounts\nCommunications\nContent Enrollment and Access\nPayments and Refunds\nDigital Product Access & Shipping Policy\nContent and Behavior Rules\nE'),
 Document(metadata={'source': '../dat

## Add metadata

In [12]:
# Inspect the metadata attached to the first loaded document.
documents[0].metadata

{'source': '../data/sample.txt'}

In [16]:
# Check which class the loader returns for each document.
type(documents[0])

langchain_core.documents.base.Document

In [19]:
# Attach a custom "creator" entry to the first document's metadata dict (mutates it in place).
documents[0].metadata.update(creator="Tejas")

In [20]:
# Re-inspect the metadata to confirm the "creator" entry was added.
documents[0].metadata

{'source': '../data/sample.txt', 'creator': 'Tejas'}

## Chunk size recommendation

| Use case       | Ideal chunk size |
|----------------|------------------|
| RAG Search     | 300-512          |
| Summarization  | 512-1024         |
| Q&A            | 500-1000         |
| Classification | 100-300          |