In [None]:
%pip install --upgrade --quiet langchain-text-splitters tiktoken

In [2]:
# Demonstration of Various Methods in LangChain's Text Splitter

from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from transformers import AutoTokenizer

In [3]:
# Sample Text and Loader Setup
# The encoding is pinned so the file is decoded the same way on every platform;
# when it is omitted, TextLoader opens the file with the locale's default encoding.
# NOTE(review): assumes the sample file is UTF-8 — confirm.
txt_file_loader = TextLoader(
    "../../00-example_data/state_of_the_union.txt",
    encoding="utf-8",
)
txt_file_documents = txt_file_loader.load()

In [None]:
# Show what the loader returned (the bare expression is rendered as the cell output).
txt_file_documents

In [None]:
# Number of items returned by txt_file_loader.load().
len(txt_file_documents)

In [6]:
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader

# Character-based splitter: chunks target 1000 units with no overlap between
# neighbouring chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)

# Split the documents loaded from the text file into chunk documents.
s_text = text_splitter.split_documents(txt_file_documents)

In [None]:
# Number of chunks produced by split_documents() on the text-file documents.
len(s_text)

In [None]:
# Inspect the chunks themselves (rendered as the cell output).
s_text

In [None]:
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

# Load the blog post from the web.
web_loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
web_documents = web_loader.load()

# Same CharacterTextSplitter class, but built through the tiktoken-based
# constructor instead of the plain one.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
split_docs = text_splitter.split_documents(web_documents)

# Report how many chunks came out of the split.
print(f"Generated {len(split_docs)} documents.")

In [None]:
# Show what web_loader.load() returned, before splitting.
web_documents

In [None]:
# Show the chunks produced from the web documents by the tiktoken-based splitter.
split_docs

In [None]:
# Create Documents
# Raw strings and their metadata are passed as two parallel lists.
texts = [
    "LangChain enables easier integration of LLMs.",
    "Text splitters enhance text processing.",
]
metadatas = [
    {"source": "example1"},
    {"source": "example2"},
]

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
created_docs = text_splitter.create_documents(texts, metadatas=metadatas)
print("Created Documents:", created_docs)

In [None]:
# From HuggingFace Tokenizer
# Short two-line sample used for the split_text() call below.
text = """LangChain provides a suite of tools for building language model applications. 
Text splitting is a key preprocessing step for managing large texts."""

# Build the splitter from a pretrained HuggingFace tokenizer.
huggingface_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
hf_text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer=huggingface_tokenizer,
    chunk_size=100,
    chunk_overlap=10,
)

# 1) Split a plain string.
hf_chunks_text = hf_text_splitter.split_text(text)
print("HuggingFace Tokenizer Chunks from text:", hf_chunks_text)

# 2) Split the documents loaded earlier from the text file.
hf_chunks_documents = hf_text_splitter.split_documents(txt_file_documents)
print("HuggingFace Tokenizer Chunks from documents:", hf_chunks_documents)


In [None]:
# Number of chunks produced from the sample string.
len(hf_chunks_text)

In [None]:
# Number of chunks produced from the text-file documents.
len(hf_chunks_documents)

In [None]:
# Transform Documents
# Build the splitter in this cell instead of reusing whichever `text_splitter`
# an earlier cell happened to bind last (one of them binds the name to a
# tiktoken-based splitter), so the result does not depend on execution order.
# The configuration matches the one used by the async demo.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
transformed_docs = text_splitter.transform_documents(txt_file_documents)
print("Transformed Documents:", len(transformed_docs))

In [18]:
# Transform Documents (Asynchronously)
async def async_transform_documents_demo():
    """Split the loaded documents through the splitter's async API and print the count.

    Builds a local CharacterTextSplitter (chunk_size=1000, chunk_overlap=0),
    awaits ``atransform_documents`` on the notebook-level ``txt_file_documents``
    and prints the number of resulting documents.

    NOTE(review): no call to this coroutine is visible in this part of the
    notebook. In a Jupyter cell it can be run with
    ``await async_transform_documents_demo()`` — confirm a later cell does so.
    """
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    transformed_docs = await text_splitter.atransform_documents(txt_file_documents)
    print("Transformed Documents (Async):", len(transformed_docs))