In [1]:
%pip install --upgrade --quiet langchain-text-splitters tiktoken

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Demonstration of Various Methods in LangChain's Text Splitter

from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from transformers import AutoTokenizer

In [3]:
# Sample Text and Loader Setup
# Pin the encoding explicitly: when `encoding` is left unset, TextLoader falls
# back to the platform's default encoding, which can fail or garble the
# non-ASCII punctuation in this speech on non-UTF-8 locales.
# (assumes the sample file is stored as UTF-8 -- confirm if it is replaced)
txt_file_loader = TextLoader(
    "../../00-example_data/state_of_the_union.txt",
    encoding="utf-8",
)
txt_file_documents = txt_file_loader.load()

In [4]:
# Echo the list returned by txt_file_loader.load(); the repr includes the full page_content.
txt_file_documents

[Document(metadata={'source': '../../00-example_data/state_of_the_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their c

In [5]:
# Number of Document objects the loader returned for the text file.
len(txt_file_documents)

1

In [6]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

# Configure a splitter with a 1000-unit chunk target and no overlap between
# neighbouring chunks, then split the loaded documents with it.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
s_text = text_splitter.split_documents(documents=txt_file_documents)

In [7]:
# Number of chunks text_splitter.split_documents produced from txt_file_documents.
len(s_text)

42

In [8]:
# Echo every chunk in s_text; each one keeps the 'source' metadata of the original document.
s_text

[Document(metadata={'source': '../../00-example_data/state_of_the_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their c

In [10]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import CharacterTextSplitter

# The splitter does not depend on the fetched page, so configure it first.
# from_tiktoken_encoder makes chunk_size / chunk_overlap count tiktoken tokens.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=0,
    chunk_size=1000,
)

# Download the blog post, then split the resulting documents.
web_loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
web_documents = web_loader.load()
split_docs = text_splitter.split_documents(documents=web_documents)

print(f"Generated {len(split_docs)} documents.")

Created a chunk of size 1003, which is longer than the specified 1000


Generated 14 documents.


In [11]:
# Echo the unsplit documents returned by web_loader.load().
web_documents

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final resu

In [12]:
# Echo the chunks produced from web_documents by the tiktoken-based splitter.
split_docs

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final resu

In [13]:
# Create Documents
# create_documents takes raw strings plus one metadata dict per string and
# returns Document objects; each metadata dict is attached to the chunks of
# the string at the same index.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
texts = [
    "LangChain enables easier integration of LLMs.",
    "Text splitters enhance text processing.",
]
metadatas = [{"source": label} for label in ("example1", "example2")]
created_docs = text_splitter.create_documents(texts=texts, metadatas=metadatas)
print("Created Documents:", created_docs)

Created Documents: [Document(metadata={'source': 'example1'}, page_content='LangChain enables easier integration of LLMs.'), Document(metadata={'source': 'example2'}, page_content='Text splitters enhance text processing.')]


In [14]:
# From HuggingFace Tokenizer
# The sample is written as two adjacent literals so that the trailing space
# before the line break is visible in the source instead of hiding at the end
# of a line inside a triple-quoted string.
text = (
    "LangChain provides a suite of tools for building language model applications. \n"
    "Text splitting is a key preprocessing step for managing large texts."
)

# Chunk lengths are measured with the tokenizer passed in here.
huggingface_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
hf_text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    huggingface_tokenizer,
    chunk_overlap=10,
    chunk_size=100,
)

# Split a plain string first, then the Document list loaded from the text file.
hf_chunks_text = hf_text_splitter.split_text(text)
hf_chunks_documents = hf_text_splitter.split_documents(txt_file_documents)

print("HuggingFace Tokenizer Chunks from text:", hf_chunks_text)
print("HuggingFace Tokenizer Chunks from documents:", hf_chunks_documents)


HuggingFace Tokenizer Chunks from text: ['LangChain provides a suite of tools for building language model applications. \nText splitting is a key preprocessing step for managing large texts.']
HuggingFace Tokenizer Chunks from documents: [Document(metadata={'source': '../../00-example_data/state_of_the_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution.'), Document(metadata={'source': '../../00-example_data/state_of_the_union.txt'}, page_content='And with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of th

In [15]:
# Number of chunks hf_text_splitter.split_text produced for the short sample string.
len(hf_chunks_text)

1

In [16]:
# Number of chunks hf_text_splitter.split_documents produced for txt_file_documents.
len(hf_chunks_documents)

110

In [17]:
# Transform Documents
# Build the splitter inside this cell rather than reading the notebook-wide
# `text_splitter`: earlier cells rebind that name to differently configured
# splitters (character-based and tiktoken-based), so the result used to depend
# on which of those cells had run most recently. A dedicated name also leaves
# `text_splitter` untouched for any other cell that reads it.
transform_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
transformed_docs = transform_splitter.transform_documents(txt_file_documents)
print("Transformed Documents:", len(transformed_docs))

Transformed Documents: 42


In [18]:
# Transform Documents (Asynchronously)
async def async_transform_documents_demo():
    """Split ``txt_file_documents`` through the splitter's async API and print the chunk count.

    A fresh ``CharacterTextSplitter`` (chunk_size=1000, chunk_overlap=0) is
    created on every call, so the coroutine does not depend on whichever
    splitter the notebook-level ``text_splitter`` currently points to.
    """
    splitter_options = {"chunk_size": 1000, "chunk_overlap": 0}
    async_chunks = await CharacterTextSplitter(**splitter_options).atransform_documents(
        txt_file_documents
    )
    print("Transformed Documents (Async):", len(async_chunks))