In [None]:
# Text-structured based

In [27]:
%pip install -qU langchain-text-splitters

Note: you may need to restart the kernel to use updated packages.


In [28]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load example document
with open("../../00-example_data/state_of_the_union.txt") as f:
    state_of_the_union = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.create_documents([state_of_the_union])
print(texts[0])
print(texts[1])

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and'
page_content='of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.'


In [29]:
texts

[Document(metadata={}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and'),
 Document(metadata={}, page_content='of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.'),
 Document(metadata={}, page_content='Last year COVID-19 kept us apart. This year we are finally together again.'),
 Document(metadata={}, page_content='Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.'),
 Document(metadata={}, page_content='With a duty to one another to the American people to the Constitution.'),
 Document(metadata={}, page_content='And with an unwavering resolve that freedom will always triumph over tyranny.'),
 Document(metadata={}, page_content='Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he'),
 Document(metadata={}, page_content='world thinking he could make it bend to his menacing ways. But he badly miscalculated

### Using Word Count as the Length Function

In [30]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Define a custom length function to count words instead of characters
def word_count(text):
    return len(text.split())

# Initialize the RecursiveCharacterTextSplitter with the custom length function
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=20,  # 20 words per chunk
    chunk_overlap=5,  # 5-word overlap
    length_function=word_count,
    is_separator_regex=False,
)

# Load example document
with open("../../00-example_data/state_of_the_union.txt") as f:
    state_of_the_union = f.read()

# Split the document
texts = text_splitter.create_documents([state_of_the_union])
print(f"Chunk 1:\n{texts[0].page_content}\n")
print(f"Chunk 2:\n{texts[1].page_content}\n")


Chunk 1:
Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the

Chunk 2:
the Cabinet. Justices of the Supreme Court. My fellow Americans.



In [31]:
texts

[Document(metadata={}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the'),
 Document(metadata={}, page_content='the Cabinet. Justices of the Supreme Court. My fellow Americans.'),
 Document(metadata={}, page_content='Last year COVID-19 kept us apart. This year we are finally together again.'),
 Document(metadata={}, page_content='Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.'),
 Document(metadata={}, page_content='With a duty to one another to the American people to the Constitution.'),
 Document(metadata={}, page_content='And with an unwavering resolve that freedom will always triumph over tyranny.'),
 Document(metadata={}, page_content='Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it'),
 Document(metadata={}, page_content='thinking he could make it bend to his menacing ways. But h

### Using a Regular Expression as a Separator

In [32]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Define a custom regex separator for splitting at punctuation marks
separator_regex = r"[.!?]"

# Initialize the RecursiveCharacterTextSplitter with regex-based separation
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,  # 100 characters per chunk
    chunk_overlap=10,  # 10-character overlap
    length_function=len,
    is_separator_regex=True,  # Use regex for splitting
    separators=[separator_regex],  # Regex for sentence-ending punctuation
)

# Load example document
with open("../../00-example_data/state_of_the_union.txt") as f:
    state_of_the_union = f.read()

# Split the document
texts = text_splitter.create_documents([state_of_the_union])
print(f"Chunk 1:\n{texts[0].page_content}\n")
print(f"Chunk 2:\n{texts[1].page_content}\n")


Chunk 1:
Madam Speaker, Madam Vice President, our First Lady and Second Gentleman

Chunk 2:
. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans



### Combining Multiple Separators with Regex

In [33]:
# Define a regex separator for splitting at multiple punctuation marks
separator_regex = r"[.!?;:]"

# Initialize the RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,  # 150 characters per chunk
    chunk_overlap=30,  # 30-character overlap
    length_function=len,
    is_separator_regex=True,
    separators=[separator_regex],  # Split at multiple punctuation marks
)

# Split the document
texts = text_splitter.create_documents([state_of_the_union])
print(f"Chunk 1:\n{texts[0].page_content}\n")
print(f"Chunk 2:\n{texts[1].page_content}\n")


Chunk 1:
Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court

Chunk 2:
. My fellow Americans.  

Last year COVID-19 kept us apart. This year we are finally together again



### Token-Based Length Function

In [34]:
from transformers import AutoTokenizer

# Load a HuggingFace tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Define a token-based length function
def token_length(text):
    return len(tokenizer.encode(text))

# Initialize the RecursiveCharacterTextSplitter with token-based length calculation
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,  # 50 tokens per chunk
    chunk_overlap=10,  # 10-token overlap
    length_function=token_length,
    is_separator_regex=False,  # No regex for separator
)

# Split the document
texts = text_splitter.create_documents([state_of_the_union])
print(f"Chunk 1:\n{texts[0].page_content}\n")
print(f"Chunk 2:\n{texts[1].page_content}\n")


Chunk 1:
Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.

Chunk 2:
Last year COVID-19 kept us apart. This year we are finally together again. 

Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.



In [35]:
# Custom separators for Chinese and Japanese text
custom_separators = [
    "\n\n", "\n", " ", "", ".", "．", "。", ",", "，", "、", "\u200b"  # Includes zero-width space
]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=30,
    is_separator_regex=False,
    separators=custom_separators,
)

texts = text_splitter.create_documents([state_of_the_union])
print(f"Chunk 1:\n{texts[0].page_content}\n")
print(f"Chunk 2:\n{texts[1].page_content}\n")


Chunk 1:
Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My

Chunk 2:
of the Supreme Court. My fellow Americans.

