In [None]:
# Text-structure-based splitting

In [None]:
%pip install -qU langchain-text-splitters

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the example document. The encoding is passed explicitly so the read does
# not depend on the platform's default locale encoding.
# NOTE(review): the file is assumed to be UTF-8 — confirm if the data set changes.
with open("../../00-example_data/state_of_the_union.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()

# Sizes are measured in characters because length_function is len.
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

# Wrap each chunk in a Document and show the first two.
texts = text_splitter.create_documents([state_of_the_union])
print(texts[0])
print(texts[1])

In [None]:
# Preview only the first few chunks; displaying the whole list floods the output
texts[:5]

### Using Word Count as the Length Function

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Define a custom length function to count words instead of characters
def word_count(text):
    """Measure ``text`` in whitespace-delimited words rather than characters."""
    words = text.split()
    return len(words)

# Build a splitter whose chunk_size / chunk_overlap are measured in words,
# because every length is computed with word_count instead of len.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=20,  # 20 words per chunk
    chunk_overlap=5,  # 5-word overlap
    length_function=word_count,
    is_separator_regex=False,
)

# Load the example document. The encoding is passed explicitly so the read does
# not depend on the platform's default locale encoding.
# NOTE(review): the file is assumed to be UTF-8 — confirm if the data set changes.
with open("../../00-example_data/state_of_the_union.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()

# Split the document and show the text of the first two chunks
texts = text_splitter.create_documents([state_of_the_union])
print(f"Chunk 1:\n{texts[0].page_content}\n")
print(f"Chunk 2:\n{texts[1].page_content}\n")


In [None]:
# Preview only the first few chunks; displaying the whole list floods the output
texts[:5]

### Using a Regular Expression as a Separator

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Regex separator for sentence boundaries. The lookbehind makes the split happen
# on the whitespace that follows '.', '!' or '?', so the punctuation stays
# attached to its own sentence instead of leading the next chunk.
separator_regex = r"(?<=[.!?])\s+"

# Separators are tried in order. The whitespace and empty-string fallbacks let a
# sentence longer than chunk_size still be broken down; with a single separator
# such a sentence would be emitted as one oversized chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,  # 100 characters per chunk
    chunk_overlap=10,  # 10-character overlap
    length_function=len,
    is_separator_regex=True,  # Treat every entry in `separators` as a regex
    separators=[separator_regex, r"\s+", ""],
)

# Load the example document. The encoding is passed explicitly so the read does
# not depend on the platform's default locale encoding.
# NOTE(review): the file is assumed to be UTF-8 — confirm if the data set changes.
with open("../../00-example_data/state_of_the_union.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()

# Split the document and show the text of the first two chunks
texts = text_splitter.create_documents([state_of_the_union])
print(f"Chunk 1:\n{texts[0].page_content}\n")
print(f"Chunk 2:\n{texts[1].page_content}\n")


### Combining Multiple Separators with Regex

In [None]:
# Regex separator covering sentence and clause punctuation. The lookbehind makes
# the split happen on the whitespace that follows '.', '!', '?', ';' or ':', so
# the punctuation stays attached to the text it ends instead of leading the next
# chunk.
separator_regex = r"(?<=[.!?;:])\s+"

# Separators are tried in order. The whitespace and empty-string fallbacks let a
# span longer than chunk_size still be broken down; with a single separator such
# a span would be emitted as one oversized chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,  # 150 characters per chunk
    chunk_overlap=30,  # 30-character overlap
    length_function=len,
    is_separator_regex=True,  # Treat every entry in `separators` as a regex
    separators=[separator_regex, r"\s+", ""],
)

# Split the document (loaded in an earlier cell) and show the first two chunks
texts = text_splitter.create_documents([state_of_the_union])
print(f"Chunk 1:\n{texts[0].page_content}\n")
print(f"Chunk 2:\n{texts[1].page_content}\n")


### Token-Based Length Function

In [None]:
from transformers import AutoTokenizer

# Fetch the pretrained HuggingFace tokenizer identified by this checkpoint name
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Define a token-based length function
def token_length(text: str) -> int:
    """Return the number of tokenizer tokens in ``text``.

    Special tokens are excluded. ``encode`` adds the model's begin/end markers
    by default, and a length function is applied to many small pieces of the
    document, so that fixed overhead would be counted on every piece and the
    resulting chunks would fall short of the requested token budget.

    Args:
        text: The string to measure.

    Returns:
        The count of token ids produced for ``text`` without special tokens.
    """
    return len(tokenizer.encode(text, add_special_tokens=False))

# Build a splitter whose size limits are expressed in tokens: every length is
# measured by token_length rather than by character count.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=token_length,
    chunk_size=50,  # token budget per chunk
    chunk_overlap=10,  # tokens shared between neighbouring chunks
    is_separator_regex=False,  # separators are not interpreted as regex
)

# Chunk the document and preview the text of the first two pieces
texts = text_splitter.create_documents(texts=[state_of_the_union])
print(f"Chunk 1:\n{texts[0].page_content}\n")
print(f"Chunk 2:\n{texts[1].page_content}\n")
