# Different types of chunking methods

## Level 1 - Character Splitting

In [1]:
# Sample sentence pair reused by the character- and word-level chunking cells below
text = "This is the text I would like to chunk up. It is the example text for this exercise"


In [2]:
# Size of every chunk, measured in characters
chunk_size = 35

# Slice the text at each multiple of chunk_size; the last slice may be shorter
chunks = [
    text[start:start + chunk_size]
    for start in range(0, len(text), chunk_size)
]

chunks

['This is the text I would like to ch',
 'unk up. It is the example text for ',
 'this exercise']

## Level 1.1 - Character Splitting with Overlap

In [3]:
chunk_size = 35  # Characters per chunk
overlap = 10     # Characters shared by two consecutive chunks

# Every window starts (chunk_size - overlap) characters after the previous one,
# so each chunk repeats the last `overlap` characters of its predecessor
chunks = [
    text[start:start + chunk_size]
    for start in range(0, len(text), chunk_size - overlap)
]

chunks

['This is the text I would like to ch',
 'like to chunk up. It is the example',
 'he example text for this exercise',
 'exercise']

## Level 2 - Fixed-length chunks (word level)

In [4]:
from typing import List

# Split the text into units (words, in this case)
def word_splitter(source_text: str) -> List[str]:
    """Split *source_text* into words on runs of whitespace.

    Leading and trailing whitespace is ignored, so the result never contains
    empty strings; an empty or whitespace-only input yields an empty list.

    Args:
        source_text: Text to split.

    Returns:
        The words of ``source_text`` in their original order.
    """
    # str.split() with no separator collapses consecutive whitespace and drops
    # empty leading/trailing fields, which the regex-based version did not.
    return source_text.split()

def get_chunks_fixed_size(text: str, chunk_size: int) -> List[str]:
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Args:
        text: Text to chunk; it is tokenised with ``word_splitter``.
        chunk_size: Number of words per chunk. The final chunk may be shorter.

    Returns:
        The chunks in order, each one being its words joined by single spaces.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Fail fast with a clear message: a zero step makes range() raise an
    # unrelated-looking error and a negative one silently produces no chunks.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")

    text_words = word_splitter(text)
    return [
        " ".join(text_words[i: i + chunk_size])
        for i in range(0, len(text_words), chunk_size)
    ]

# Demo: chunk the sample text five words at a time
get_chunks_fixed_size(text, chunk_size=5)


['This is the text I',
 'would like to chunk up.',
 'It is the example text',
 'for this exercise']

## Level 2.1 - Fixed-length chunks (word level) with overlap

In [6]:
from typing import List

# Split the text into units (words, in this case)
def word_splitter(source_text: str) -> List[str]:
    """Split *source_text* into words on runs of whitespace.

    Leading and trailing whitespace is ignored, so the result never contains
    empty strings; an empty or whitespace-only input yields an empty list.

    Args:
        source_text: Text to split.

    Returns:
        The words of ``source_text`` in their original order.
    """
    # str.split() with no separator collapses consecutive whitespace and drops
    # empty leading/trailing fields, which the regex-based version did not.
    return source_text.split()

def get_chunks_fixed_size_with_overlap(text: str, chunk_size: int, overlap_fraction: float) -> List[str]:
    """Split *text* into word chunks where consecutive chunks share words.

    Args:
        text: Text to chunk; it is tokenised with ``word_splitter``.
        chunk_size: Maximum number of words per chunk.
        overlap_fraction: Share of ``chunk_size`` repeated at the start of the
            next chunk. The overlap in words is
            ``int(chunk_size * overlap_fraction)`` (truncated).

    Returns:
        The chunks in order, each one being its words joined by single spaces.
        Windows near the end of the text may hold fewer than ``chunk_size``
        words.

    Raises:
        ValueError: If ``chunk_size`` is not positive, ``overlap_fraction`` is
            negative, or the resulting overlap is not smaller than
            ``chunk_size``.
    """
    # Validate before doing any work so bad arguments are reported clearly.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    # A negative fraction would make the step larger than the window and
    # silently skip words between chunks.
    if overlap_fraction < 0:
        raise ValueError("overlap_fraction must be non-negative")

    overlap_int = int(chunk_size * overlap_fraction)
    if overlap_int >= chunk_size:
        raise ValueError("overlap must be less than chunk_size")

    # Guaranteed >= 1 by the checks above, so the range below always advances.
    step = chunk_size - overlap_int

    text_words = word_splitter(text)
    # Every start index is < len(text_words), so each slice is non-empty.
    return [
        " ".join(text_words[i: i + chunk_size])
        for i in range(0, len(text_words), step)
    ]

# Demo: five-word windows with a 40% overlap between neighbours
get_chunks_fixed_size_with_overlap(text, chunk_size=5, overlap_fraction=0.4)

['This is the text I',
 'text I would like to',
 'like to chunk up. It',
 'up. It is the example',
 'the example text for this',
 'for this exercise']

## Level 3 - Sentence Chunking

In [1]:
import re

def sentence_chunker(text, max_sentences=3, overlap_sentences=1):
    """
    Split text into chunks based on sentences.

    A sentence is a run of characters up to and including the following
    '.', '!' or '?' characters. Each sentence keeps its own terminating
    punctuation; a trailing sentence without any terminator gets a '.'.

    Args:
        text (str): Input text to chunk
        max_sentences (int): Maximum sentences per chunk
        overlap_sentences (int): Number of sentences to overlap between chunks

    Returns:
        list: List of text chunks, each made of sentences joined by one space

    Raises:
        ValueError: If max_sentences is not positive, overlap_sentences is
            negative, or overlap_sentences is not less than max_sentences
    """
    if max_sentences <= 0:
        raise ValueError("max_sentences must be positive")
    if overlap_sentences < 0:
        raise ValueError("overlap_sentences must be non-negative")
    if overlap_sentences >= max_sentences:
        raise ValueError("overlap_sentences must be less than max_sentences")

    # Capture the sentence body and its terminator separately so that '?' and
    # '!' survive instead of being rewritten as '.' when chunks are rebuilt.
    sentences = []
    for match in re.finditer(r'([^.!?]+)([.!?]*)', text):
        body = match.group(1).strip()
        if not body:
            # Whitespace between stray punctuation marks is not a sentence
            continue
        sentences.append(body + (match.group(2) or '.'))

    # The checks above guarantee step >= 1, so the window always advances
    step = max_sentences - overlap_sentences

    return [
        ' '.join(sentences[start:start + max_sentences])
        for start in range(0, len(sentences), step)
    ]

# Example usage
# Five-sentence sample; note that this rebinds the notebook-level `text` variable
text = "This is the first sentence. This is the second sentence. This is the third sentence. This is the fourth sentence. This is the fifth sentence."
# Windows of 3 sentences that share 1 sentence with the previous window
chunks = sentence_chunker(text, max_sentences=3, overlap_sentences=1)
# Print every chunk with a 1-based label
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}: {chunk}")

Chunk 1: This is the first sentence. This is the second sentence. This is the third sentence.
Chunk 2: This is the third sentence. This is the fourth sentence. This is the fifth sentence.
Chunk 3: This is the fifth sentence.


## Level 4 - Recursive Chunking

In [None]:
from typing import List

def recursive_chunking(text: str, max_chunk_size: int = 1000) -> List[str]:
    """Split *text* into chunks of at most *max_chunk_size* characters.

    Separators are tried from coarsest to finest (blank line, newline,
    sentence end, space). The first separator found in the text is used to cut
    it into parts, neighbouring parts are greedily merged while they fit, and
    any part that is still too long is chunked again recursively. Text that
    contains no separator at all is cut at fixed character offsets.

    Args:
        text: Text to chunk.
        max_chunk_size: Maximum length of a chunk, in characters.

    Returns:
        The non-empty chunks in document order, stripped of surrounding
        whitespace (fixed-offset fallback slices are returned as cut).

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    # A zero size makes the fallback range() fail with an unrelated message and
    # a negative one silently returns nothing, so reject both up front.
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be positive")

    # Base case: if text is small enough, return as single chunk
    if len(text) <= max_chunk_size:
        stripped = text.strip()
        return [stripped] if stripped else []

    # Try separators in priority order
    for separator in ("\n\n", "\n", ". ", " "):
        if separator not in text:
            continue

        # Split the separator into the visible part that belongs to the text
        # before it ("." for ". ") and the whitespace used to glue parts back
        # together. Without this the sentence-ending period is lost whenever
        # two sentences end up in different chunks.
        suffix = separator.rstrip()
        glue = separator[len(suffix):]
        parts = text.split(separator)
        parts = [part + suffix for part in parts[:-1]] + parts[-1:]

        chunks = []
        current_chunk = ""
        for part in parts:
            # Check if adding this part would exceed the limit
            candidate = current_chunk + glue + part if current_chunk else part
            if len(candidate) <= max_chunk_size:
                current_chunk = candidate
                continue
            # Save current chunk and start a new one with this part
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = part

        # Add the final chunk
        if current_chunk:
            chunks.append(current_chunk.strip())

        # A single part can still be too large; it no longer contains this
        # separator and is strictly shorter than `text`, so recursing on it
        # moves on to finer separators and terminates.
        final_chunks = []
        for chunk in chunks:
            if len(chunk) > max_chunk_size:
                final_chunks.extend(recursive_chunking(chunk, max_chunk_size))
            else:
                final_chunks.append(chunk)

        return [chunk for chunk in final_chunks if chunk]

    # Fallback: split by character limit if no separators work
    return [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]


# Four paragraphs separated by blank lines, used to exercise recursive_chunking
sample_text = """
Artificial intelligence (AI) is transforming industries worldwide. From healthcare to finance, AI-driven solutions are improving efficiency and accuracy.

However, ethical considerations remain crucial. As technology evolves, so must our understanding of its impact. This text is provided to test recursive chunking functions.

In recent years, machine learning models have achieved remarkable results in natural language processing, computer vision, and robotics. Despite these advances, challenges such as data privacy, algorithmic bias, and transparency persist.

Addressing these issues is essential for building trust in AI systems and ensuring their responsible deployment across society.
"""

# NOTE(review): with max_chunk_size=500 neighbouring paragraphs appear short
# enough to be merged into a single chunk, so the one-paragraph-per-chunk output
# recorded below looks like it came from a smaller limit — confirm by re-running.
chunks = recursive_chunking(sample_text, max_chunk_size=500)
# Print every chunk with a 1-based label, followed by a blank line
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:\n{chunk}\n")

Chunk 1:
Artificial intelligence (AI) is transforming industries worldwide. From healthcare to finance, AI-driven solutions are improving efficiency and accuracy.

Chunk 2:
However, ethical considerations remain crucial. As technology evolves, so must our understanding of its impact. This text is provided to test recursive chunking functions.

Chunk 3:
In recent years, machine learning models have achieved remarkable results in natural language processing, computer vision, and robotics. Despite these advances, challenges such as data privacy, algorithmic bias, and transparency persist.

Chunk 4:
Addressing these issues is essential for building trust in AI systems and ensuring their responsible deployment across society.



## Level 5 - Document-Based Chunking

In [5]:
from typing import List
import re

def markdown_document_chunking(text: str) -> List[str]:
    """Split a markdown document into one chunk per header section.

    A new chunk starts at every ATX header line (one to six ``#`` characters
    followed by whitespace and a title). Lines inside a fenced code block
    (delimited by ``` or ~~~) are never treated as headers, so ``# comment``
    lines in code samples stay inside their section.

    Args:
        text: Markdown source.

    Returns:
        The non-empty sections in document order, stripped of surrounding
        whitespace. Content before the first header forms its own chunk.
    """
    # Compiled once instead of being re-parsed for every line
    header_re = re.compile(r'^#{1,6}\s+.+$')
    fence_markers = ('```', '~~~')

    # sections[0] collects any preamble that precedes the first header
    sections: List[List[str]] = [[]]
    open_fence = None  # marker of the code fence we are inside, if any

    for line in text.split('\n'):
        marker = line.lstrip()[:3]
        if marker in fence_markers:
            if open_fence is None:
                open_fence = marker
            elif marker == open_fence:
                # Only the same marker type closes the fence
                open_fence = None

        if open_fence is None and header_re.match(line):
            # Start new chunk with this header
            sections.append([line])
        else:
            # Add line to current chunk
            sections[-1].append(line)

    joined = ('\n'.join(section).strip() for section in sections)
    return [chunk for chunk in joined if chunk]


# Sample markdown document with four headed sections (header levels 1 to 3);
# note that this rebinds the notebook-level `text` variable
text = """
# Introduction
This is the introduction section.

## Background
Some background information.

### Details
More detailed information.

# Conclusion
Final thoughts.
"""

# Demo: produces one chunk per header, each beginning with its header line
markdown_document_chunking(text)




['# Introduction\nThis is the introduction section.',
 '## Background\nSome background information.',
 '### Details\nMore detailed information.',
 '# Conclusion\nFinal thoughts.']

# Exercises: Advanced Chunking Techniques

## Exercise 1: LLM-based Chunking

Use an LLM to determine optimal chunk boundaries in a text.

**Example Prompt:**

```python
# Prompt template for the LLM; being an f-string, it requires `chunk_size` and
# `text` to be defined before this assignment runs
boundary_prompt = f"""
Analyze the following text and identify the best places to split it into chunks of approximately {chunk_size} characters.

Consider:
- Semantic coherence (keep related ideas together)
- Context preservation (maintain meaning across chunks)
- Natural boundaries (paragraphs, topics, concepts)

Text: {text}

Return only the character positions where splits should occur, separated by commas.
Example: 150, 300, 450
"""
```

---

## Exercise 2: Semantic Chunking

Implement a function that splits a text into chunks based on semantic similarity.  
For example, you can use sentence embeddings to group sentences into semantically coherent chunks.

**Example (pseudo-code):**

```python
def semantic_chunking(text, max_chunk_size):
    """
    Split the text into semantically coherent chunks, each up to max_chunk_size characters.
    Use sentence embeddings to group similar sentences together.

    Exercise stub: the body is intentionally left unimplemented.

    Args:
        text (str): Input text to chunk
        max_chunk_size (int): Maximum length of a chunk, in characters

    Returns:
        list: List of text chunks
    """
    # 1. Split text into sentences
    # 2. Compute embeddings for each sentence
    # 3. Group sentences into chunks based on similarity and size constraint
    # 4. Return list of chunks
    pass
```
