In [9]:
import re

In [None]:
# Sample text fixtures used by the chunking demos in this notebook.

# document_A: three "Q:"/"A:" pairs, each pair separated from the next by a blank line.
document_A = """Q: What is the return policy?
A: Items can be returned within 30 days of purchase with original receipt.

Q: Do you offer international shipping?
A: Yes, we ship to over 50 countries worldwide. Shipping times vary by location.

Q: How do I track my order?
A: Use the tracking number sent to your email after shipment."""

# document_B: a title line followed by three "Step N:" paragraphs separated by blank lines.
# Note the periods inside "setup.exe" and "config/settings.json" that are not sentence ends.
document_B = """Installation Guide

Step 1: Download the installer from our website. Extract the zip file to your desired location.

Step 2: Run setup.exe as administrator. Follow the on-screen instructions.

Step 3: Configure your API key in the settings file. The settings file is located at config/settings.json."""

# document_C: a title line followed by three multi-sentence prose paragraphs separated by blank lines.
document_C = """The Future of Renewable Energy

Solar and wind power have seen tremendous growth in recent years. As technology improves and costs decrease, renewable energy becomes increasingly competitive with fossil fuels.

Energy storage solutions are critical for renewable adoption. Battery technology advances enable better grid management and reliability. This addresses the intermittent nature of solar and wind power.

Policy support and public awareness continue to drive the transition. Many countries have set ambitious renewable energy targets for the coming decades."""


In [7]:
def chunk_by_words(text, chunk_size=50, overlap=10):
    """Split text into overlapping chunks of at most ``chunk_size`` words.

    Words are whitespace-delimited tokens (``str.split()``), so any run of
    whitespace in the input, including newlines, becomes a single space in
    the returned chunks.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order. Empty or whitespace-only
        text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # With overlap >= chunk_size the window would never move forward,
        # which previously caused an endless loop on non-empty text.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # This window already reached the last word; a further window
            # would only repeat the tail of the chunk just emitted.
            break

    return chunks

# Demo: word-based chunking of document_A using 20-word windows that share 5 words.
print("Chunking Document A by Words:")
for number, piece in enumerate(chunk_by_words(document_A, chunk_size=20, overlap=5), start=1):
    print(f"Chunk {number}:\n{piece}\n")

Chunking Document A by Words:
Chunk 1:
Q: What is the return policy? A: Items can be returned within 30 days of purchase with original receipt. Q:

Chunk 2:
purchase with original receipt. Q: Do you offer international shipping? A: Yes, we ship to over 50 countries worldwide. Shipping

Chunk 3:
over 50 countries worldwide. Shipping times vary by location. Q: How do I track my order? A: Use the tracking

Chunk 4:
order? A: Use the tracking number sent to your email after shipment.



In [10]:
def chunk_by_sentences(text, max_chunk_size=500):
    """Group whole sentences into chunks of at most ``max_chunk_size`` characters.

    A sentence boundary is any whitespace that follows ``.``, ``!`` or ``?``.
    Sentences are never cut, so a single sentence longer than
    ``max_chunk_size`` is still emitted as one (oversized) chunk. Text with no
    such boundary (for example a heading without end punctuation) stays
    attached to the sentence that follows it, inner newlines included.

    Args:
        text: The text to split.
        max_chunk_size: Upper bound, in characters, on a chunk built from
            more than one sentence. The single space used to join sentences
            counts toward this bound.

    Returns:
        A list of chunk strings in document order, each without leading or
        trailing whitespace. Empty or whitespace-only text yields an empty
        list.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            # Produced by leading/trailing whitespace in the input; nothing to add.
            continue

        if not current_chunk:
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) > max_chunk_size:
            # The "+ 1" accounts for the joining space, so a merged chunk
            # can no longer overshoot the limit by one character.
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk += " " + sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks
# Demo: sentence-based chunking of document_B with a 100-character limit per chunk.
print("Chunking Document B by Sentences:")
for number, piece in enumerate(chunk_by_sentences(document_B, max_chunk_size=100), start=1):
    print(f"Chunk {number}:\n{piece}\n")

Chunking Document B by Sentences:
Chunk 1:
Installation Guide

Step 1: Download the installer from our website.

Chunk 2:
Extract the zip file to your desired location. Step 2: Run setup.exe as administrator.

Chunk 3:
Follow the on-screen instructions. Step 3: Configure your API key in the settings file.

Chunk 4:
The settings file is located at config/settings.json.



In [12]:
def chunk_by_paragraphs(text, min_chunk_size=100):
    """Split text on blank lines, folding short paragraphs into the chunk being built.

    Paragraphs are the pieces between double newlines, stripped of surrounding
    whitespace; empty pieces are ignored. A paragraph shorter than
    ``min_chunk_size`` characters is appended (after a blank line) to the chunk
    currently being built. A paragraph at least that long closes the chunk
    being built and starts a new one.

    Because short paragraphs only merge into what precedes them, a short
    paragraph that is immediately followed by a long one is emitted on its
    own, even though it is shorter than ``min_chunk_size``.

    Args:
        text: The text to split.
        min_chunk_size: Length threshold, in characters, below which a
            paragraph is merged into the chunk being built.

    Returns:
        A list of chunk strings in document order.
    """
    result = []
    pending = ""

    stripped_blocks = (block.strip() for block in text.split('\n\n'))
    for block in filter(None, stripped_blocks):
        if len(block) >= min_chunk_size:
            # Long paragraph: emit whatever was accumulated, then start afresh.
            if pending:
                result.append(pending.strip())
            pending = block
        elif pending:
            pending = pending + "\n\n" + block
        else:
            pending = block

    if pending:
        result.append(pending.strip())

    return result
# Demo: paragraph-based chunking of document_C with a 50-character merge threshold.
print("Chunking Document C by Paragraphs:")
for number, piece in enumerate(chunk_by_paragraphs(document_C, min_chunk_size=50), start=1):
    print(f"Chunk {number}:\n{piece}\n")

Chunking Document C by Paragraphs:
Chunk 1:
The Future of Renewable Energy

Chunk 2:
Solar and wind power have seen tremendous growth in recent years. As technology improves and costs decrease, renewable energy becomes increasingly competitive with fossil fuels.

Chunk 3:
Energy storage solutions are critical for renewable adoption. Battery technology advances enable better grid management and reliability. This addresses the intermittent nature of solar and wind power.

Chunk 4:
Policy support and public awareness continue to drive the transition. Many countries have set ambitious renewable energy targets for the coming decades.



In [14]:
# Chosen chunking strategy and written rationale, one pair per sample document
# (presumably strategy_A/B/C correspond to document_A/B/C).
# Each rationale begins and ends with a newline, and a few lines end with a
# space; both are spelled out explicitly so an editor cannot strip them.

# NOTE(review): fixed-size word windows do not follow Q&A boundaries. The
# word-chunking demo output in this notebook ends its first chunk on a dangling
# "Q:", so the "ensures ... stays together" claim below looks unsupported; confirm.
strategy_A = "Word-based chunking"
reason_A = (
    "\n"
    "Each Q&A pair is relatively short and self-contained. Word-based chunking with \n"
    "moderate chunk size (30-40 words) ensures that each complete Q&A pair stays together,\n"
    "making it easy for users to find answers to specific questions. This is precise and\n"
    "doesn't split questions from answers.\n"
)

strategy_B = "Paragraph-based chunking"
reason_B = (
    "\n"
    "Technical documentation is typically well-structured with clear sections and steps.\n"
    "Each step is a logical unit. Paragraph-based chunking respects these natural boundaries,\n"
    "keeping instructions together. This helps users find complete instructions without \n"
    "fragments. It's more semantic than word/character-based approaches.\n"
)

strategy_C = "Sentence-based chunking"
reason_C = (
    "\n"
    "Articles and narratives contain flowing text across paragraphs with complex \n"
    "inter-relationships. Sentence-based chunking provides a good balance:\n"
    "- Preserves complete thoughts (sentences)\n"
    "- Captures context better than word-based\n"
    "- More flexible than paragraph-based (handles variable paragraph lengths)\n"
    "- Ideal for narrative content where ideas span multiple paragraphs\n"
    "This is the recommended approach for general content like articles.\n"
)