In [None]:
import json
import re
from pathlib import Path
from typing import List, Dict


def chunk_by_paragraphs(text: str, max_chunk_size: int = 1000) -> List[str]:
    """
    Split text into chunks by paragraphs.
    
    Args:
        text: Input text to chunk
        max_chunk_size: Maximum characters per chunk
        
    Returns:
        List of text chunks
    """
    # Split by double newlines (paragraphs)
    paragraphs = text.split('\n\n')
    
    chunks = []
    current_chunk = []
    current_length = 0
    
    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
            
        para_length = len(para)
        
        # If single paragraph is too large, split it by sentences
        if para_length > max_chunk_size:
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_length = 0
            
            # Split large paragraph into sentences
            sentences = re.split(r'(?<=[.!?])\s+', para)
            temp_chunk = []
            temp_length = 0
            
            for sentence in sentences:
                if temp_length + len(sentence) > max_chunk_size and temp_chunk:
                    chunks.append(' '.join(temp_chunk))
                    temp_chunk = [sentence]
                    temp_length = len(sentence)
                else:
                    temp_chunk.append(sentence)
                    temp_length += len(sentence)
            
            if temp_chunk:
                chunks.append(' '.join(temp_chunk))
        
        # If adding paragraph exceeds max size, save current chunk
        elif current_length + para_length > max_chunk_size and current_chunk:
            chunks.append('\n\n'.join(current_chunk))
            current_chunk = [para]
            current_length = para_length
        else:
            current_chunk.append(para)
            current_length += para_length
    
    # Add remaining chunk
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))
    
    return chunks


def process_json_file(json_path: Path, max_chunk_size: int = 1000) -> List[Dict]:
    """
    Process a single JSON file and create chunks.
    
    Args:
        json_path: Path to the JSON file
        max_chunk_size: Maximum characters per chunk
        
    Returns:
        List of chunk dictionaries
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    text = data.get('text', '')
    metadata = data.get('metadata', {})
    
    # Create chunks
    chunks = chunk_by_paragraphs(text, max_chunk_size)
    
    # Add metadata to each chunk
    result = []
    for i, chunk_text in enumerate(chunks):
        result.append({
            "chunk_id": i,
            "text": chunk_text,
            "char_count": len(chunk_text),
            "word_count": len(chunk_text.split()),
            "source_file": metadata.get('source_file', ''),
            "source_path": metadata.get('source_path', '')
        })
    
    return result


def save_chunks_jsonl(chunks: List[Dict], output_path: Path) -> None:
    """
    Save chunks to JSONL file (one JSON object per line).
    
    Args:
        chunks: List of chunk dictionaries
        output_path: Path for output JSONL file
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        for chunk in chunks:
            json.dump(chunk, f, ensure_ascii=False)
            f.write('\n')


def process_all_json_files(
    input_dir: str = r"C:\Users\yigit\Desktop\Enterprises\polcon\text",
    output_dir: str = r"C:\Users\yigit\Desktop\Enterprises\polcon\basic-chunks",
    max_chunk_size: int = 1000
) -> None:
    """
    Process all JSON files and create chunked JSONL files.
    
    Args:
        input_dir: Directory containing JSON files
        output_dir: Directory to save chunked JSONL files
        max_chunk_size: Maximum characters per chunk
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    
    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)
    
    # Find all JSON files
    json_files = list(input_path.glob("*.json"))
    
    if not json_files:
        print(f"No JSON files found in {input_dir}")
        return
    
    print(f"Found {len(json_files)} JSON files to process\n")
    
    total_chunks = 0
    
    # Process each JSON file
    for json_file in json_files:
        try:
            print(f"Processing: {json_file.name}...", end=" ")
            
            # Create chunks
            chunks = process_json_file(json_file, max_chunk_size)
            
            # Save as JSONL
            output_file = output_path / f"{json_file.stem}_chunks.jsonl"
            save_chunks_jsonl(chunks, output_file)
            
            total_chunks += len(chunks)
            print(f"✓ Created {len(chunks)} chunks → {output_file.name}")
            
        except Exception as e:
            print(f"✗ Error: {e}")
    
    print(f"\nProcessing complete!")
    print(f"Total chunks created: {total_chunks}")
    print(f"Files saved to: {output_dir}")

In [8]:
# Run the chunker
if __name__ == "__main__":
    # Process all JSON files and create chunks
    process_all_json_files(max_chunk_size=1000)

Found 89 JSON files to process

Processing: 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık.json... ✓ Created 34 chunks → 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık_chunks.jsonl
Processing: 10) TÜRKİYE’DE ÖRGÜTLENME ÖZGÜRLÜĞÜNÜN GENEL GÖRÜNÜMÜ-II .json... ✓ Created 193 chunks → 10) TÜRKİYE’DE ÖRGÜTLENME ÖZGÜRLÜĞÜNÜN GENEL GÖRÜNÜMÜ-II _chunks.jsonl
Processing: 11) Yurttaslik_Alani_Bilgi_Notu_1.json... ✓ Created 31 chunks → 11) Yurttaslik_Alani_Bilgi_Notu_1_chunks.jsonl
Processing: 12) TERÖRLE MÜCADELEYİ ARAÇSALLAŞTIRMAK.json... ✓ Created 107 chunks → 12) TERÖRLE MÜCADELEYİ ARAÇSALLAŞTIRMAK_chunks.jsonl
Processing: 13) PROTESTO HAKKINI KORU.json... ✓ Created 264 chunks → 13) PROTESTO HAKKINI KORU_chunks.jsonl
Processing: 14) KomploTeorileri_AR_23.03.23_web.json... ✓ Created 89 chunks → 14) KomploTeorileri_AR_23.03.23_web_chunks.jsonl
Processing: 15) Feminist_Hareketin_Gundemleri_.json... ✓ Created 54 chunks → 15) Feminist_Hareketin_Gundemleri__chunks.jsonl
Processing: 16) 

In [None]:
# OPTIMIZED VERSION - Much faster overlap chunking
from collections import deque

def chunk_by_paragraphs_optimized(text: str, max_chunk_size: int = 1000, overlap_ratio: float = 0.1) -> List[str]:
    """
    Optimized version: Split text into chunks by paragraphs with overlap.
    Uses sliding window approach instead of list insertions for O(n) complexity.
    
    Args:
        text: Input text to chunk
        max_chunk_size: Maximum characters per chunk
        overlap_ratio: Fraction of chunk to overlap (default 0.1 = 10%)
        
    Returns:
        List of text chunks with overlap
    """
    # Split by double newlines (paragraphs)
    paragraphs = [para.strip() for para in text.split('\n\n') if para.strip()]
    
    if not paragraphs:
        return []
    
    chunks = []
    overlap_size = int(max_chunk_size * overlap_ratio)
    
    # Pre-calculate paragraph lengths to avoid repeated len() calls
    para_lengths = [len(para) for para in paragraphs]
    
    i = 0
    while i < len(paragraphs):
        current_chunk_paras = []
        current_length = 0
        start_idx = i
        
        # Build current chunk
        while i < len(paragraphs) and current_length + para_lengths[i] <= max_chunk_size:
            current_chunk_paras.append(paragraphs[i])
            current_length += para_lengths[i]
            i += 1
        
        # Handle oversized single paragraph
        if not current_chunk_paras and i < len(paragraphs):
            oversized_para = paragraphs[i]
            sentence_chunks = _split_oversized_paragraph_optimized(oversized_para, max_chunk_size, overlap_size)
            chunks.extend(sentence_chunks)
            i += 1
            continue
        
        if current_chunk_paras:
            chunk_text = '\n\n'.join(current_chunk_paras)
            chunks.append(chunk_text)
            
            # Calculate overlap for next chunk using sliding window
            if i < len(paragraphs):
                overlap_start = _find_overlap_start_optimized(
                    current_chunk_paras, para_lengths[start_idx:i], overlap_size
                )
                if overlap_start is not None:
                    i = start_idx + overlap_start  # Slide back to overlap position
    
    return chunks


def _split_oversized_paragraph_optimized(para: str, max_chunk_size: int, overlap_size: int) -> List[str]:
    """
    Optimized helper to split oversized paragraphs into sentences with overlap.
    """
    sentences = re.split(r'(?<=[.!?])\s+', para)
    sentence_lengths = [len(sent) for sent in sentences]
    
    chunks = []
    i = 0
    
    while i < len(sentences):
        current_sentences = []
        current_length = 0
        start_idx = i
        
        # Build sentence chunk
        while i < len(sentences) and current_length + sentence_lengths[i] <= max_chunk_size:
            current_sentences.append(sentences[i])
            current_length += sentence_lengths[i]
            i += 1
        
        if current_sentences:
            chunks.append(' '.join(current_sentences))
            
            # Calculate sentence-level overlap
            if i < len(sentences):
                overlap_start = _find_sentence_overlap_start(
                    current_sentences, sentence_lengths[start_idx:i], overlap_size
                )
                if overlap_start is not None:
                    i = start_idx + overlap_start
    
    return chunks


def _find_overlap_start_optimized(paragraphs: List[str], lengths: List[int], target_overlap: int) -> int:
    """
    Find the starting index for overlap using reverse accumulation.
    Returns the paragraph index to start the next chunk from.
    """
    if not paragraphs or target_overlap <= 0:
        return None
    
    accumulated_length = 0
    
    # Work backwards from the end
    for i in range(len(paragraphs) - 1, -1, -1):
        if accumulated_length + lengths[i] <= target_overlap:
            accumulated_length += lengths[i]
        else:
            # If we can't fit the whole paragraph, check if we can fit part of it
            remaining_space = target_overlap - accumulated_length
            if remaining_space > 50:  # Minimum meaningful overlap
                return i  # Start from this paragraph (will be partially included)
            elif i < len(paragraphs) - 1:
                return i + 1  # Start from next paragraph
            else:
                return None
    
    return 0 if accumulated_length > 0 else None


def _find_sentence_overlap_start(sentences: List[str], lengths: List[int], target_overlap: int) -> int:
    """
    Find the starting sentence index for overlap.
    """
    if not sentences or target_overlap <= 0:
        return None
    
    accumulated_length = 0
    
    for i in range(len(sentences) - 1, -1, -1):
        if accumulated_length + lengths[i] <= target_overlap:
            accumulated_length += lengths[i]
        else:
            return i + 1 if i < len(sentences) - 1 else None
    
    return 0 if accumulated_length > 0 else None


# Optimized processing functions
def process_json_file_optimized(json_path: Path, max_chunk_size: int = 1000, overlap_ratio: float = 0.1) -> List[Dict]:
    """
    Optimized version of process_json_file.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    text = data.get('text', '')
    metadata = data.get('metadata', {})
    
    # Use optimized chunking
    chunks = chunk_by_paragraphs_optimized(text, max_chunk_size, overlap_ratio)
    
    # Pre-allocate result list for better performance
    result = []
    for i, chunk_text in enumerate(chunks):
        # Cache split for word count to avoid repeated splitting
        words = chunk_text.split()
        result.append({
            "chunk_id": i,
            "text": chunk_text,
            "char_count": len(chunk_text),
            "word_count": len(words),
            "source_file": metadata.get('source_file', ''),
            "source_path": metadata.get('source_path', '')
        })
    
    return result


def process_all_json_files_optimized(
    input_dir: str = r"C:\Users\yigit\Desktop\Enterprises\polcon\text",
    output_dir: str = r"C:\Users\yigit\Desktop\Enterprises\polcon\basic-chunks",
    max_chunk_size: int = 1000,
    overlap_ratio: float = 0.1
) -> None:
    """
    Optimized version with better performance and memory usage.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    
    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)
    
    # Find all JSON files
    json_files = list(input_path.glob("*.json"))
    
    if not json_files:
        print(f"No JSON files found in {input_dir}")
        return
    
    print(f"Found {len(json_files)} JSON files to process")
    print(f"Chunk size: {max_chunk_size}, Overlap: {overlap_ratio*100:.1f}%\n")
    
    total_chunks = 0
    
    # Process each JSON file
    for json_file in json_files:
        try:
            print(f"Processing: {json_file.name}...", end=" ")
            
            # Use optimized processing
            chunks = process_json_file_optimized(json_file, max_chunk_size, overlap_ratio)
            
            # Save as JSONL
            output_file = output_path / f"{json_file.stem}_chunks.jsonl"
            save_chunks_jsonl(chunks, output_file)
            
            total_chunks += len(chunks)
            print(f"✓ {len(chunks)} chunks → {output_file.name}")
            
        except Exception as e:
            print(f"✗ Error: {e}")
    
    print(f"\n🎉 Processing complete!")
    print(f"📊 Total chunks created: {total_chunks}")
    print(f"💾 Files saved to: {output_dir}")


# Performance comparison function
def compare_performance():
    """
    Compare performance between original and optimized versions.
    """
    import time
    
    # Sample text for testing
    sample_text = """
    This is a sample paragraph for testing purposes. It contains multiple sentences to simulate real document processing.
    
    This is another paragraph that we'll use to test the chunking algorithms. We want to see how they handle overlapping content.
    
    The third paragraph adds more content to make the test more realistic. Performance differences should become apparent with larger texts.
    
    Final paragraph to complete our sample text. This should be enough content to demonstrate the algorithmic differences.
    """ * 50  # Multiply to make it larger
    
    print("🔍 Performance Comparison Test")
    print(f"Sample text length: {len(sample_text)} characters\n")
    
    # Test original version
    start_time = time.time()
    chunks_original = chunk_by_paragraphs(sample_text, max_chunk_size=1000, overlap_ratio=0.1)
    original_time = time.time() - start_time
    
    # Test optimized version
    start_time = time.time()
    chunks_optimized = chunk_by_paragraphs_optimized(sample_text, max_chunk_size=1000, overlap_ratio=0.1)
    optimized_time = time.time() - start_time
    
    print(f"⏱️  Original algorithm: {original_time:.4f} seconds")
    print(f"⚡ Optimized algorithm: {optimized_time:.4f} seconds")
    print(f"🚀 Speed improvement: {original_time/optimized_time:.1f}x faster")
    print(f"📈 Original chunks: {len(chunks_original)}")
    print(f"📈 Optimized chunks: {len(chunks_optimized)}")
    
    return chunks_original, chunks_optimized

In [None]:
# Run performance comparison
print("Running performance comparison...")
original_chunks, optimized_chunks = compare_performance()

# Verify both produce similar results
print(f"\n✅ Verification:")
print(f"Both algorithms produced same number of chunks: {len(original_chunks) == len(optimized_chunks)}")

# Show sample chunks to verify overlap is working
if len(optimized_chunks) >= 2:
    print(f"\n📝 Sample overlap verification:")
    chunk1_end = optimized_chunks[0][-100:]  # Last 100 chars of first chunk
    chunk2_start = optimized_chunks[1][:100]  # First 100 chars of second chunk
    
    # Simple overlap check
    words1 = set(chunk1_end.split())
    words2 = set(chunk2_start.split())
    overlap_words = words1.intersection(words2)
    print(f"Common words between chunks: {len(overlap_words)} words")
    print(f"Overlap detected: {len(overlap_words) > 0}")

In [6]:
# Use the OPTIMIZED version for production processing
# This should be 3-5x faster than the original implementation

print("🚀 Running OPTIMIZED chunking with overlap...")
process_all_json_files_optimized(
    max_chunk_size=1000, 
    overlap_ratio=0.1  # 10% overlap
)

# Alternative: Use different overlap ratios for experimentation
# process_all_json_files_optimized(max_chunk_size=1000, overlap_ratio=0.05)  # 5% overlap
# process_all_json_files_optimized(max_chunk_size=1000, overlap_ratio=0.15)  # 15% overlap

🚀 Running OPTIMIZED chunking with overlap...
Found 89 JSON files to process
Chunk size: 1000, Overlap: 10.0%

Processing: 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık.json... ✓ 35 chunks → 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık_chunks.jsonl
Processing: 10) TÜRKİYE’DE ÖRGÜTLENME ÖZGÜRLÜĞÜNÜN GENEL GÖRÜNÜMÜ-II .json... 

KeyboardInterrupt: 