t

In [None]:
#!/usr/bin/env python3
"""
RST (reStructuredText) AST Chunker

This module implements semantic chunking for RST files using docutils for parsing.
It follows the same patterns as the existing Python and TypeScript AST chunkers.
"""

import sys
import string
import secrets
import tiktoken
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
from docutils import core, nodes
from docutils.frontend import OptionParser
from docutils.utils import new_document
from docutils.parsers.rst import Parser


# =============================================================================
# CONFIGURATION AND CONSTANTS
# =============================================================================

MAX_CHUNK_TOKENS = 1000
TARGET_TOKENS = 600


# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class RSTChunk:
    """One semantic slice of an RST document.

    If ``token_count`` is left at its default of 0, it is filled in from
    ``content`` by ``count_tokens`` immediately after construction.
    """
    # Line span of the chunk in the source text.
    start_line: int
    end_line: int
    # Text carried by the chunk.
    content: str
    # Kind of chunk (for example 'section') and a human-readable label.
    node_type: str
    name: str
    # Nesting depth assigned by the producer of the chunk.
    depth: int
    level: int = 0  # For section hierarchy (0=title, 1=section, 2=subsection, etc.)
    token_count: int = 0

    def __post_init__(self):
        # An explicitly supplied, non-zero count is trusted as-is.
        if self.token_count != 0:
            return
        self.token_count = count_tokens(self.content)


@dataclass
class RSTReference:
    """Represents a reference/include in RST (similar to imports)"""
    # Name of the directive that produced the reference.
    reference_type: str  # 'include', 'image', 'figure', 'literalinclude', etc.
    # Text following the directive's '::' marker (typically a file path).
    target: str
    # Line on which the directive was found.
    line_number: int
    # The directive line itself, as recorded by the extractor.
    directive: str


# =============================================================================
# TOKEN COUNTING
# =============================================================================

def count_tokens(content: str) -> int:
    """Return the number of GPT-4 tokens in *content*.

    The count comes from tiktoken's "gpt-4" encoding. If anything goes wrong
    while obtaining or applying the encoder, a rough estimate of one token
    per four characters is returned instead.
    """
    try:
        encoder = tiktoken.encoding_for_model("gpt-4")
        token_ids = encoder.encode(content)
    except Exception:
        # Best-effort fallback: 1 token ≈ 4 characters.
        return len(content) // 4
    return len(token_ids)


# =============================================================================
# RST PARSING AND ANALYSIS
# =============================================================================

def remove_include_directives(rst_content: str) -> str:
    """Comment out ``.. include::`` directives so parsing never touches other files.

    Each include directive, and every option line directly beneath it, is
    replaced by an RST comment. The comment keeps the directive's own
    indentation so that an include nested inside a list item or another
    directive body does not terminate the enclosing block.

    Args:
        rst_content: Raw RST text.

    Returns:
        The text with include directives and their options turned into
        comments; all other lines are returned unchanged.
    """
    lines = rst_content.split('\n')
    total = len(lines)
    processed_lines = []
    i = 0

    while i < total:
        line = lines[i]
        stripped = line.strip()

        if not stripped.startswith('.. include::'):
            processed_lines.append(line)
            i += 1
            continue

        # Leading whitespace of the directive, reused for the replacement lines.
        indent = line[:len(line) - len(line.lstrip())]
        processed_lines.append(f"{indent}.. # INCLUDE DISABLED: {stripped}")
        i += 1

        # Options are field-list lines (":name: value") indented deeper than
        # the directive itself.
        while i < total:
            option = lines[i]
            option_text = option.strip()
            option_indent = len(option) - len(option.lstrip())
            if not option_text.startswith(':') or option_indent <= len(indent):
                break
            processed_lines.append(f"{indent}.. # INCLUDE OPTION: {option_text}")
            i += 1

    return '\n'.join(processed_lines)


def parse_rst_file(file_path: Path) -> Tuple[nodes.document, str]:
    """Read an RST file and parse it with docutils, degrading gracefully.

    Up to three attempts are made, in order:

    1. Parse the text as-is while the working directory is temporarily
       switched to the file's directory.
    2. If that raises, parse again with ``include`` directives commented out
       by ``remove_include_directives``.
    3. If that raises too, return an empty document so the caller can fall
       back to other strategies.

    The working directory is always restored before returning.

    Args:
        file_path: Location of the RST file (read as UTF-8).

    Returns:
        ``(document, rst_content)``; ``rst_content`` is always the original,
        unmodified file text.

    Raises:
        Exception: Any error from reading the file or from the last-resort
            fallback is reported on stdout and re-raised.
    """
    try:
        # Imported before any parsing attempt so that the fallback handlers
        # below can always reach these names.
        import os
        from docutils.frontend import get_default_settings

        path = Path(file_path)
        with open(path, 'r', encoding='utf-8') as f:
            rst_content = f.read()

        original_cwd = Path.cwd()
        # Make the source path absolute *before* changing directory; a
        # relative path would otherwise be interpreted against the new
        # working directory and point at the wrong location.
        source_path = os.path.abspath(path)
        file_dir = os.path.dirname(source_path)

        def build_document(report_level=None):
            """Create an empty document; permissive settings when a level is given."""
            settings = get_default_settings(Parser)
            if report_level is not None:
                settings.report_level = report_level
                settings.halt_level = 5    # Don't halt on warnings/errors
                # NOTE(review): docutils documents None as "use sys.stderr";
                # output is limited by report_level rather than by this line.
                settings.warning_stream = None
            return new_document(source_path, settings=settings)

        try:
            # Method 1: parse in the file's directory so relative paths resolve
            os.chdir(file_dir)

            document = build_document(report_level=4)
            Parser().parse(rst_content, document)

            print(f"✅ Successfully parsed {path.name} with includes")
            return document, rst_content

        except Exception as include_error:
            print(f"⚠️ Include resolution failed for {path.name}: {include_error}")

            # Method 2: try again with include directives disabled
            try:
                rst_content_no_includes = remove_include_directives(rst_content)

                document = build_document(report_level=5)
                Parser().parse(rst_content_no_includes, document)

                print(f"✅ Successfully parsed {path.name} with includes disabled")
                return document, rst_content  # Return original content, not modified

            except Exception as parse_error:
                print(f"⚠️ Full parsing failed for {path.name}: {parse_error}")
                # Return minimal document for manual parsing fallback
                return build_document(), rst_content

        finally:
            # Always restore original working directory
            os.chdir(original_cwd)

    except Exception as e:
        print(f"❌ Error reading {file_path}: {e}")
        raise


def filter_raw_html(content: str) -> str:
    """Strip web-only HTML from RST text before chunking.

    Two things are removed:

    * ``.. raw:: html`` directives together with their options and indented
      body; each is replaced by a single RST comment line.
    * Inline HTML tags (and HTML comments) in ordinary lines; the text
      between the tags is kept. A line that would become empty is left
      untouched.

    Only recognised HTML tag names are stripped, and never an angle-bracket
    span that is immediately followed by a backtick. This keeps RST embedded
    targets (```text <url>`_``, ``:ref:`text <label>```), comparisons such as
    ``a < 3 and b > 2`` and things like ``<stdio.h>`` intact.

    Args:
        content: RST text.

    Returns:
        The filtered text.
    """
    import re  # local import: this function is the only user of re

    # Compiled once per call rather than once per line.
    html_tag = re.compile(
        r'<!--.*?-->'
        r'|</?(?:a|abbr|b|big|blockquote|br|center|cite|code|dd|del|div|dl|dt|'
        r'em|font|h[1-6]|hr|i|iframe|img|ins|kbd|li|mark|ol|p|pre|q|s|samp|'
        r'script|small|span|strike|strong|style|sub|sup|table|tbody|td|tfoot|'
        r'th|thead|tr|tt|u|ul)'
        r'(?:\s[^<>]*)?/?>(?!`)',
        re.IGNORECASE,
    )

    lines = content.split('\n')
    total = len(lines)
    processed_lines = []
    i = 0

    while i < total:
        line = lines[i]

        if line.strip().startswith('.. raw:: html'):
            # Skip the directive line itself
            i += 1

            # Skip any options (":name: value" lines)
            while i < total and lines[i].startswith('   :'):
                i += 1

            # Skip the blank line separating options from the body
            if i < total and lines[i].strip() == '':
                i += 1

            # Skip the body: blank lines and lines indented by 3+ spaces
            while i < total and (lines[i].strip() == '' or lines[i].startswith('   ')):
                i += 1

            # Leave a marker where the HTML used to be
            processed_lines.append(".. # Raw HTML content removed for chunking")
            continue

        if '<' in line and '>' in line:
            cleaned_line = html_tag.sub('', line)
            # Keep the original when stripping tags would leave nothing
            processed_lines.append(cleaned_line if cleaned_line.strip() else line)
        else:
            processed_lines.append(line)
        i += 1

    return '\n'.join(processed_lines)
def process_content_for_images(content: str) -> str:
    """Replace image/figure directives with one-line textual descriptions.

    Raw HTML is removed first via ``filter_raw_html``. Then:

    * ``.. image::`` becomes ``Image: <alt text>``, or ``Image: <file stem>``
      when no ``:alt:`` option is present.
    * ``.. figure::`` becomes ``Figure: <caption>``, falling back to the alt
      text and then to the file stem. The caption is the first non-blank
      line of the directive body, i.e. the first line after the options
      that is indented deeper than the directive.

    The directive line, all of its option lines and (for figures) the
    caption line are consumed, so none of them leak into the output.

    Args:
        content: RST text.

    Returns:
        The processed text.
    """
    # First remove raw HTML content
    content = filter_raw_html(content)

    lines = content.split('\n')
    total = len(lines)
    processed_lines = []

    def indent_of(text):
        """Number of leading whitespace characters in *text*."""
        return len(text) - len(text.lstrip())

    def read_options(start, base_indent):
        """Consume option lines from *start*; return (alt_text, next_index)."""
        alt = None
        pos = start
        while pos < total:
            raw = lines[pos]
            text = raw.strip()
            # Options are ":name: value" lines nested under the directive.
            if not text.startswith(':') or indent_of(raw) <= base_indent:
                break
            if alt is None:
                if text.startswith(':alt:'):
                    alt = text.split(':alt:', 1)[1].strip()
                elif text.startswith(':alt '):
                    alt = text.split(':alt ', 1)[1].strip()
            pos += 1
        return alt, pos

    def label_from_path(path):
        """Turn a file path into a readable label based on its stem."""
        return Path(path).stem.replace('-', ' ').replace('_', ' ')

    i = 0
    while i < total:
        line = lines[i]
        stripped = line.strip()

        if stripped.startswith('.. image::'):
            image_path = line.split('::', 1)[1].strip()
            alt_text, i = read_options(i + 1, indent_of(line))
            processed_lines.append(f"Image: {alt_text or label_from_path(image_path)}")

        elif stripped.startswith('.. figure::'):
            figure_path = line.split('::', 1)[1].strip()
            base_indent = indent_of(line)
            alt_text, after_options = read_options(i + 1, base_indent)

            # The caption follows the options, normally after a blank line.
            probe = after_options
            while probe < total and lines[probe].strip() == '':
                probe += 1
            caption = None
            if probe < total and indent_of(lines[probe]) > base_indent:
                caption = lines[probe].strip()

            if caption:
                processed_lines.append(f"Figure: {caption}")
                i = probe + 1
            else:
                processed_lines.append(f"Figure: {alt_text or label_from_path(figure_path)}")
                # No caption: leave the blank lines after the options in place.
                i = after_options

        else:
            processed_lines.append(line)
            i += 1

    return '\n'.join(processed_lines)
def extract_references_from_source(rst_content: str) -> List[RSTReference]:
    """Scan raw RST text for directives that pull in external files.

    Works purely on the text, so it does not require a successful docutils
    parse. Recognised directives are ``include``, ``literalinclude`` and
    ``csv-table``; at most one reference is recorded per line.

    Args:
        rst_content: Raw RST text.

    Returns:
        One ``RSTReference`` per matching line, with 1-based line numbers.
    """
    directive_kinds = ('include', 'literalinclude', 'csv-table')
    found = []

    for lineno, raw_line in enumerate(rst_content.split('\n'), start=1):
        text = raw_line.strip()
        if not text.startswith('.. '):
            continue

        # First directive kind mentioned on the line wins.
        kind = next((k for k in directive_kinds if f'.. {k}::' in text), None)
        if kind is None:
            continue

        # Everything after the first '::' is the directive's target.
        _, marker, remainder = text.partition('::')
        if marker:
            found.append(RSTReference(
                reference_type=kind,
                target=remainder.strip(),
                line_number=lineno,
                directive=text,
            ))

    return found


def extract_references(document: nodes.document, source_lines: List[str]) -> List[RSTReference]:
    """Collect include-style references (not images/figures) from a document.

    Every node of the parsed tree that carries a line number is mapped back
    to its source line; if that line holds an ``include``, ``literalinclude``
    or ``csv-table`` directive, a reference is recorded. Should the tree walk
    raise, or yield nothing, the result of
    ``extract_references_from_source`` on the joined source is returned
    instead.

    Args:
        document: Parsed docutils document.
        source_lines: The document's source text, split into lines.

    Returns:
        The references that were found.
    """
    directive_kinds = ('include', 'literalinclude', 'csv-table')
    found = []

    try:
        # findall is the non-deprecated replacement for traverse
        for node in document.findall():
            lineno = getattr(node, 'line', None)
            if not lineno or lineno > len(source_lines):
                continue

            text = source_lines[lineno - 1].strip()
            if not text.startswith('.. '):
                continue

            kind = next((k for k in directive_kinds if f'.. {k}::' in text), None)
            if kind is None:
                continue

            pieces = text.split('::', 1)
            if len(pieces) > 1:
                found.append(RSTReference(
                    reference_type=kind,
                    target=pieces[1].strip(),
                    line_number=lineno,
                    directive=text,
                ))
    except Exception:
        # Deliberately best-effort: the text-based scan below takes over
        pass

    if not found:
        found = extract_references_from_source('\n'.join(source_lines))

    return found


def get_section_level(node: nodes.section, title_levels: Dict[str, int]) -> int:
    """Return the nesting level of a section.

    The level is 1 for a section whose ancestors contain no other section,
    2 for a section nested in one section, and so on. This matches the
    ``level`` values used for section chunks elsewhere in this module
    (1=section, 2=subsection, ...).

    Args:
        node: The section node to inspect.
        title_levels: Unused; kept so existing callers keep working.

    Returns:
        The nesting level, or 0 when the node does not start with a title.
    """
    # A section without children, or without a leading title, has no level.
    if not node.children or not isinstance(node[0], nodes.title):
        return 0

    level = 1
    ancestor = node.parent
    while ancestor is not None:
        if isinstance(ancestor, nodes.section):
            level += 1
        ancestor = ancestor.parent
    return level


def extract_semantic_chunks(document: nodes.document, rst_content: str) -> List[Dict[str, Any]]:
    """Extract semantic chunks from a parsed RST document.

    Three kinds of chunk descriptions are produced:

    * ``document_title`` - when the first child of the document is a title.
    * ``section`` - one per section, holding the section's own text up to
      its first nested subsection; subsections get their own chunks.
    * ``<tagname>_directive`` - one per admonition (note, warning, ...) that
      is *not* located inside any section. Admonitions inside a section stay
      part of that section's chunk, as do code blocks.

    Args:
        document: Parsed docutils document.
        rst_content: Source text the document was parsed from.

    Returns:
        A list of dicts with the keys ``type``, ``name``, ``start_line``,
        ``end_line``, ``content``, ``level`` and ``depth``.
    """
    chunks = []
    source_lines = rst_content.split('\n')

    # Extract document title if present
    if document.children and isinstance(document.children[0], nodes.title):
        title_node = document.children[0]
        # ``line`` may be present but None, so getattr's default is not enough.
        title_line = getattr(title_node, 'line', None) or 1
        title_end = title_line + 2  # Usually title + decoration

        title_content = process_content_for_images('\n'.join(source_lines[:title_end]))

        chunks.append({
            'type': 'document_title',
            'name': str(title_node.astext()),
            'start_line': 1,
            'end_line': title_end,
            'content': title_content,
            'level': 0,
            'depth': 0
        })

    def process_section(section: nodes.section, parent_depth: int = 0):
        """Emit a chunk for *section*, then recurse into its direct subsections."""
        if not isinstance(section, nodes.section):
            return
        if not section.children or not isinstance(section[0], nodes.title):
            return

        title = section[0]
        section_start = getattr(title, 'line', None) or 1
        section_name = title.astext()

        # The section ends just before the next sibling that knows its line,
        # or at the end of the source when there is none.
        section_end = len(source_lines)
        siblings = section.parent.children
        for sibling in siblings[siblings.index(section) + 1:]:
            sibling_line = getattr(sibling, 'line', None)
            if sibling_line:
                section_end = sibling_line - 1
                break

        # The section's own content stops where its first subsection starts.
        section_content_end = section_end
        first_subsection = next(
            iter(section.findall(nodes.section, include_self=False)), None
        )
        if first_subsection is not None and first_subsection.children:
            subsection_line = getattr(first_subsection[0], 'line', None)
            if subsection_line:
                section_content_end = subsection_line - 1

        # Title plus body, without subsections
        last = min(section_content_end, len(source_lines))
        section_content = '\n'.join(source_lines[section_start - 1:last])

        # Replace images with descriptions
        section_content = process_content_for_images(section_content)

        if section_content.strip():
            chunks.append({
                'type': 'section',
                'name': section_name,
                'start_line': section_start,
                'end_line': section_content_end,
                'content': section_content,
                'level': parent_depth + 1,
                'depth': parent_depth
            })

        # Direct child sections only; deeper ones are reached by recursion
        for child in section.children:
            if isinstance(child, nodes.section):
                process_section(child, parent_depth + 1)

    # Process all top-level sections
    for child in document.children:
        if isinstance(child, nodes.section):
            process_section(child)

    # Note: Code blocks are NOT extracted separately - they stay within their
    # sections so that code remains intact within the section context.

    def inside_section(node) -> bool:
        """Return True when any ancestor of *node* is a section."""
        ancestor = node.parent
        while ancestor is not None:
            if isinstance(ancestor, nodes.section):
                return True
            ancestor = ancestor.parent
        return False

    # Standalone admonitions: scanned once for the whole document, whether or
    # not it contains sections.
    for node in document.findall():
        line_num = getattr(node, 'line', None)
        if not line_num:
            continue
        if not isinstance(node, nodes.Admonition) or inside_section(node):
            continue

        admonition_text = node.astext()
        admonition_type = node.tagname if hasattr(node, 'tagname') else 'admonition'
        chunks.append({
            'type': f'{admonition_type}_directive',
            'name': f'{admonition_type}_line_{line_num}',
            'start_line': line_num,
            'end_line': line_num + admonition_text.count('\n'),
            'content': admonition_text,
            'level': 8,
            'depth': 0
        })

    return chunks


# =============================================================================
# CHUNK CREATION
# =============================================================================

def create_rst_chunks(semantic_nodes: List[Dict[str, Any]]) -> List[RSTChunk]:
    """Turn chunk description dicts into ``RSTChunk`` objects.

    Each dict must provide ``start_line``, ``end_line``, ``content``,
    ``type`` and ``name``; ``depth`` and ``level`` default to 0 when absent.
    The input order is preserved.
    """
    return [
        RSTChunk(
            start_line=info['start_line'],
            end_line=info['end_line'],
            content=info['content'],
            node_type=info['type'],
            name=info['name'],
            depth=info.get('depth', 0),
            level=info.get('level', 0),
        )
        for info in semantic_nodes
    ]


def group_small_chunks(chunks: List[RSTChunk], target_tokens: int = TARGET_TOKENS) -> List[RSTChunk]:
    """Merge small chunks into larger ones of a reasonable size.

    If all chunks together fit within ``MAX_CHUNK_TOKENS`` they are joined
    into a single ``complete_document`` chunk. Otherwise the chunks are
    ordered by ``(level, start_line)`` and packed greedily: a chunk joins the
    current group when the group stays within ``MAX_CHUNK_TOKENS`` and either
    the group is empty or the total stays within 1.5x ``target_tokens``.
    Progress is reported on stdout.

    Args:
        chunks: Chunks to group.
        target_tokens: Desired size of a group, in tokens.

    Returns:
        The grouped chunks; the input is returned as-is when it is empty.
    """
    if not chunks:
        return chunks

    print(f"🔄 Grouping {len(chunks)} chunks (target: {target_tokens} tokens)")

    # Small documents collapse into one chunk
    total_tokens = sum(c.token_count for c in chunks)
    if total_tokens <= MAX_CHUNK_TOKENS:
        whole_document = RSTChunk(
            start_line=chunks[0].start_line,
            end_line=chunks[-1].end_line,
            content='\n\n'.join(c.content for c in chunks),
            node_type='complete_document',
            name=f"complete_document_{len(chunks)}_parts",
            depth=0,
            level=0
        )
        print(f"✅ Combined all {len(chunks)} chunks into 1 complete document ({total_tokens} tokens)")
        return [whole_document]

    def collapse(members):
        """Return the lone member, or a new chunk merging all *members*."""
        if len(members) == 1:
            return members[0]

        # Name the group after its node type when all members share one
        kinds = {m.node_type for m in members}
        if len(kinds) == 1:
            label = f"{next(iter(kinds))}_group_{len(members)}_parts"
        else:
            label = f"mixed_content_{len(members)}_parts"

        return RSTChunk(
            start_line=min(m.start_line for m in members),
            end_line=max(m.end_line for m in members),
            content='\n\n'.join(m.content for m in members),
            node_type='grouped_content',
            name=label,
            depth=min(m.depth for m in members),
            level=min(m.level for m in members)
        )

    merged = []
    pending = []
    pending_tokens = 0
    soft_cap = target_tokens * 1.5  # groups may grow to 1.5x the target

    # Similar hierarchy levels are packed together
    for chunk in sorted(chunks, key=lambda c: (c.level, c.start_line)):
        projected = pending_tokens + chunk.token_count
        within_hard_cap = projected <= MAX_CHUNK_TOKENS

        if within_hard_cap and (not pending or projected <= soft_cap):
            pending.append(chunk)
            pending_tokens = projected
            continue

        # Close the current group, then start a new one with this chunk
        if pending:
            merged.append(collapse(pending))
            print(f"  📦 Grouped {len(pending)} chunks → {pending_tokens} tokens")
        pending = [chunk]
        pending_tokens = chunk.token_count

    if pending:
        merged.append(collapse(pending))
        print(f"  📦 Final group: {len(pending)} chunks → {pending_tokens} tokens")

    print(f"✅ Grouping result: {len(chunks)} → {len(merged)} chunks")

    # Show final chunk sizes
    for position, result in enumerate(merged, start=1):
        print(f"    Chunk {position}: {result.token_count} tokens ({result.name})")

    return merged


def analyze_code_blocks_in_content(content: str) -> List[Dict[str, int]]:
    """Locate code blocks in RST text.

    Two constructs are recognised:

    * ``.. code-block::`` / ``.. literalinclude::`` directives
      (``type`` = ``'directive'``), including their option lines;
    * paragraphs ending in ``::`` that are not directives
      (``type`` = ``'literal'``).

    A block's body starts at the first indented line after the header and
    runs until a non-blank line with less indentation than that first line.
    Blank lines directly before that terminating line belong to the block.

    Args:
        content: RST text.

    Returns:
        Dicts with 0-based, inclusive ``start`` / ``end`` line indices into
        ``content.split('\\n')`` plus the block ``type``, in source order.
    """
    rows = content.split('\n')
    total = len(rows)

    def skip_body(pos):
        """Return the index of the first line after the indented body at *pos*."""
        body_indent = None
        while pos < total:
            row = rows[pos]
            if row.strip() == '':
                pos += 1
                continue
            width = len(row) - len(row.lstrip())
            if body_indent is None:
                # The first non-blank line decides whether there is a body at all
                if width <= 0:
                    break
                body_indent = width
            elif width < body_indent:
                break
            pos += 1
        return pos

    found = []
    cursor = 0

    while cursor < total:
        text = rows[cursor].strip()

        if text.startswith(('.. code-block::', '.. literalinclude::')):
            kind = 'directive'
        elif text.endswith('::') and not text.startswith('.. '):
            kind = 'literal'
        else:
            cursor += 1
            continue

        first = cursor
        cursor += 1

        # Only directives carry option lines
        if kind == 'directive':
            while cursor < total and rows[cursor].startswith('   :'):
                cursor += 1

        # One blank separator line before the body
        if cursor < total and rows[cursor].strip() == '':
            cursor += 1

        cursor = skip_body(cursor)
        found.append({
            'start': first,
            'end': cursor - 1,
            'type': kind
        })

    return found


def split_content_preserving_code_blocks(content: str, max_tokens: int) -> List[str]:
    """Cut text into pieces of at most ``max_tokens`` without breaking code blocks.

    Lines are accumulated into a piece until the next line, or the next whole
    code block, would push it over the limit. Code blocks (as reported by
    ``analyze_code_blocks_in_content``) are always kept in one piece; a block
    that is larger than the limit on its own becomes a separate, oversized
    piece.

    Args:
        content: Text to split.
        max_tokens: Token budget per piece.

    Returns:
        The pieces, in order. Their lines, concatenated, are the lines of
        ``content``.
    """
    rows = content.split('\n')

    # Code blocks keyed by the line they start on; the first entry for a
    # given start line is the one that is used.
    block_at = {}
    for block in analyze_code_blocks_in_content(content):
        block_at.setdefault(block['start'], block)

    pieces = []
    pending = []
    pending_tokens = 0
    cursor = 0

    while cursor < len(rows):
        block = block_at.get(cursor)

        if block is None:
            # Ordinary line: start a new piece when it no longer fits
            row = rows[cursor]
            cost = count_tokens(row)
            if pending and pending_tokens + cost > max_tokens:
                pieces.append('\n'.join(pending))
                pending = [row]
                pending_tokens = cost
            else:
                pending.append(row)
                pending_tokens += cost
            cursor += 1
            continue

        block_rows = rows[block['start']:block['end'] + 1]
        block_text = '\n'.join(block_rows)
        cost = count_tokens(block_text)

        # Close the running piece if the block does not fit into it
        if pending and pending_tokens + cost > max_tokens:
            pieces.append('\n'.join(pending))
            pending = []
            pending_tokens = 0

        if cost > max_tokens:
            # Oversized block: emitted alone, after whatever was pending
            if pending:
                pieces.append('\n'.join(pending))
                pending = []
                pending_tokens = 0
            pieces.append(block_text)
        else:
            pending.extend(block_rows)
            pending_tokens += cost

        # Continue after the block
        cursor = block['end'] + 1

    if pending:
        pieces.append('\n'.join(pending))

    return pieces
def sub_chunk_by_lines(chunk: RSTChunk, rst_content: str) -> List[RSTChunk]:
    """Break a chunk that exceeds MAX_CHUNK_TOKENS into smaller part-chunks.

    The chunk's text is re-read from ``rst_content`` using the chunk's
    1-based ``start_line``/``end_line`` and handed to
    ``split_content_preserving_code_blocks``. Each resulting piece becomes
    an ``RSTChunk`` named ``<name>_part_<n>`` one depth level below the
    original.

    Args:
        chunk: The chunk to split.
        rst_content: Full text of the document the chunk came from.

    Returns:
        ``[chunk]`` unchanged when it is within the limit or the splitter
        produced at most one piece; otherwise the list of part-chunks.
    """
    if chunk.token_count <= MAX_CHUNK_TOKENS:
        return [chunk]

    # Pull the chunk's line range out of the full document text.
    document_lines = rst_content.split('\n')
    region_text = '\n'.join(document_lines[chunk.start_line - 1:chunk.end_line])

    pieces = split_content_preserving_code_blocks(region_text, MAX_CHUNK_TOKENS)
    if len(pieces) <= 1:
        # The splitter could not divide the region any further.
        return [chunk]

    parts = []
    next_start = chunk.start_line
    for part_no, piece in enumerate(pieces, start=1):
        piece_line_count = len(piece.split('\n'))
        parts.append(RSTChunk(
            start_line=next_start,
            end_line=next_start + piece_line_count - 1,
            content=piece,
            node_type=f"{chunk.node_type}_part",
            name=f"{chunk.name}_part_{part_no}",
            depth=chunk.depth + 1,
            level=chunk.level
        ))
        # The next piece starts right after the lines consumed by this one.
        next_start += piece_line_count

    return parts


# =============================================================================
# FILE PROCESSING
# =============================================================================

def process_rst_file(file_path: Path) -> List[RSTChunk]:
    """Run the full chunking pipeline on one RST file.

    Steps: parse the file, report its references, extract semantic units,
    turn them into chunks, group small chunks, and sub-chunk anything
    still larger than MAX_CHUNK_TOKENS. Progress is printed along the way.

    Args:
        file_path: Path of the RST file to process.

    Returns:
        The final list of chunks, or an empty list when the document is
        empty or any step raises.
    """
    print(f"\n🔍 Processing: {file_path.name}")

    try:
        document, rst_content = parse_rst_file(file_path)

        if not document.children:
            print("⚠️ Empty document")
            return []
        print("✅ Successfully parsed RST structure")

        # References are only reported here; they are not returned.
        found_refs = extract_references(document, rst_content.split('\n'))
        if found_refs:
            print(f"📎 Found {len(found_refs)} references/includes")
            for item in found_refs:
                print(f"  - {item.reference_type}: {item.target}")

        units = extract_semantic_chunks(document, rst_content)
        print(f"Found {len(units)} semantic units")

        for unit in units:
            snippet = unit['content'][:100].replace('\n', ' ').strip()
            print(f"  - {unit['type']}: {unit['name']} ({count_tokens(unit['content'])} tokens)")
            print(f"    Preview: {snippet}...")

        # Build chunk objects, then merge the small ones.
        grouped = create_rst_chunks(units)
        grouped = group_small_chunks(grouped, target_tokens=TARGET_TOKENS)
        print(f"Created {len(grouped)} semantic chunks")

        results = []
        split_total = 0
        for piece in grouped:
            if piece.token_count <= MAX_CHUNK_TOKENS:
                results.append(piece)
                continue
            print(f"  Sub-chunking {piece.name} ({piece.token_count} tokens)")
            results.extend(sub_chunk_by_lines(piece, rst_content))
            split_total += 1

        if split_total > 0:
            print(f"Sub-chunked {split_total} oversized chunks")
        print(f"Final result: {len(results)} total chunks")

        return results

    except Exception as e:
        # Per-file boundary: report the failure and yield no chunks.
        print(f"❌ Error processing {file_path}: {e}")
        return []


# =============================================================================
# OUTPUT GENERATION
# =============================================================================

def generate_unique_id(length: int = 6) -> str:
    """Return a random identifier of ``length`` lowercase letters and digits.

    Characters are drawn with ``secrets.choice``.
    """
    pool = string.ascii_lowercase + string.digits
    picks = [secrets.choice(pool) for _ in range(length)]
    return ''.join(picks)


def create_chunk_filename(original_filename: str, chunk_number: int, unique_id: str) -> str:
    """Build a chunk file name such as ``index.rst_chunk_001_a1s2d3.md``.

    The chunk number is zero-padded to at least three digits.
    """
    template = "{}_chunk_{:03d}_{}.md"
    return template.format(original_filename, chunk_number, unique_id)


def create_chunk_markdown(chunk: RSTChunk, source_file_path: str, references: List[RSTReference]) -> str:
    """Render a chunk as markdown with a YAML front-matter header.

    String values in the front matter are emitted as JSON string literals,
    which are also valid YAML double-quoted scalars. This keeps the header
    parseable when a section title contains ``"`` or a path contains
    backslashes. The references list is emitted as a JSON array, which is a
    valid YAML flow sequence.

    Args:
        chunk: The chunk to render.
        source_file_path: Path recorded in the ``file_path`` field.
        references: All references of the source file; only those whose
            ``line_number`` falls inside the chunk's line range are listed.

    Returns:
        The complete markdown document as a string. A fresh random
        ``chunk_id`` is generated on every call.
    """
    # Local import: json is not among the file's top-level imports.
    import json

    def yaml_string(value) -> str:
        # JSON string literal == YAML double-quoted scalar, with escaping.
        return json.dumps(str(value), ensure_ascii=False)

    unique_id = generate_unique_id()

    # Keep only the references located inside this chunk's line range.
    chunk_references = [
        f"{ref.reference_type}: {ref.target}"
        for ref in references
        if chunk.start_line <= ref.line_number <= chunk.end_line
    ]
    references_yaml = json.dumps(chunk_references, ensure_ascii=False)

    frontmatter = f"""---
file_path: {yaml_string(source_file_path)}
chunk_id: {yaml_string(unique_id)}
chunk_type: {yaml_string(chunk.node_type)}
chunk_name: {yaml_string(chunk.name)}
start_line: {chunk.start_line}
end_line: {chunk.end_line}
token_count: {chunk.token_count}
depth: {chunk.depth}
level: {chunk.level}
language: "rst"
references: {references_yaml}
---

# {chunk.name}

**Type:** {chunk.node_type}  
**Tokens:** {chunk.token_count}  
**Depth:** {chunk.depth}  
**Level:** {chunk.level}

```rst
{chunk.content}
```
"""
    return frontmatter


def save_chunks_to_files(chunks: List[RSTChunk], 
                        original_file_path: Path, 
                        input_directory: Path,
                        output_base: Path,
                        references: List[RSTReference]) -> List[str]:
    """Write each chunk to a markdown file, mirroring the input directory layout.

    Args:
        chunks: Chunks produced for ``original_file_path``.
        original_file_path: The RST file the chunks came from.
        input_directory: Root of the scanned tree; used to compute the
            relative output location.
        output_base: Root directory for the generated markdown files.
        references: References of the source file, passed through to
            ``create_chunk_markdown``.

    Returns:
        Paths (as strings) of the files that were written successfully.
        Files that fail to write are reported and skipped.
    """
    if not chunks:
        return []

    # Calculate relative path from input directory
    try:
        rel_path = original_file_path.relative_to(input_directory)
    except ValueError:
        # File lies outside input_directory: fall back to the bare file name.
        # It must stay a Path because ``rel_path.parent`` is used below.
        rel_path = Path(original_file_path.name)

    # Create output directory structure
    output_dir = output_base / rel_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # One random ID shared by all chunk files of this source file
    file_unique_id = generate_unique_id()

    saved_files = []

    for i, chunk in enumerate(chunks, 1):
        chunk_filename = create_chunk_filename(
            original_file_path.name,
            i,
            file_unique_id
        )

        markdown_content = create_chunk_markdown(
            chunk,
            str(rel_path),
            references
        )

        chunk_file_path = output_dir / chunk_filename
        try:
            with open(chunk_file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
        except (OSError, UnicodeError) as e:
            # Best effort: report the failure and continue with the next chunk.
            print(f"  ❌ Error saving {chunk_filename}: {e}")
        else:
            saved_files.append(str(chunk_file_path))
            print(f"  ✅ Saved: {chunk_filename}")

    return saved_files


def print_chunk_summary(chunks: List[RSTChunk], file_name: str):
    """Print one numbered entry per chunk, indented by the chunk's depth.

    Each entry shows the chunk name followed by its type, level, line
    count and token count.
    """
    print(f"\n--- RST Chunk Summary for {file_name} ---")

    for position, item in enumerate(chunks, start=1):
        pad = "  " * item.depth
        # Number of lines in the content = newline count + 1.
        line_total = item.content.count('\n') + 1

        print(f"{pad}{position}. {item.name}")
        print(f"{pad}   Type: {item.node_type} | Level: {item.level} | Lines: {line_total} | Tokens: {item.token_count}")


# =============================================================================
# JUPYTER NOTEBOOK FUNCTIONS
# =============================================================================

def process_rst_content(rst_content: str, file_name: str = "content.rst") -> List[RSTChunk]:
    """
    Chunk RST text held in memory (notebook entry point).

    Include directives are neutralised before parsing, while references
    and chunk text are taken from the original, unmodified content.

    Args:
        rst_content: Raw RST content as string
        file_name: Name used as the document source and in printed messages

    Returns:
        List of RSTChunk objects; empty when the document has no children
        or an error occurs
    """
    try:
        # Parse a copy with include directives commented out.
        parse_input = remove_include_directives(rst_content)

        from docutils.frontend import get_default_settings

        parser_settings = get_default_settings(Parser)
        parser_settings.report_level = 4  # Show errors but not warnings
        parser_settings.halt_level = 5    # Don't halt on errors
        parser_settings.warning_stream = None  # Suppress warnings

        document = new_document(file_name, settings=parser_settings)
        rst_parser = Parser()

        try:
            rst_parser.parse(parse_input, document)
            print(f"✅ Successfully parsed RST structure for {file_name}")
        except Exception as parse_error:
            # Keep going with whatever part of the tree was built.
            print(f"⚠️ Parse error in {file_name}: {parse_error}")

        if not document.children:
            print(f"⚠️ Empty document: {file_name}")
            return []

        # References come from the original text, not the modified copy.
        found_refs = extract_references_from_source(rst_content)
        if found_refs:
            print(f"📎 Found {len(found_refs)} references/includes")
            for item in found_refs:
                print(f"  - {item.reference_type}: {item.target}")

        units = extract_semantic_chunks(document, rst_content)
        print(f"Found {len(units)} semantic units")

        for unit in units:
            snippet = unit['content'][:100].replace('\n', ' ').strip()
            print(f"  - {unit['type']}: {unit['name']} ({count_tokens(unit['content'])} tokens)")
            print(f"    Preview: {snippet}...")

        # Build chunk objects, then merge the small ones.
        grouped = create_rst_chunks(units)
        grouped = group_small_chunks(grouped, target_tokens=TARGET_TOKENS)
        print(f"Created {len(grouped)} semantic chunks")

        results = []
        split_total = 0
        for piece in grouped:
            if piece.token_count <= MAX_CHUNK_TOKENS:
                results.append(piece)
                continue
            print(f"  Sub-chunking {piece.name} ({piece.token_count} tokens)")
            results.extend(sub_chunk_by_lines(piece, rst_content))
            split_total += 1

        if split_total > 0:
            print(f"Sub-chunked {split_total} oversized chunks")
        print(f"Final result: {len(results)} total chunks")

        return results

    except Exception as e:
        # Boundary handler: report the failure and yield no chunks.
        print(f"❌ Error processing RST content: {e}")
        return []


def display_chunks(chunks: List[RSTChunk]) -> None:
    """Print a summary banner, token totals and a short preview per chunk.

    The preview is the first 200 characters of the chunk content with
    newlines replaced by spaces; an ellipsis is appended when the content
    is longer than that.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"📄 RST CHUNKS SUMMARY ({len(chunks)} chunks)")
    print(rule)

    total_tokens = sum(item.token_count for item in chunks)
    print(f"📊 Total tokens: {total_tokens:,}")
    if chunks:
        print(f"📊 Average tokens per chunk: {total_tokens/len(chunks):.1f}")
    else:
        print("No chunks")

    for position, item in enumerate(chunks, start=1):
        pad = "  " * item.depth
        line_total = item.content.count('\n') + 1

        print(f"\n{pad}📝 Chunk {position}: {item.name}")
        print(f"{pad}   Type: {item.node_type} | Level: {item.level} | Lines: {line_total} | Tokens: {item.token_count}")

        # Flatten the first 200 characters onto one line.
        snippet = item.content[:200].replace('\n', ' ').strip()
        if len(item.content) > 200:
            snippet = snippet + "..."
        print(f"{pad}   Preview: {snippet}")


def save_chunks_as_markdown(chunks: List[RSTChunk], output_dir: str = "rst_chunks") -> None:
    """Save chunks as markdown files (notebook version).

    Files are named ``chunk_<NNN>_<id>.md`` with one random id shared by the
    whole batch. The front matter records ``notebook_content.rst`` as the
    source path and lists no references.

    Args:
        chunks: Chunks to write.
        output_dir: Target directory. It is created if missing, including
            any missing parent directories.
    """
    from pathlib import Path

    output_path = Path(output_dir)
    # parents=True so a nested target such as "out/rst_chunks" works even
    # when "out" does not exist yet.
    output_path.mkdir(parents=True, exist_ok=True)

    file_unique_id = generate_unique_id()
    saved_count = 0

    print(f"\n💾 Saving {len(chunks)} chunks to {output_path}/")

    for i, chunk in enumerate(chunks, 1):
        chunk_filename = f"chunk_{i:03d}_{file_unique_id}.md"

        markdown_content = create_chunk_markdown(chunk, "notebook_content.rst", [])

        chunk_file_path = output_path / chunk_filename
        try:
            with open(chunk_file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            print(f"  ✅ Saved: {chunk_filename}")
            saved_count += 1

        except Exception as e:
            # Best effort: report the failure and continue with the next chunk.
            print(f"  ❌ Error saving {chunk_filename}: {e}")

    print(f"✅ Successfully saved {saved_count} chunk files")


# =============================================================================
# MAIN PROCESSING
# =============================================================================

def main():
    """Interactive entry point: chunk every RST-looking file under a directory.

    Prompts for a source directory (default: current directory), collects
    ``*.rst`` and ``*.txt`` files recursively, keeps those whose first 1000
    characters contain an RST-like marker, processes each file, prints a
    summary, and writes the chunks as markdown files into a sibling
    directory named ``<source>_rst_chunks``.
    """
    print("🚀 RST (reStructuredText) Semantic Chunking")
    print(f"Max chunk tokens: {MAX_CHUNK_TOKENS}")
    print(f"Target tokens for grouping: {TARGET_TOKENS}")

    # Get directory from user or use current directory
    directory = input("\nEnter source directory path (or press Enter for current directory): ").strip()
    if not directory:
        directory = "."

    target_dir = Path(directory).resolve()
    if not target_dir.exists():
        print(f"❌ Directory not found: {directory}")
        return

    # Create output directory parallel to source directory
    output_dir = target_dir.parent / f"{target_dir.name}_rst_chunks"
    output_dir.mkdir(exist_ok=True)
    print(f"📁 Output directory: {output_dir}")

    input_directory = target_dir

    # Collect candidate files
    candidates = []
    for ext in ['*.rst', '*.txt']:
        candidates.extend(target_dir.rglob(ext))

    # Keep only files whose beginning looks like RST
    rst_markers = ['===', '---', '~~~', '^^^', '.. ', '::']
    rst_files = []
    for file in candidates:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                content = f.read(1000)  # Check first 1000 chars
        except (OSError, UnicodeDecodeError):
            # Unreadable or non-UTF-8 file: skip it. A bare except here
            # would also swallow KeyboardInterrupt/SystemExit.
            continue
        if any(marker in content for marker in rst_markers):
            rst_files.append(file)

    if not rst_files:
        print(f"❌ No RST files found in {directory}")
        return

    print(f"🔍 Found {len(rst_files)} RST file(s)")

    # Group by directory for display
    by_dir = {}
    for f in rst_files:
        dir_path = str(f.parent.relative_to(input_directory)) if f.parent != input_directory else '.'
        by_dir.setdefault(dir_path, []).append(f.name)

    for dir_path, files in sorted(by_dir.items()):
        print(f"  📂 {dir_path}: {len(files)} files")
        for file_name in sorted(files)[:3]:  # Show first 3 files
            print(f"    📄 {file_name}")
        if len(files) > 3:
            print(f"    ... and {len(files) - 3} more")

    # Process all files automatically
    print(f"\n🔄 Processing all {len(rst_files)} file(s)...")
    all_chunks = {}
    all_references = {}

    for file_path in rst_files:
        try:
            chunks = process_rst_file(file_path)
            all_chunks[file_path] = chunks

            # process_rst_file returns only chunks, so the file is parsed
            # again here to collect its references for the front matter.
            document, rst_content = parse_rst_file(file_path)
            references = extract_references(document, rst_content.split('\n'))
            all_references[file_path] = references

            print_chunk_summary(chunks, file_path.name)

        except Exception as e:
            # Per-file boundary: report and move on to the next file.
            print(f"❌ Error processing {file_path}: {e}")
            continue

    # Summary
    total_chunks = sum(len(chunks) for chunks in all_chunks.values())
    total_tokens = sum(chunk.token_count for chunks in all_chunks.values() for chunk in chunks)

    print(f"\n📊 Processing Summary:")
    print(f"   Files processed: {len(all_chunks)}")
    print(f"   Total chunks: {total_chunks}")
    print(f"   Total tokens: {total_tokens:,}")
    print(f"   Average tokens per chunk: {total_tokens/total_chunks:.1f}" if total_chunks > 0 else "   No chunks created")

    # Save chunks automatically with parallel directory structure
    print(f"\n💾 Saving chunks to markdown files...")
    saved_count = 0

    for file_path, chunks in all_chunks.items():
        references = all_references.get(file_path, [])
        saved_files = save_chunks_to_files(chunks, file_path, input_directory, output_dir, references)
        saved_count += len(saved_files)

    print(f"✅ Saved {saved_count} chunk files to {output_dir}")
    print(f"📁 Directory structure preserved in output")


# Run the interactive chunking workflow only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
#!/usr/bin/env python3
"""
RST (reStructuredText) AST Chunker

This module implements semantic chunking for RST files using docutils for parsing.
It follows the same patterns as the existing Python and TypeScript AST chunkers.
"""

import sys
import string
import secrets
import tiktoken
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
from docutils import core, nodes
from docutils.frontend import OptionParser
from docutils.utils import new_document
from docutils.parsers.rst import Parser


# =============================================================================
# CONFIGURATION AND CONSTANTS
# =============================================================================

# Upper bound on tokens per chunk; chunks above this are sub-chunked.
MAX_CHUNK_TOKENS = 1000
# Token budget passed to group_small_chunks when merging small chunks.
TARGET_TOKENS = 600


# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class RSTChunk:
    """A contiguous piece of RST content together with its metadata.

    ``token_count`` may be supplied by the caller; when it is left at 0 it
    is computed from ``content`` right after construction.
    """
    start_line: int
    end_line: int
    content: str
    node_type: str
    name: str
    depth: int
    level: int = 0  # Section hierarchy: 0=title, 1=section, 2=subsection, ...
    token_count: int = 0

    def __post_init__(self):
        # A non-zero count was provided explicitly; keep it as is.
        if self.token_count != 0:
            return
        self.token_count = count_tokens(self.content)


@dataclass
class RSTReference:
    """Represents a reference/include in RST (similar to imports)"""
    # Kind of directive: 'include', 'image', 'figure', 'literalinclude', etc.
    reference_type: str  # 'include', 'image', 'figure', 'literalinclude', etc.
    # Text following the directive's '::' (or the node's 'uri' attribute).
    target: str
    # 1-based line number where the reference was found.
    line_number: int
    # The directive text: the source line, or a reconstructed
    # ".. <type>:: <target>" string for references taken from the node tree.
    directive: str


# =============================================================================
# TOKEN COUNTING
# =============================================================================

def count_tokens(content: str) -> int:
    """Return the number of tokens in ``content`` using the GPT-4 encoding.

    If the tiktoken lookup or encoding raises, the count is estimated as
    one token per four characters.
    """
    try:
        token_ids = tiktoken.encoding_for_model("gpt-4").encode(content)
    except Exception:
        # Fallback: rough estimation (1 token ≈ 4 characters)
        return len(content) // 4
    return len(token_ids)


# =============================================================================
# RST PARSING AND ANALYSIS
# =============================================================================

def remove_include_directives(rst_content: str) -> str:
    """
    Turn every ``.. include::`` directive into an RST comment line.

    The option lines directly below a directive (lines starting with three
    spaces and a colon) are commented out as well. The number of lines is
    unchanged, so line numbers still match the original text.
    """
    result = []
    # True while consuming the option lines that follow an include directive.
    in_options = False

    for raw in rst_content.split('\n'):
        if in_options and raw.startswith('   :'):
            result.append(f".. # INCLUDE OPTION: {raw.strip()}")
            continue
        in_options = False

        stripped = raw.strip()
        if stripped.startswith('.. include::'):
            result.append(f".. # INCLUDE DISABLED: {stripped}")
            in_options = True
        else:
            result.append(raw)

    return '\n'.join(result)


def parse_rst_file(file_path: Path) -> Tuple[nodes.document, str]:
    """Read an RST file and parse it with docutils, with two fallbacks.

    The file is first parsed as-is with the working directory switched to
    the file's directory. If that raises, it is parsed again with include
    directives commented out. If that raises too, an empty document is
    returned. The original working directory is restored in every case.

    Args:
        file_path: Path of the RST file to read.

    Returns:
        A ``(document, rst_content)`` tuple; ``rst_content`` is always the
        original, unmodified file text.

    Raises:
        Exception: Any error raised while reading the file (or escaping the
            fallbacks) is reported and re-raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            rst_content = f.read()
        
        # Remember the current directory so it can be restored in `finally`.
        original_cwd = Path.cwd()
        file_dir = file_path.parent
        
        try:
            # Change to file's directory so relative includes can be resolved.
            # NOTE(review): os.chdir affects the whole process.
            import os
            os.chdir(file_dir)
            
            # NOTE(review): this name is also used by the except handlers
            # below; if this import itself fails they would hit NameError.
            from docutils.frontend import get_default_settings
            
            settings = get_default_settings(Parser)
            # Configure settings to be more permissive
            settings.report_level = 4  # Only show errors, not warnings
            settings.halt_level = 5    # Don't halt on warnings/errors
            settings.warning_stream = None  # Suppress warning output
            
            # Use file path as source path for better error context
            source_path = str(file_path)
            document = new_document(source_path, settings=settings)
            
            parser = Parser()
            parser.parse(rst_content, document)
            
            print(f"✅ Successfully parsed {file_path.name} with includes")
            return document, rst_content
            
        except Exception as include_error:
            print(f"⚠️ Include resolution failed for {file_path.name}: {include_error}")
            
            # Fallback 1: parse a copy with include directives commented out
            try:
                rst_content_no_includes = remove_include_directives(rst_content)
                
                settings = get_default_settings(Parser)
                settings.report_level = 5  # Suppress all warnings
                settings.halt_level = 5    # Don't halt on errors
                settings.warning_stream = None
                
                document = new_document(str(file_path), settings=settings)
                parser = Parser()
                parser.parse(rst_content_no_includes, document)
                
                print(f"✅ Successfully parsed {file_path.name} with includes disabled")
                return document, rst_content  # Return original content, not modified
                
            except Exception as parse_error:
                print(f"⚠️ Full parsing failed for {file_path.name}: {parse_error}")
                # Fallback 2: hand back a fresh, unparsed document
                settings = get_default_settings(Parser)
                document = new_document(str(file_path), settings=settings)
                return document, rst_content
                
        finally:
            # Always restore original working directory
            os.chdir(original_cwd)
        
    except Exception as e:
        # Report, then let the caller decide how to handle the failure.
        print(f"❌ Error reading {file_path}: {e}")
        raise


def filter_raw_html(content: str) -> str:
    """
    Strip ``.. raw:: html`` directives and their bodies from RST text.

    After a directive line, every following line that is blank or starts
    with three spaces is treated as part of its body and dropped too.
    """
    lines = content.split('\n')
    total = len(lines)
    kept = []
    pos = 0

    while pos < total:
        current = lines[pos]
        pos += 1

        if not current.strip().startswith('.. raw:: html'):
            kept.append(current)
            continue

        # Skip the directive body: indented or blank lines that follow.
        while pos < total and (lines[pos].startswith('   ') or lines[pos].strip() == ''):
            pos += 1

    return '\n'.join(kept)


def extract_references(document: nodes.document, source_lines: List[str]) -> List[RSTReference]:
    """Collect references from the parsed document and from its source lines.

    Image/figure nodes that carry a line number and a ``uri`` attribute are
    taken from the node tree. ``include``, ``literalinclude`` and
    ``csv-table`` directives are taken from the source lines. If nothing
    was collected (including when the collection raised), the purely
    text-based ``extract_references_from_source`` is used instead.

    Args:
        document: Parsed docutils document.
        source_lines: The document's source text split into lines.

    Returns:
        List of RSTReference objects.
    """
    collected = []
    line_directives = ['include', 'literalinclude', 'csv-table']

    def is_media_node(candidate):
        return hasattr(candidate, 'tagname') and candidate.tagname in ['image', 'figure']

    try:
        # Pass 1: image/figure nodes from the parsed tree.
        for media in list(document.findall(condition=is_media_node)):
            media_line = getattr(media, 'line', None)
            if not (media_line and hasattr(media, 'attributes')):
                continue
            uri = media.attributes.get('uri', '')
            if not uri:
                continue
            collected.append(RSTReference(
                reference_type=media.tagname,
                target=uri,
                line_number=media_line,
                directive=f".. {media.tagname}:: {uri}"
            ))

        # Pass 2: directives that are only visible in the source text.
        for line_num, text in enumerate(source_lines, 1):
            if not text.strip().startswith('..'):
                continue
            kind = next((name for name in line_directives if f'.. {name}::' in text), None)
            if kind is None:
                continue
            _, separator, remainder = text.partition('::')
            if separator:
                collected.append(RSTReference(
                    reference_type=kind,
                    target=remainder.strip(),
                    line_number=line_num,
                    directive=text
                ))
    except Exception:
        # Keep whatever was collected so far; the fallback below covers
        # the case where nothing was.
        pass

    if not collected:
        collected = extract_references_from_source('\n'.join(source_lines))

    return collected


def extract_references_from_source(rst_content: str) -> List[RSTReference]:
    """Find reference directives by scanning the raw RST text line by line.

    Recognised directives: ``include``, ``literalinclude``, ``image``,
    ``figure`` and ``csv-table``. The target is the text after the first
    ``::`` on the (stripped) line; line numbers are 1-based.
    """
    directive_names = ('include', 'literalinclude', 'image', 'figure', 'csv-table')
    found = []

    for line_num, raw in enumerate(rst_content.split('\n'), 1):
        text = raw.strip()
        if not text.startswith('..'):
            continue

        # First directive name present on this line, if any.
        kind = next((name for name in directive_names if f'.. {name}::' in text), None)
        if kind is None:
            continue

        _, separator, remainder = text.partition('::')
        if separator:
            found.append(RSTReference(
                reference_type=kind,
                target=remainder.strip(),
                line_number=line_num,
                directive=text
            ))

    return found


def get_section_level(node: nodes.section, title_levels: Dict[str, int]) -> int:
    """Return the zero-based nesting depth of a section.

    The depth is the number of ``section`` ancestors of ``node``: 0 for a
    top-level section, 1 for a subsection, and so on. Callers add 1 to get
    the chunk level (the document title is level 0).

    The previous implementation counted *descendant* sections, which gave
    leaf sections 0 and their parents larger values, i.e. the hierarchy
    was inverted.

    Args:
        node: The section node to measure.
        title_levels: Unused; kept for interface compatibility.

    Returns:
        The nesting depth, or 0 when the section has no title as its first
        child (including a section with no children at all).
    """
    if not node.children or not isinstance(node[0], nodes.title):
        return 0

    depth = 0
    ancestor = node.parent
    while ancestor is not None:
        if isinstance(ancestor, nodes.section):
            depth += 1
        ancestor = ancestor.parent
    return depth


def _node_line(node: Any, default: int) -> int:
    """Return ``node.line`` when it is set, otherwise ``default``.

    docutils defines ``line = None`` on the base node class, so a plain
    ``getattr(node, 'line', default)`` yields None for nodes without a
    recorded line.
    """
    line = getattr(node, 'line', None)
    return default if line is None else line


def extract_semantic_chunks(document: nodes.document, rst_content: str) -> List[Dict[str, Any]]:
    """Extract semantic units (title, sections, top-level admonitions).

    Each unit is a dict with the keys ``type``, ``name``, ``start_line``,
    ``end_line``, ``content``, ``level`` and ``depth``.

    * A document title (first child of the document) becomes a
      ``document_title`` unit.
    * Every titled section becomes a ``section`` unit. Its text is sliced
      from the source lines, starting at the title's line and ending just
      before the first subsection, or at the largest line number recorded
      on any node in the section when there are no subsections.
    * Admonitions that are not inside any section become
      ``<tagname>_directive`` units.

    Nodes without a recorded line number are ignored when computing line
    ranges, so mixing them with numbered nodes no longer raises.

    Args:
        document: Parsed docutils document.
        rst_content: Source text of the document.

    Returns:
        List of unit dicts in the order described above.
    """
    chunks = []
    source_lines = rst_content.split('\n')
    total_lines = len(source_lines)

    # Document title, if the tree has one as its first child
    if document.children and isinstance(document.children[0], nodes.title):
        title_node = document.children[0]
        title_line = _node_line(title_node, 1)
        chunks.append({
            'type': 'document_title',
            'name': title_node.astext(),
            'start_line': title_line,
            'end_line': title_line,
            'content': title_node.astext(),
            'level': 0,
            'depth': 0
        })

    # Sections and their own content
    for section in document.findall(nodes.section):
        has_title = bool(section.children) and isinstance(section[0], nodes.title)
        if not has_title:
            continue
        title = section[0]

        section_name = title.astext()
        section_line = _node_line(title, 1)

        level = get_section_level(section, {})

        section_start_line = section_line
        section_end_line = section_line

        # Title lines of nested sections mark where this section's own
        # content stops.
        subsection_title_lines = [
            _node_line(sub[0], total_lines)
            for sub in section.findall(nodes.section)
            if sub is not section and sub.children and isinstance(sub[0], nodes.title)
        ]

        if subsection_title_lines:
            section_end_line = min(subsection_title_lines) - 1
        else:
            # No (titled) subsections: extend to the last recorded line.
            recorded_lines = [
                inner.line for inner in section.findall()
                if getattr(inner, 'line', None) is not None
            ]
            if recorded_lines:
                section_end_line = max(recorded_lines)

        # Slice the section text out of the source; fall back to the
        # node's plain text when the start line is out of range.
        if section_start_line <= total_lines:
            end_idx = min(section_end_line, total_lines)
            section_content = '\n'.join(source_lines[section_start_line - 1:end_idx])
        else:
            section_content = section.astext()

        chunks.append({
            'type': 'section',
            'name': section_name,
            'start_line': section_start_line,
            'end_line': section_end_line,
            'content': section_content,
            'level': level + 1,  # +1 because document title is level 0
            'depth': level
        })

    # Admonitions (notes, warnings, etc.) that live outside every section
    for node in document.findall(nodes.Admonition):
        line_num = getattr(node, 'line', None)
        if not line_num:
            continue

        inside_section = False
        current = node.parent
        while current is not None:
            if isinstance(current, nodes.section):
                inside_section = True
                break
            current = current.parent
        if inside_section:
            # Already covered by the enclosing section's unit.
            continue

        admonition_text = node.astext()
        admonition_type = node.tagname if hasattr(node, 'tagname') else 'admonition'
        chunks.append({
            'type': f'{admonition_type}_directive',
            'name': f'{admonition_type}_line_{line_num}',
            'start_line': line_num,
            'end_line': line_num + admonition_text.count('\n'),
            'content': admonition_text,
            'level': 8,
            'depth': 0
        })

    return chunks


# =============================================================================
# CHUNK CREATION
# =============================================================================

def create_rst_chunks(semantic_nodes: List[Dict[str, Any]]) -> List[RSTChunk]:
    """Turn semantic-node dictionaries into RSTChunk instances.

    Every dictionary must carry 'start_line', 'end_line', 'content', 'type'
    and 'name'. 'depth' and 'level' are optional and fall back to 0.
    The returned list keeps the order of ``semantic_nodes``.
    """
    return [
        RSTChunk(
            start_line=info['start_line'],
            end_line=info['end_line'],
            content=info['content'],
            node_type=info['type'],
            name=info['name'],
            depth=info.get('depth', 0),
            level=info.get('level', 0),
        )
        for info in semantic_nodes
    ]


def group_small_chunks(chunks: List[RSTChunk], target_tokens: int = 600) -> List[RSTChunk]:
    """Merge neighbouring chunks so that tiny chunks are not emitted alone.

    Args:
        chunks: Chunks in document order.
        target_tokens: Token budget of one group; a group is closed as soon
            as adding the next chunk would push it above this value.

    Returns:
        The input itself when it is empty; a single "combined_document"
        chunk when all chunks together fit in MAX_CHUNK_TOKENS; otherwise a
        list in which each run of chunks became one "grouped_sections"
        chunk (a run of length one is passed through untouched).

    NOTE(review): a chunk larger than MAX_CHUNK_TOKENS never closes the
    pending group, so it is appended to whatever is pending. Confirm this is
    intended before relying on group sizes.
    """
    if not chunks:
        return chunks

    # Whole document fits in one chunk: collapse everything.
    if sum(c.token_count for c in chunks) <= MAX_CHUNK_TOKENS:
        return [RSTChunk(
            start_line=min(c.start_line for c in chunks),
            end_line=max(c.end_line for c in chunks),
            content="\n\n".join(c.content for c in chunks),
            node_type="combined_document",
            name="complete_document",
            depth=0,
            level=0
        )]

    def as_single_chunk(members):
        """Return ``members`` as one chunk; a lone member is returned as is."""
        if len(members) == 1:
            return members[0]
        return RSTChunk(
            start_line=min(m.start_line for m in members),
            end_line=max(m.end_line for m in members),
            content="\n\n".join(m.content for m in members),
            node_type="grouped_sections",
            name=f"sections_{members[0].name}_to_{members[-1].name}",
            depth=min(m.depth for m in members),
            level=min(m.level for m in members)
        )

    result = []
    pending = []
    pending_tokens = 0

    for chunk in chunks:
        over_budget = pending_tokens + chunk.token_count > target_tokens
        if over_budget and pending and chunk.token_count <= MAX_CHUNK_TOKENS:
            # Close the pending group and start a new one with this chunk.
            result.append(as_single_chunk(pending))
            pending = [chunk]
            pending_tokens = chunk.token_count
        else:
            pending.append(chunk)
            pending_tokens += chunk.token_count

    # Whatever is still pending forms the last group.
    if pending:
        result.append(as_single_chunk(pending))

    return result


def analyze_code_blocks_in_content(content: str) -> List[Dict[str, int]]:
    """Locate code/literal blocks in RST text.

    A block starts at a line that (after stripping) begins with
    ``.. code-block::`` or ``.. sourcecode::``, or that ends with ``::`` and
    is longer than two characters. It extends over the blank/option lines
    that follow and then over the indented body.

    The body is every line indented deeper than the marker line. Blank
    lines inside the body belong to the block as long as more body lines
    follow, so code containing empty lines (or indented by fewer than four
    spaces) is kept in one piece.

    Args:
        content: RST text.

    Returns:
        One ``{'start': i, 'end': j}`` dict per block, in document order,
        with 0-based inclusive indices into ``content.split('\\n')``.
    """
    lines = content.split('\n')
    total = len(lines)

    def indent_of(text: str) -> int:
        """Number of leading whitespace characters of ``text``."""
        return len(text) - len(text.lstrip())

    code_blocks = []
    i = 0

    while i < total:
        marker = lines[i].strip()
        is_marker = (
            marker.startswith(('.. code-block::', '.. sourcecode::'))
            or (marker.endswith('::') and len(marker) > 2)
        )
        if not is_marker:
            i += 1
            continue

        start_line = i
        marker_indent = indent_of(lines[i])
        i += 1

        # Blank lines and directive options (e.g. "   :linenos:") sit between
        # the marker and the body.
        while i < total and (lines[i].strip() == '' or lines[i].startswith('   :')):
            i += 1

        # Consume the body: anything indented deeper than the marker.
        while i < total:
            if lines[i].strip():
                if indent_of(lines[i]) <= marker_indent:
                    break
                i += 1
                continue

            # Blank line: it only belongs to the block when the next
            # non-blank line is still part of the body.
            lookahead = i
            while lookahead < total and not lines[lookahead].strip():
                lookahead += 1
            if lookahead < total and indent_of(lines[lookahead]) > marker_indent:
                i = lookahead
            else:
                break

        code_blocks.append({
            'start': start_line,
            'end': i - 1
        })

    return code_blocks


def split_content_preserving_code_blocks(content: str, max_tokens: int) -> List[str]:
    """Split text into pieces of roughly ``max_tokens`` without cutting code blocks.

    Lines are accumulated until the next line (or the next whole code block)
    would push the running token total above ``max_tokens``; the accumulated
    lines are then emitted as one piece. Code blocks reported by
    analyze_code_blocks_in_content() are always moved as a unit; a code
    block that is larger than ``max_tokens`` on its own becomes a piece of
    its own.

    Every input line ends up in exactly one piece and the pieces are in
    document order, so the line counts of the pieces add up to the line
    count of ``content``.

    Args:
        content: Text to split.
        max_tokens: Token budget per piece (single lines or code blocks
            that exceed it are still emitted whole).

    Returns:
        The pieces, each joined with newlines.
    """
    lines = content.split('\n')

    # Map "first line of a code block" -> "last line of that block" so the
    # per-line check below is a dictionary lookup instead of a scan.
    block_end_by_start = {}
    for block in analyze_code_blocks_in_content(content):
        block_end_by_start.setdefault(block['start'], block['end'])

    chunks = []
    current_chunk_lines = []
    current_tokens = 0
    line_idx = 0

    while line_idx < len(lines):
        block_end = block_end_by_start.get(line_idx)

        if block_end is not None:
            code_block_lines = lines[line_idx:block_end + 1]
            code_block_content = '\n'.join(code_block_lines)
            code_block_tokens = count_tokens(code_block_content)

            # The block does not fit behind what is pending: emit the pending
            # lines first.
            if current_tokens + code_block_tokens > max_tokens and current_chunk_lines:
                chunks.append('\n'.join(current_chunk_lines))
                current_chunk_lines = []
                current_tokens = 0

            if code_block_tokens > max_tokens:
                # Oversized block: nothing can be pending here (it was just
                # flushed above), so the block simply becomes its own piece.
                chunks.append(code_block_content)
            else:
                current_chunk_lines.extend(code_block_lines)
                current_tokens += code_block_tokens

            line_idx = block_end + 1
        else:
            line = lines[line_idx]
            line_tokens = count_tokens(line)

            if current_tokens + line_tokens > max_tokens and current_chunk_lines:
                # Start a new piece with this line.
                chunks.append('\n'.join(current_chunk_lines))
                current_chunk_lines = [line]
                current_tokens = line_tokens
            else:
                current_chunk_lines.append(line)
                current_tokens += line_tokens

            line_idx += 1

    if current_chunk_lines:
        chunks.append('\n'.join(current_chunk_lines))

    return chunks


def sub_chunk_by_lines(chunk: RSTChunk, rst_content: str) -> List[RSTChunk]:
    """Break an oversized chunk into parts, keeping code blocks whole.

    The chunk's text is re-read from ``rst_content`` using its 1-based
    ``start_line``/``end_line`` and handed to
    split_content_preserving_code_blocks().

    Returns:
        ``[chunk]`` when the chunk fits in MAX_CHUNK_TOKENS or could not be
        split into more than one piece; otherwise one "<type>_part" chunk
        per piece, with consecutive line ranges starting at
        ``chunk.start_line``, ``depth`` one deeper than the parent and the
        parent's ``level``.
    """
    if chunk.token_count <= MAX_CHUNK_TOKENS:
        return [chunk]

    # Slice the chunk's own lines out of the full document.
    source_lines = rst_content.split('\n')
    region = '\n'.join(source_lines[chunk.start_line - 1:chunk.end_line])

    pieces = split_content_preserving_code_blocks(region, MAX_CHUNK_TOKENS)
    if len(pieces) <= 1:
        # Nothing gained by splitting; keep the original chunk.
        return [chunk]

    parts = []
    next_start = chunk.start_line

    for part_number, piece in enumerate(pieces, start=1):
        line_count = len(piece.split('\n'))
        parts.append(RSTChunk(
            start_line=next_start,
            end_line=next_start + line_count - 1,
            content=piece,
            node_type=f"{chunk.node_type}_part",
            name=f"{chunk.name}_part_{part_number}",
            depth=chunk.depth + 1,
            level=chunk.level
        ))
        # The following part begins right after this one.
        next_start += line_count

    return parts


# =============================================================================
# FILE PROCESSING
# =============================================================================

def process_rst_file(file_path: Path) -> List[RSTChunk]:
    """Run the whole chunking pipeline on one RST file.

    Steps: parse the file, list its references, extract semantic units,
    build chunks, group small chunks (TARGET_TOKENS) and sub-chunk anything
    still above MAX_CHUNK_TOKENS. Progress is printed along the way.

    Returns:
        The final chunks, or an empty list when the parsed document has no
        children or when any step raises (the error is printed, not
        propagated).
    """
    print(f"\n🔍 Processing: {file_path.name}")

    try:
        document, rst_content = parse_rst_file(file_path)

        if not document.children:
            print("⚠️ Empty document")
            return []
        print("✅ Successfully parsed RST structure")

        # References play the role imports play in the code chunkers.
        references = extract_references(document, rst_content.split('\n'))
        if references:
            print(f"📎 Found {len(references)} references/includes")
            for ref in references:
                print(f"  - {ref.reference_type}: {ref.target}")

        semantic_nodes = extract_semantic_chunks(document, rst_content)
        print(f"Found {len(semantic_nodes)} semantic units")

        # Report each unit with a short one-line preview.
        for unit in semantic_nodes:
            preview = unit['content'][:100].replace('\n', ' ').strip()
            print(f"  - {unit['type']}: {unit['name']} ({count_tokens(unit['content'])} tokens)")
            print(f"    Preview: {preview}...")

        grouped = group_small_chunks(
            create_rst_chunks(semantic_nodes),
            target_tokens=TARGET_TOKENS,
        )
        print(f"Created {len(grouped)} semantic chunks")

        final_chunks = []
        oversized_count = 0

        for chunk in grouped:
            if chunk.token_count <= MAX_CHUNK_TOKENS:
                final_chunks.append(chunk)
                continue

            print(f"  Sub-chunking {chunk.name} ({chunk.token_count} tokens)")
            final_chunks.extend(sub_chunk_by_lines(chunk, rst_content))
            oversized_count += 1

        if oversized_count:
            print(f"Sub-chunked {oversized_count} oversized chunks")
        print(f"Final result: {len(final_chunks)} total chunks")

        return final_chunks

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")
        return []


# =============================================================================
# OUTPUT GENERATION
# =============================================================================

def generate_unique_id(length: int = 6) -> str:
    """Return ``length`` random characters drawn from a-z and 0-9.

    Characters are picked with ``secrets.choice``.
    """
    pool = string.ascii_lowercase + string.digits
    # One independent draw from the pool per output character.
    return ''.join(map(secrets.choice, [pool] * length))


def create_chunk_filename(original_filename: str, chunk_number: int, unique_id: str) -> str:
    """Build the output file name for one chunk.

    The chunk number is zero-padded to at least three digits, e.g.
    ``index.rst_chunk_001_a1s2d3.md``.
    """
    return "{}_chunk_{:03d}_{}.md".format(original_filename, chunk_number, unique_id)


def create_chunk_markdown(chunk: RSTChunk, source_file_path: str, references: List[RSTReference]) -> str:
    """Render one chunk as a markdown document with YAML frontmatter.

    String values in the frontmatter are written as JSON strings, which are
    also valid YAML double-quoted scalars. This keeps the frontmatter
    parseable when a chunk name or path contains quotes or backslashes
    (for example a Windows path). The reference list is written as a JSON
    array for the same reason. For values without special characters the
    quoted scalars are identical to plain ``"value"`` quoting.

    Args:
        chunk: Chunk to render.
        source_file_path: Path recorded as ``file_path`` in the frontmatter.
        references: All references of the source file; only those whose
            ``line_number`` lies within the chunk's line range are listed.

    Returns:
        The complete markdown text: frontmatter, a heading with the chunk
        name, a short metadata section and the chunk content in an ``rst``
        code fence.
    """
    import json  # local import so this function carries its own dependency

    def yaml_quote(value) -> str:
        """Return ``value`` as a double-quoted, properly escaped scalar."""
        return json.dumps(str(value), ensure_ascii=False)

    unique_id = generate_unique_id()

    # Keep only the references located inside this chunk's line range.
    chunk_references = [
        f"{ref.reference_type}: {ref.target}"
        for ref in references
        if chunk.start_line <= ref.line_number <= chunk.end_line
    ]

    document_lines = [
        "---",
        f"file_path: {yaml_quote(source_file_path)}",
        f"chunk_id: {yaml_quote(unique_id)}",
        f"chunk_type: {yaml_quote(chunk.node_type)}",
        f"chunk_name: {yaml_quote(chunk.name)}",
        f"start_line: {chunk.start_line}",
        f"end_line: {chunk.end_line}",
        f"token_count: {chunk.token_count}",
        f"depth: {chunk.depth}",
        f"level: {chunk.level}",
        'language: "rst"',
        f"references: {json.dumps(chunk_references, ensure_ascii=False)}",
        "---",
        "",
        f"# {chunk.name}",
        "",
        # The two trailing spaces are markdown hard line breaks.
        f"**Type:** {chunk.node_type}  ",
        f"**Tokens:** {chunk.token_count}  ",
        f"**Depth:** {chunk.depth}  ",
        f"**Level:** {chunk.level}",
        "",
        "```rst",
        f"{chunk.content}",
        "```",
    ]
    return "\n".join(document_lines) + "\n"


def save_chunks_to_files(chunks: List[RSTChunk], 
                        original_file_path: Path, 
                        input_directory: Path,
                        output_base: Path,
                        references: List[RSTReference]) -> List[str]:
    """Write each chunk to its own markdown file, mirroring the source tree.

    The output directory is ``output_base`` plus the directory of the source
    file relative to ``input_directory``. When the source file does not live
    under ``input_directory`` the files are written directly into
    ``output_base``.

    Args:
        chunks: Chunks of one source file, in order.
        original_file_path: Path of the source file.
        input_directory: Root directory the relative path is computed from.
        output_base: Root directory for the generated files.
        references: References of the source file, passed on to
            create_chunk_markdown().

    Returns:
        Paths (as strings) of the files that were written successfully.
        A file that cannot be written is reported and skipped.
    """
    if not chunks:
        return []

    try:
        rel_path = original_file_path.relative_to(input_directory)
    except ValueError:
        # Not under the input directory: fall back to the bare file name.
        # It must stay a Path because ``rel_path.parent`` is used below.
        rel_path = Path(original_file_path.name)

    output_dir = output_base / rel_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # All chunks of one source file share the same id in their file names.
    file_unique_id = generate_unique_id()

    saved_files = []

    for i, chunk in enumerate(chunks, 1):
        chunk_filename = create_chunk_filename(
            original_file_path.name,
            i,
            file_unique_id
        )
        markdown_content = create_chunk_markdown(
            chunk,
            str(rel_path),
            references
        )

        chunk_file_path = output_dir / chunk_filename
        try:
            chunk_file_path.write_text(markdown_content, encoding='utf-8')
        except (OSError, UnicodeError) as e:
            print(f"  ❌ Error saving {chunk_filename}: {e}")
        else:
            saved_files.append(str(chunk_file_path))
            print(f"  ✅ Saved: {chunk_filename}")

    return saved_files


def print_chunk_summary(chunks: List[RSTChunk], file_name: str):
    """Print a numbered overview of ``chunks``.

    Each entry is indented by two spaces per ``depth`` level and shows the
    chunk name, its type/level/line count/token count and a preview of at
    most 200 characters of content (newlines shown as spaces).
    """
    print(f"\n--- RST Chunk Summary for {file_name} ---")

    for position, chunk in enumerate(chunks, start=1):
        indent = "  " * chunk.depth
        content_lines = chunk.content.count('\n') + 1

        preview = chunk.content[:200].replace('\n', ' ').strip()
        if len(chunk.content) > 200:
            # Mark previews that were cut off.
            preview += "..."

        print(f"{indent}{position}. {chunk.name}")
        print(f"{indent}   Type: {chunk.node_type} | Level: {chunk.level} | Lines: {content_lines} | Tokens: {chunk.token_count}")
        print(f"{indent}   Preview: {preview}")


def save_chunks_as_markdown(chunks: List[RSTChunk], output_dir: str = "rst_chunks") -> None:
    """Save chunks as markdown files (notebook version).

    Files are named ``chunk_<NNN>_<id>.md`` where ``<id>`` is one random id
    shared by all files of this call. The source path recorded in each file
    is the fixed name "notebook_content.rst" and no references are attached.

    Args:
        chunks: Chunks to write, in order.
        output_dir: Target directory; it is created together with any
            missing parent directories.

    A file that cannot be written is reported and skipped; the number of
    files written is printed at the end.
    """
    output_path = Path(output_dir)
    # parents=True so a nested target such as "out/rst_chunks" also works.
    output_path.mkdir(parents=True, exist_ok=True)

    file_unique_id = generate_unique_id()
    saved_count = 0

    print(f"\n💾 Saving {len(chunks)} chunks to {output_path}/")

    for i, chunk in enumerate(chunks, 1):
        chunk_filename = f"chunk_{i:03d}_{file_unique_id}.md"
        markdown_content = create_chunk_markdown(chunk, "notebook_content.rst", [])

        chunk_file_path = output_path / chunk_filename
        try:
            chunk_file_path.write_text(markdown_content, encoding='utf-8')
        except (OSError, UnicodeError) as e:
            print(f"  ❌ Error saving {chunk_filename}: {e}")
        else:
            print(f"  ✅ Saved: {chunk_filename}")
            saved_count += 1

    print(f"✅ Successfully saved {saved_count} chunk files")


# =============================================================================
# JUPYTER NOTEBOOK FUNCTIONS
# =============================================================================

def process_rst_content(rst_content: str, file_name: str = "content.rst") -> List[RSTChunk]:
    """
    Chunk RST text that is already in memory (Jupyter entry point).

    The text is passed through remove_include_directives(), parsed with the
    docutils RST parser and then run through the same steps as file
    processing: semantic extraction, chunk creation, grouping of small
    chunks and sub-chunking of oversized ones.

    Args:
        rst_content: Raw RST content as string
        file_name: Name to use for the content (for display purposes)

    Returns:
        List of RSTChunk objects; empty when the parsed document has no
        children or when processing raises (the traceback is printed).

    NOTE(review): parsing uses the include-stripped text, while semantic
    extraction and sub-chunking receive the original ``rst_content``.
    Confirm that remove_include_directives() keeps line numbers aligned.
    """
    try:
        # Includes are removed before parsing.
        rst_input = remove_include_directives(rst_content)

        from docutils.frontend import get_default_settings

        settings = get_default_settings(Parser)
        # docutils levels: 1 info, 2 warning, 3 error, 4 severe, 5 none.
        settings.report_level = 4  # report only "severe" system messages
        settings.halt_level = 5    # never turn system messages into exceptions
        # NOTE(review): docutils documents None as "write to sys.stderr";
        # confirm whether False (discard output) was intended here.
        settings.warning_stream = None

        document = new_document(file_name, settings=settings)
        rst_parser = Parser()

        try:
            rst_parser.parse(rst_input, document)
            print(f"✅ Successfully parsed RST structure for {file_name}")
        except Exception as parse_error:
            # Keep going with whatever part of the tree was built.
            print(f"⚠️ Parse error in {file_name}: {parse_error}")

        if not document.children:
            print(f"⚠️ Empty document: {file_name}")
            return []

        # References come from the original text, includes still present.
        references = extract_references_from_source(rst_content)
        if references:
            print(f"📎 Found {len(references)} references/includes")
            for ref in references:
                print(f"  - {ref.reference_type}: {ref.target}")

        semantic_nodes = extract_semantic_chunks(document, rst_content)
        print(f"Found {len(semantic_nodes)} semantic units")

        # Report each unit with a short one-line preview.
        for unit in semantic_nodes:
            preview = unit['content'][:100].replace('\n', ' ').strip()
            print(f"  - {unit['type']}: {unit['name']} ({count_tokens(unit['content'])} tokens)")
            print(f"    Preview: {preview}...")

        grouped = group_small_chunks(
            create_rst_chunks(semantic_nodes),
            target_tokens=TARGET_TOKENS,
        )
        print(f"Created {len(grouped)} semantic chunks")

        final_chunks = []
        oversized_count = 0

        for chunk in grouped:
            if chunk.token_count <= MAX_CHUNK_TOKENS:
                final_chunks.append(chunk)
                continue

            print(f"  Sub-chunking {chunk.name} ({chunk.token_count} tokens)")
            final_chunks.extend(sub_chunk_by_lines(chunk, rst_content))
            oversized_count += 1

        if oversized_count:
            print(f"Sub-chunked {oversized_count} oversized chunks")
        print(f"Final result: {len(final_chunks)} total chunks")

        return final_chunks

    except Exception as e:
        print(f"❌ Error processing RST content: {e}")
        import traceback
        traceback.print_exc()
        return []


# =============================================================================
# MAIN PROCESSING
# =============================================================================

def main():
    """Interactive entry point: chunk every RST file below a directory.

    Asks for a source directory (default: current directory), collects
    ``*.rst`` and ``*.txt`` files whose first 1000 characters contain an
    RST-looking marker, asks for confirmation, and then processes each file
    and writes its chunks to ``<source>_rst_chunks`` next to the source
    directory. A failure in one file is reported and does not stop the run.
    """
    print("🚀 RST (reStructuredText) Semantic Chunking")
    print(f"Max chunk tokens: {MAX_CHUNK_TOKENS}")
    print(f"Target tokens for grouping: {TARGET_TOKENS}")

    directory = input("\nEnter source directory path (or press Enter for current directory): ").strip() or "."

    target_dir = Path(directory).resolve()
    if not target_dir.is_dir():
        print(f"❌ Directory not found: {directory}")
        return

    # Output goes to a sibling of the source directory.
    output_dir = target_dir.parent / f"{target_dir.name}_rst_chunks"
    output_dir.mkdir(exist_ok=True)
    print(f"📁 Output directory: {output_dir}")

    candidates = []
    for pattern in ('*.rst', '*.txt'):
        candidates.extend(target_dir.rglob(pattern))

    # Keep only files that look like RST: a cheap heuristic on the first
    # 1000 characters.
    rst_markers = ('===', '---', '~~~', '^^^', '.. ', '::')
    rst_files = []
    for candidate in candidates:
        try:
            with open(candidate, 'r', encoding='utf-8') as f:
                head = f.read(1000)
        except (OSError, UnicodeDecodeError):
            # Unreadable or non-UTF-8 file: not something we can chunk.
            continue
        if any(marker in head for marker in rst_markers):
            rst_files.append(candidate)

    if not rst_files:
        print(f"❌ No RST files found in {directory}")
        return

    print(f"🔍 Found {len(rst_files)} RST file(s)")

    # Group by directory for display; files directly in the source
    # directory are listed under '.'.
    by_dir = {}
    for f in rst_files:
        dir_path = str(f.parent.relative_to(target_dir))
        by_dir.setdefault(dir_path, []).append(f)

    for dir_path, files in sorted(by_dir.items()):
        print(f"  📂 {dir_path}:")
        for f in files:
            print(f"    - {f.name}")

    proceed = input(f"\nProcess all {len(rst_files)} files? (y/n): ").strip().lower()
    if proceed != 'y':
        print("❌ Processing cancelled")
        return

    total_chunks = 0
    processed_files = 0

    for file_path in rst_files:
        try:
            chunks = process_rst_file(file_path)

            if chunks:
                references = extract_references_from_source(file_path.read_text(encoding='utf-8'))
                save_chunks_to_files(
                    chunks, file_path, target_dir, output_dir, references
                )

                total_chunks += len(chunks)
                processed_files += 1

                print_chunk_summary(chunks, file_path.name)
            else:
                print(f"⚠️ No chunks generated for {file_path.name}")

        except Exception as e:
            # Per-file boundary: report and continue with the next file.
            print(f"❌ Error processing {file_path}: {e}")

    print(f"\n🎉 Processing complete!")
    print(f"✅ Files processed: {processed_files}/{len(rst_files)}")
    print(f"📄 Total chunks created: {total_chunks}")
    print(f"📁 Output directory: {output_dir}")


# Entry point: start the interactive chunker only when this module is run
# as the main program, not when it is imported.
if __name__ == "__main__":
    main()

t

In [13]:
#!/usr/bin/env python3
"""
Tree-sitter RST Chunker

A clean, robust approach to RST chunking using tree-sitter instead of docutils.
No warning injection, clean error handling, syntax-aware chunking.
"""

import sys
import string
import secrets
import tiktoken
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple

# Optional dependency: try to load the tree-sitter RST grammar.
# RST_LANGUAGE ends up as the loaded language, or as None when the package is
# missing or the grammar cannot be loaded; code that checks RST_LANGUAGE then
# uses text-based chunking instead.
try:
    from tree_sitter_language_pack import get_language, get_parser
    
    # Get RST language from language pack
    try:
        # NOTE(review): `tsx_language` is bound here as well but is not
        # referenced in the visible part of this module - it looks like a
        # leftover name; confirm before removing.
        RST_LANGUAGE = tsx_language = get_language('rst')
        print("✅ Found RST language in tree-sitter-language-pack")
    except Exception as e:
        print(f"⚠️ Could not load RST from language pack: {e}")
        print("Will use fallback text-based chunking")
        RST_LANGUAGE = None
        
except ImportError:
    print("❌ tree-sitter-language-pack not installed. Install with: pip install tree-sitter-language-pack")
    print("Will use fallback text-based chunking")
    RST_LANGUAGE = None


# =============================================================================
# CONFIGURATION AND CONSTANTS
# =============================================================================

# Chunks whose token count is above this limit are sub-chunked.
MAX_CHUNK_TOKENS = 1000
# Token budget passed to group_small_chunks() when merging small chunks.
TARGET_TOKENS = 600


# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class RSTChunk:
    """Represents a semantic chunk of RST content.

    The chunk's position is stored as ``start_byte``/``end_byte`` offsets
    into the source. ``token_count`` is computed from ``content`` after
    construction whenever it is left at (or passed as) 0.
    """
    start_byte: int  # offset of the chunk start in the source
    end_byte: int  # offset of the chunk end in the source
    content: str  # text of the chunk
    node_type: str  # kind of chunk, e.g. 'section' or 'content_block'
    name: str  # display name of the chunk
    depth: int  # nesting depth assigned by the code that produced the chunk
    level: int = 0  # hierarchy/priority level assigned by the producer
    token_count: int = 0  # 0 means "not counted yet"; filled in by __post_init__
    
    def __post_init__(self) -> None:
        # Count tokens lazily so callers do not have to supply the value.
        if self.token_count == 0:
            self.token_count = count_tokens(self.content)


@dataclass
class RSTReference:
    """Represents a reference/include found in RST source."""
    reference_type: str  # kind of reference, e.g. 'reference', 'link', 'image', 'include'
    target: str  # text of the reference
    start_byte: int  # offset of the reference start in the source
    end_byte: int  # offset of the reference end in the source


# =============================================================================
# TOKEN COUNTING
# =============================================================================

def count_tokens(content: str) -> int:
    """Return the number of GPT-4 tokens in ``content``.

    Uses the tiktoken encoding for the "gpt-4" model. If tokenizing fails
    for any reason, the count is estimated as one token per four characters.
    """
    try:
        gpt4_encoding = tiktoken.encoding_for_model("gpt-4")
        token_ids = gpt4_encoding.encode(content)
    except Exception:
        # Rough estimate when the tokenizer cannot be used.
        return len(content) // 4
    return len(token_ids)


# =============================================================================
# TREE-SITTER PARSING
# =============================================================================

def parse_rst_with_tree_sitter(content: str) -> Optional[object]:
    """Parse RST text with tree-sitter.

    The parser comes from ``get_parser('rst')`` of the language pack, which
    returns a parser already configured for the RST grammar. This module
    imports no tree-sitter ``Parser`` class of its own, so constructing one
    by name here would not refer to tree-sitter.

    Args:
        content: RST source text; it is parsed as UTF-8 bytes.

    Returns:
        The parse tree, or None when the RST grammar is not available or
        parsing raises. A tree that contains error nodes is still returned;
        a notice is printed in that case.
    """
    if not RST_LANGUAGE:
        return None

    try:
        # RST_LANGUAGE being set implies the language pack import succeeded,
        # so get_parser is available here.
        parser = get_parser('rst')

        tree = parser.parse(content.encode('utf-8'))

        # Error nodes live in the tree only; the source text is untouched.
        if tree.root_node.has_error:
            print("⚠️ Parse tree contains error nodes (but content stays clean)")

        return tree
    except Exception as e:
        print(f"❌ Tree-sitter parsing failed: {e}")
        return None


def extract_node_text(node: object, source_bytes: bytes) -> str:
    """Return the source text covered by ``node``.

    The bytes between ``node.start_byte`` and ``node.end_byte`` are decoded
    as UTF-8; bytes that cannot be decoded are dropped.
    """
    covered = source_bytes[node.start_byte:node.end_byte]
    return covered.decode('utf-8', errors='ignore')


def find_rst_sections(node: object, source_bytes: bytes, depth: int = 0) -> List[Dict[str, Any]]:
    """Collect section-like and directive-like nodes below ``node``.

    The tree is walked in pre-order (a node before its children, children
    left to right). ``depth`` is the depth assigned to ``node``; each level
    of children adds one.

    Returns:
        One dict per matching node with the keys 'type', 'name',
        'start_byte', 'end_byte', 'content', 'depth' and 'level'.
        Nodes of type section/title/heading are reported as 'section' with
        their first line (at most 50 characters) as name and
        ``level = depth + 1``; nodes of type directive/admonition/code_block
        are reported as '<type>_directive' with their first line as name and
        the fixed level 8.
    """
    section_types = ('section', 'title', 'heading')
    directive_types = ('directive', 'admonition', 'code_block')

    found = []
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped in their original order.
    pending = [(node, depth)]

    while pending:
        current, current_depth = pending.pop()
        kind = current.type

        if kind in section_types:
            text = extract_node_text(current, source_bytes)
            found.append({
                'type': 'section',
                'name': text.strip().split('\n')[0][:50],
                'start_byte': current.start_byte,
                'end_byte': current.end_byte,
                'content': text,
                'depth': current_depth,
                'level': current_depth + 1
            })
        elif kind in directive_types:
            text = extract_node_text(current, source_bytes)
            found.append({
                'type': f'{kind}_directive',
                'name': text.split('\n')[0].strip(),
                'start_byte': current.start_byte,
                'end_byte': current.end_byte,
                'content': text,
                'depth': current_depth,
                'level': 8  # directives rank below sections
            })

        for child in list(current.children)[::-1]:
            pending.append((child, current_depth + 1))

    return found


def extract_references_tree_sitter(node: object, source_bytes: bytes) -> List[RSTReference]:
    """Collect reference-like nodes below ``node``.

    The tree is walked in pre-order. Every node of type 'reference', 'link',
    'image' or 'include' yields one RSTReference whose ``reference_type`` is
    the node type, whose ``target`` is the node's stripped source text and
    whose byte range is the node's range.
    """
    reference_types = ('reference', 'link', 'image', 'include')

    collected = []
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped in their original order.
    pending = [node]

    while pending:
        current = pending.pop()

        if current.type in reference_types:
            text = extract_node_text(current, source_bytes)
            collected.append(RSTReference(
                reference_type=current.type,
                target=text.strip(),
                start_byte=current.start_byte,
                end_byte=current.end_byte
            ))

        pending.extend(list(current.children)[::-1])

    return collected


# =============================================================================
# FALLBACK TEXT-BASED CHUNKING
# =============================================================================

def chunk_rst_by_text_structure(content: str) -> List[Dict[str, Any]]:
    """
    Fallback chunking when tree-sitter isn't available.
    Uses text patterns to identify RST structures.

    The text is cut at section headers (a non-empty line followed by a line
    made only of adornment characters that is at least 80% as long) and at
    directive lines (``.. name::``):

    * A header and its underline form a 'section' chunk named after the
      title.
    * The lines between two cut points form a 'content_block' chunk; a
      directive line starts the block it belongs to. Blocks that contain
      only whitespace are dropped.

    ``start_byte``/``end_byte`` are UTF-8 byte offsets into ``content``, the
    same unit tree-sitter nodes use, and never exceed the encoded length.

    Args:
        content: RST source text.

    Returns:
        Chunk dicts in document order with the keys 'type', 'name',
        'start_byte', 'end_byte', 'content', 'depth' and 'level'.
    """
    adornment_chars = '=-~^"\'`#*+<>'
    lines = content.split('\n')
    total_bytes = len(content.encode('utf-8'))

    # line_offsets[k] is the byte offset at which line k starts; computed
    # once instead of re-summing line lengths for every chunk.
    line_offsets = [0]
    for text in lines:
        line_offsets.append(line_offsets[-1] + len(text.encode('utf-8')) + 1)

    def offset_of(line_index: int) -> int:
        """Byte offset of the start of a line, clamped to the content size."""
        return min(line_offsets[line_index], total_bytes)

    chunks = []

    def add_content_block(block_lines, first_line: int, end_byte: int) -> None:
        """Append a content_block chunk unless it is whitespace only."""
        block_text = '\n'.join(block_lines)
        if block_text.strip():
            chunks.append({
                'type': 'content_block',
                'name': f'content_block_{len(chunks) + 1}',
                'start_byte': offset_of(first_line),
                'end_byte': end_byte,
                'content': block_text,
                'depth': 0,
                'level': 1
            })

    pending = []
    pending_start = 0
    skip_underline = False

    for i, line in enumerate(lines):
        if skip_underline:
            # This line is the underline of the header handled just before;
            # it already belongs to that section chunk.
            skip_underline = False
            continue

        stripped = line.strip()
        next_stripped = lines[i + 1].strip() if i < len(lines) - 1 else ''

        is_section_header = bool(
            stripped and next_stripped
            and all(c in adornment_chars for c in next_stripped)
            and len(next_stripped) >= len(stripped) * 0.8
        )
        is_directive = stripped.startswith('.. ') and '::' in stripped

        # A header or directive closes the block collected so far.
        if (is_section_header or is_directive) and pending:
            add_content_block(pending, pending_start, offset_of(i))
            pending = []
            pending_start = i

        if is_section_header:
            chunks.append({
                'type': 'section',
                'name': stripped,
                'start_byte': offset_of(i),
                'end_byte': offset_of(i + 2),  # Include underline
                'content': line + '\n' + lines[i + 1],
                'depth': 0,
                'level': 1
            })
            pending = []
            pending_start = i + 2
            skip_underline = True
            continue

        pending.append(line)

    # Whatever is left after the last cut point.
    if pending:
        add_content_block(pending, pending_start, total_bytes)

    return chunks


# =============================================================================
# MAIN PROCESSING
# =============================================================================

def process_rst_content_tree_sitter(rst_content: str, file_name: str = "content.rst") -> List[RSTChunk]:
    """
    Turn raw RST text into a list of token-bounded RSTChunk objects.

    Stages, in order:
      1. Collect structural units from the tree-sitter parse tree when
         RST_LANGUAGE is set and parsing succeeds.
      2. Fall back to text-pattern chunking when stage 1 produced nothing.
      3. Wrap every unit in an RSTChunk and merge small neighbours.
      4. Split any chunk whose token count exceeds MAX_CHUNK_TOKENS.
      5. Count chunks containing docutils-style system-message markers and
         report the result.

    Args:
        rst_content: Full RST source text.
        file_name: Name used only in progress messages.

    Returns:
        The final list of chunks after grouping and sub-chunking.
    """
    print(f"🌳 Processing {file_name} with tree-sitter approach")

    # Stage 1: tree-sitter, only attempted when a grammar was loaded.
    semantic_nodes = []
    tree = parse_rst_with_tree_sitter(rst_content) if RST_LANGUAGE else None
    if tree:
        source_bytes = rst_content.encode('utf-8')
        semantic_nodes = find_rst_sections(tree.root_node, source_bytes)
        # References are only counted for the progress message here.
        references = extract_references_tree_sitter(tree.root_node, source_bytes)
        print(f"✅ Tree-sitter found {len(semantic_nodes)} semantic units")
        print(f"📎 Found {len(references)} references")

    # Stage 2: text heuristics when the parse tree gave us nothing.
    if not semantic_nodes:
        print("📝 Falling back to text-based structure detection")
        semantic_nodes = chunk_rst_by_text_structure(rst_content)
        print(f"✅ Text-based chunking found {len(semantic_nodes)} chunks")

    # Stage 3: dict records -> RSTChunk objects, then merge small ones.
    chunks = [
        RSTChunk(
            start_byte=info['start_byte'],
            end_byte=info['end_byte'],
            content=info['content'],
            node_type=info['type'],
            name=info['name'],
            depth=info['depth'],
            level=info['level'],
        )
        for info in semantic_nodes
    ]
    chunks = group_small_chunks(chunks, target_tokens=TARGET_TOKENS)
    print(f"Created {len(chunks)} grouped chunks")

    # Stage 4: anything still above the hard limit is split line-wise.
    final_chunks = []
    for chunk in chunks:
        if chunk.token_count <= MAX_CHUNK_TOKENS:
            final_chunks.append(chunk)
            continue
        print(f"  📦 Sub-chunking {chunk.name} ({chunk.token_count} tokens)")
        final_chunks.extend(sub_chunk_by_bytes(chunk, rst_content))

    print(f"Final result: {len(final_chunks)} clean chunks (no warnings injected!)")

    # Stage 5: look for docutils system-message markers in the output.
    warning_patterns = ['(WARNING/', '(ERROR/', '(INFO/', '(SEVERE/']
    contaminated_chunks = sum(
        1
        for chunk in final_chunks
        if any(pattern in chunk.content for pattern in warning_patterns)
    )

    if contaminated_chunks == 0:
        print("✅ All chunks verified clean - no docutils warnings!")
    else:
        print(f"⚠️ {contaminated_chunks} chunks may contain warnings")

    return final_chunks


def group_small_chunks(chunks: List[RSTChunk], target_tokens: int = 600) -> List[RSTChunk]:
    """
    Merge consecutive chunks so that groups approach ``target_tokens``.

    If the whole input fits in MAX_CHUNK_TOKENS, a single
    ``combined_document`` chunk is returned. Otherwise chunks are
    accumulated in order; a group is closed when adding the next chunk
    would exceed ``target_tokens`` and that next chunk is itself no larger
    than MAX_CHUNK_TOKENS. Contents are joined with a blank line.

    Args:
        chunks: Chunks in document order.
        target_tokens: Soft size limit for a merged group.

    Returns:
        The merged chunks (the input list itself when it is empty).
    """
    if not chunks:
        return chunks

    def collapse(members):
        # A group of one is passed through untouched; larger groups become
        # a synthetic "grouped_sections" chunk spanning all members.
        if len(members) == 1:
            return members[0]
        return RSTChunk(
            start_byte=min(m.start_byte for m in members),
            end_byte=max(m.end_byte for m in members),
            content="\n\n".join(m.content for m in members),
            node_type="grouped_sections",
            name=f"sections_{members[0].name}_to_{members[-1].name}",
            depth=min(m.depth for m in members),
            level=min(m.level for m in members),
        )

    # Small documents are emitted as one chunk.
    if sum(c.token_count for c in chunks) <= MAX_CHUNK_TOKENS:
        return [RSTChunk(
            start_byte=min(c.start_byte for c in chunks),
            end_byte=max(c.end_byte for c in chunks),
            content="\n\n".join(c.content for c in chunks),
            node_type="combined_document",
            name="complete_document",
            depth=0,
            level=0,
        )]

    merged = []
    pending = []
    pending_tokens = 0

    for chunk in chunks:
        would_overflow = pending_tokens + chunk.token_count > target_tokens
        if would_overflow and pending and chunk.token_count <= MAX_CHUNK_TOKENS:
            merged.append(collapse(pending))
            pending = [chunk]
            pending_tokens = chunk.token_count
        else:
            pending.append(chunk)
            pending_tokens += chunk.token_count

    if pending:
        merged.append(collapse(pending))

    return merged


def sub_chunk_by_bytes(chunk: RSTChunk, rst_content: str) -> List[RSTChunk]:
    """
    Split an oversized chunk into consecutive, line-aligned parts.

    Lines of ``chunk.content`` are accumulated until the sum of their
    individual token counts would exceed MAX_CHUNK_TOKENS; the accumulated
    lines are then emitted as one part. Whitespace-only parts are dropped.

    Byte spans: each non-final part starts where the previous one ended
    (plus the separating newline), measured in UTF-8 bytes from
    ``chunk.start_byte``. The final part is anchored to ``chunk.end_byte``.
    Spans are exact only when ``chunk.content`` equals the source slice
    ``[start_byte, end_byte)``.

    Args:
        chunk: The chunk to split.
        rst_content: Full source text. Accepted for interface compatibility;
            not used by the current implementation.

    Returns:
        The parts in order, ``[chunk]`` if it is already small enough or if
        no non-blank part could be produced.
    """
    if chunk.token_count <= MAX_CHUNK_TOKENS:
        return [chunk]

    sub_chunks = []
    current_lines = []
    current_tokens = 0
    part_num = 1
    # Byte position (within the chunk's span) of current_lines[0].
    offset = chunk.start_byte

    for line in chunk.content.split('\n'):
        line_tokens = count_tokens(line)

        if current_tokens + line_tokens > MAX_CHUNK_TOKENS and current_lines:
            sub_content = '\n'.join(current_lines)
            sub_size = len(sub_content.encode('utf-8'))
            if sub_content.strip():
                sub_chunks.append(RSTChunk(
                    start_byte=offset,
                    end_byte=offset + sub_size,
                    content=sub_content,
                    node_type=f"{chunk.node_type}_part",
                    name=f"{chunk.name}_part_{part_num}",
                    depth=chunk.depth + 1,
                    level=chunk.level
                ))
                part_num += 1

            # Advance past this part and the newline that separated it from
            # the next one, even when the part itself was blank and skipped.
            offset += sub_size + 1
            current_lines = [line]
            current_tokens = line_tokens
        else:
            current_lines.append(line)
            current_tokens += line_tokens

    if current_lines:
        sub_content = '\n'.join(current_lines)
        if sub_content.strip():
            sub_chunks.append(RSTChunk(
                start_byte=chunk.end_byte - len(sub_content.encode('utf-8')),
                end_byte=chunk.end_byte,
                content=sub_content,
                node_type=f"{chunk.node_type}_part",
                name=f"{chunk.name}_part_{part_num}",
                depth=chunk.depth + 1,
                level=chunk.level
            ))

    return sub_chunks if sub_chunks else [chunk]


# =============================================================================
# FILE PROCESSING AND DIRECTORY HANDLING
# =============================================================================

def process_rst_file(file_path: Path) -> List[RSTChunk]:
    """
    Read one RST file (UTF-8) and chunk its content.

    Any exception raised while reading or chunking is reported on stdout
    and turned into an empty result, so one bad file does not stop a batch.

    Args:
        file_path: Path of the file to process.

    Returns:
        The chunks produced for the file, or ``[]`` on failure.
    """
    print(f"\n🔍 Processing: {file_path.name}")

    try:
        rst_content = file_path.read_text(encoding='utf-8')
        chunks = process_rst_content_tree_sitter(rst_content, file_path.name)

        outcome = (
            f"✅ Generated {len(chunks)} chunks for {file_path.name}"
            if chunks
            else f"⚠️ No chunks generated for {file_path.name}"
        )
        print(outcome)
        return chunks

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")
        return []


def save_chunks_to_files(chunks: List[RSTChunk], 
                        original_file_path: Path, 
                        input_directory: Path,
                        output_base: Path,
                        references: List[RSTReference]) -> List[str]:
    """
    Write each chunk to its own markdown file, mirroring the input layout.

    The output directory is ``output_base`` joined with the directory part
    of the file's path relative to ``input_directory``. Files are named
    ``<original name>_chunk_<NNN>_<id>.md`` where ``<id>`` is one random id
    shared by all chunks of this call.

    Args:
        chunks: Chunks to save; an empty list is a no-op.
        original_file_path: Source file the chunks came from.
        input_directory: Root the relative path is computed against.
        output_base: Root directory for the generated files.
        references: References forwarded to create_chunk_markdown().

    Returns:
        Paths (as strings) of the files that were written successfully.
        Write failures are reported on stdout and skipped.
    """
    if not chunks:
        return []

    # Calculate relative path from input directory
    try:
        rel_path = original_file_path.relative_to(input_directory)
    except ValueError:
        # File is not under input_directory: fall back to the bare file
        # name. It must stay a Path because .parent is used just below.
        rel_path = Path(original_file_path.name)

    # Create output directory structure
    output_dir = output_base / rel_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # One id per source file keeps its chunk files grouped together.
    file_unique_id = generate_unique_id()

    saved_files = []

    for i, chunk in enumerate(chunks, 1):
        chunk_filename = f"{original_file_path.name}_chunk_{i:03d}_{file_unique_id}.md"

        markdown_content = create_chunk_markdown(
            chunk,
            str(rel_path),
            references
        )

        chunk_file_path = output_dir / chunk_filename
        try:
            with open(chunk_file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            saved_files.append(str(chunk_file_path))
            print(f"  ✅ Saved: {chunk_filename}")

        except Exception as e:
            # Best effort: report and continue with the remaining chunks.
            print(f"  ❌ Error saving {chunk_filename}: {e}")

    return saved_files


def print_chunk_summary(chunks: List[RSTChunk], file_name: str):
    """
    Print a human-readable overview of the chunks of one file.

    For every chunk this prints its name, a line with type, level, line
    count and token count, and a preview of up to 200 characters of its
    content with newlines replaced by spaces. Output is indented by two
    spaces per ``chunk.depth``.
    """
    print(f"\n--- RST Chunk Summary for {file_name} ---")

    for position, chunk in enumerate(chunks, start=1):
        pad = "  " * chunk.depth
        # Number of lines == number of newline separators + 1.
        line_total = chunk.content.count('\n') + 1

        print(f"{pad}{position}. {chunk.name}")
        print(f"{pad}   Type: {chunk.node_type} | Level: {chunk.level} | Lines: {line_total} | Tokens: {chunk.token_count}")

        # Single-line preview, with an ellipsis when the content was cut.
        snippet = chunk.content[:200].replace('\n', ' ').strip()
        suffix = "..." if len(chunk.content) > 200 else ""
        print(f"{pad}   Preview: {snippet}{suffix}")


# =============================================================================
# MAIN PROCESSING - SAME INTERFACE AS ORIGINAL
# =============================================================================

def main():
    """
    Interactive entry point: chunk every RST-looking file under a directory.

    Prompts for a source directory (default: current directory), collects
    ``*.rst`` and ``*.txt`` files recursively, keeps those whose first
    1000 characters contain an RST-like marker, asks for confirmation, and
    then writes the chunks of each file below a sibling directory named
    ``<source>_rst_chunks``. Progress and a final summary go to stdout.
    """
    print("🚀 RST (reStructuredText) Semantic Chunking with Tree-sitter")
    print(f"Max chunk tokens: {MAX_CHUNK_TOKENS}")
    print(f"Target tokens for grouping: {TARGET_TOKENS}")
    print("🌳 Using tree-sitter for clean, warning-free parsing")

    directory = input("\nEnter source directory path (or press Enter for current directory): ").strip()
    if not directory:
        directory = "."

    target_dir = Path(directory).resolve()
    if not target_dir.exists():
        print(f"❌ Directory not found: {directory}")
        return

    # Output goes next to the source directory, not inside it.
    output_dir = target_dir.parent / f"{target_dir.name}_rst_chunks"
    output_dir.mkdir(exist_ok=True)
    print(f"📁 Output directory: {output_dir}")

    input_directory = target_dir

    candidates = []
    for ext in ['*.rst', '*.txt']:
        candidates.extend(input_directory.rglob(ext))

    # Cheap content sniffing: keep files whose head contains RST markup.
    rst_markers = ['===', '---', '~~~', '^^^', '.. ', '::']
    rst_files = []
    for file in candidates:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                head = f.read(1000)
        except (OSError, UnicodeError):
            # Unreadable or non-UTF-8 file: skip it. Narrower than a bare
            # except so Ctrl-C still interrupts the scan.
            continue
        if any(marker in head for marker in rst_markers):
            rst_files.append(file)

    if not rst_files:
        print(f"❌ No RST files found in {directory}")
        return

    print(f"🔍 Found {len(rst_files)} RST file(s)")

    # Group by directory for display
    by_dir = {}
    for f in rst_files:
        dir_path = str(f.parent.relative_to(input_directory)) if f.parent != input_directory else '.'
        by_dir.setdefault(dir_path, []).append(f)

    for dir_path, files in sorted(by_dir.items()):
        print(f"  📂 {dir_path}:")
        for f in files:
            print(f"    - {f.name}")

    proceed = input(f"\nProcess all {len(rst_files)} files? (y/n): ").strip().lower()
    if proceed != 'y':
        print("❌ Processing cancelled")
        return

    total_chunks = 0
    processed_files = 0

    for file_path in rst_files:
        try:
            chunks = process_rst_file(file_path)

            if not chunks:
                print(f"⚠️ No chunks generated for {file_path.name}")
                continue

            # Reference extraction is optional metadata: any failure here
            # must not prevent the chunks from being saved.
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                references = extract_references_from_source(content)
            except Exception:
                references = []

            save_chunks_to_files(
                chunks, file_path, input_directory, output_dir, references
            )

            total_chunks += len(chunks)
            processed_files += 1

            print_chunk_summary(chunks, file_path.name)

        except Exception as e:
            print(f"❌ Error processing {file_path}: {e}")

    print("\n🎉 Processing complete!")
    print(f"✅ Files processed: {processed_files}/{len(rst_files)}")
    print(f"📄 Total chunks created: {total_chunks}")
    print(f"📁 Output directory: {output_dir}")
    print("🌳 All chunks generated with tree-sitter (no warning contamination!)")


def extract_references_from_source(rst_content: str) -> List[RSTReference]:
    """
    Find file-referencing directives by scanning the source line by line.

    Recognised directives: ``include``, ``literalinclude``, ``image``,
    ``figure`` and ``csv-table``. The text after the first ``::`` on the
    line is taken as the target.

    Each reference carries the UTF-8 byte span of the directive line
    (without surrounding whitespace) within ``rst_content``, assuming
    lines are separated by a single ``\\n``.

    Args:
        rst_content: Full RST source text.

    Returns:
        One RSTReference per matching line, in document order.
    """
    ref_types = ('include', 'literalinclude', 'image', 'figure', 'csv-table')
    references = []
    next_line_start = 0  # byte offset at which the upcoming line begins

    for raw_line in rst_content.split('\n'):
        line_start = next_line_start
        next_line_start += len(raw_line.encode('utf-8')) + 1  # +1 for '\n'

        line = raw_line.strip()
        if not line.startswith('..'):
            continue

        for ref_type in ref_types:
            if f'.. {ref_type}::' not in line:
                continue

            # The marker contains '::', so the split always yields 2 parts.
            target = line.split('::', 1)[1].strip()

            indent_chars = len(raw_line) - len(raw_line.lstrip())
            start_byte = line_start + len(raw_line[:indent_chars].encode('utf-8'))
            references.append(RSTReference(
                reference_type=ref_type,
                target=target,
                start_byte=start_byte,
                end_byte=start_byte + len(line.encode('utf-8'))
            ))
            break

    return references

def generate_unique_id(length: int = 6) -> str:
    """
    Return a random identifier of ``length`` characters.

    Characters are drawn independently from lowercase ASCII letters and
    digits using the ``secrets`` module.
    """
    pool = string.ascii_lowercase + string.digits
    picked = [secrets.choice(pool) for _ in range(length)]
    return ''.join(picked)


def create_chunk_markdown(chunk: RSTChunk, source_file_path: str, references: List[RSTReference]) -> str:
    """
    Render one chunk as a markdown document with YAML front matter.

    The front matter records the source path, a fresh random chunk id, the
    chunk's type, name, byte span, token count, depth and level, and the
    references whose ``start_byte`` lies within the chunk's byte span. The
    body is a heading, a short property list and the chunk content inside
    an ``rst`` code fence.

    Free-text values (path, type, name, references) are written as JSON
    strings so that quotes and backslashes in them cannot break the YAML.

    Args:
        chunk: The chunk to render.
        source_file_path: Path recorded as ``file_path``.
        references: Candidate references; only those inside the chunk's
            byte span are listed.

    Returns:
        The complete markdown text, ending with a newline.
    """
    import json  # local import: only needed for front-matter quoting

    def quoted(value) -> str:
        # A JSON string literal is a valid YAML double-quoted scalar.
        return json.dumps(str(value), ensure_ascii=False)

    unique_id = generate_unique_id()

    # Keep only references that start inside this chunk's byte span.
    chunk_references = [
        f"{ref.reference_type}: {ref.target}"
        for ref in references
        if chunk.start_byte <= ref.start_byte <= chunk.end_byte
    ]

    # Trailing double spaces on the property lines are markdown hard breaks.
    lines = [
        "---",
        f"file_path: {quoted(source_file_path)}",
        f'chunk_id: "{unique_id}"',
        f"chunk_type: {quoted(chunk.node_type)}",
        f"chunk_name: {quoted(chunk.name)}",
        f"start_byte: {chunk.start_byte}",
        f"end_byte: {chunk.end_byte}",
        f"token_count: {chunk.token_count}",
        f"depth: {chunk.depth}",
        f"level: {chunk.level}",
        'language: "rst"',
        f"references: {json.dumps(chunk_references, ensure_ascii=False)}",
        'parser: "tree-sitter"',
        "---",
        "",
        f"# {chunk.name}",
        "",
        f"**Type:** {chunk.node_type}  ",
        f"**Tokens:** {chunk.token_count}  ",
        f"**Depth:** {chunk.depth}  ",
        f"**Level:** {chunk.level}",
        "",
        "```rst",
        f"{chunk.content}",
        "```",
        "",
    ]
    return "\n".join(lines)


def save_chunks_as_markdown(chunks: List[RSTChunk], output_dir: str = "rst_chunks_tree_sitter") -> None:
    """
    Write every chunk to ``output_dir`` as ``chunk_<NNN>_<id>.md``.

    NOTE: a second function with the same name is defined further down in
    this cell; that later definition is the one in effect after the cell
    has run.

    Args:
        chunks: Chunks to write.
        output_dir: Target directory; created if missing (single level).
    """
    from pathlib import Path

    target_dir = Path(output_dir)
    target_dir.mkdir(exist_ok=True)

    # One random id shared by all files written in this call.
    run_id = generate_unique_id()
    written = 0

    print(f"\n💾 Saving {len(chunks)} chunks to {target_dir}/")

    for index, chunk in enumerate(chunks, start=1):
        chunk_filename = f"chunk_{index:03d}_{run_id}.md"
        rendered = create_chunk_markdown(chunk, "notebook_content.rst", [])
        destination = target_dir / chunk_filename

        try:
            with open(destination, 'w', encoding='utf-8') as handle:
                handle.write(rendered)

            print(f"  ✅ Saved: {chunk_filename}")
            written += 1

        except Exception as e:
            # Report and carry on with the remaining chunks.
            print(f"  ❌ Error saving {chunk_filename}: {e}")

    print(f"✅ Successfully saved {written} clean chunk files")


# =============================================================================
# OUTPUT GENERATION
# =============================================================================

# =============================================================================
# NOTEBOOK INTERFACE (for direct content processing)
# =============================================================================

def save_chunks_as_markdown(chunks: List[RSTChunk], output_dir: str = "rst_chunks_tree_sitter") -> None:
    """
    Save chunks as markdown files (notebook version).

    Each chunk is rendered with create_chunk_markdown() using the fixed
    source name ``notebook_content.rst`` and no references, then written
    to ``output_dir`` as ``chunk_<NNN>_<id>.md``. ``<id>`` is one random id
    shared by all files of this call. Write failures are reported on
    stdout and skipped.

    Args:
        chunks: Chunks to write.
        output_dir: Target directory; created together with any missing
            parent directories.
    """
    from pathlib import Path

    output_path = Path(output_dir)
    # parents=True so a nested output_dir such as "out/rst" also works.
    output_path.mkdir(parents=True, exist_ok=True)

    file_unique_id = generate_unique_id()
    saved_count = 0

    print(f"\n💾 Saving {len(chunks)} chunks to {output_path}/")

    for i, chunk in enumerate(chunks, 1):
        chunk_filename = f"chunk_{i:03d}_{file_unique_id}.md"
        markdown_content = create_chunk_markdown(chunk, "notebook_content.rst", [])

        chunk_file_path = output_path / chunk_filename
        try:
            with open(chunk_file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            print(f"  ✅ Saved: {chunk_filename}")
            saved_count += 1

        except Exception as e:
            # Best effort: one failed file must not abort the rest.
            print(f"  ❌ Error saving {chunk_filename}: {e}")

    print(f"✅ Successfully saved {saved_count} clean chunk files (no warnings!)")


# Run the interactive chunker only when this code is executed as a script
# (or as a notebook cell, where __name__ is also "__main__"), not on import.
if __name__ == "__main__":
    main()

✅ Found RST language in tree-sitter-language-pack
🚀 RST (reStructuredText) Semantic Chunking with Tree-sitter
Max chunk tokens: 1000
Target tokens for grouping: 600
📁 Output directory: /Users/tiyadiashok/python-projects/code_chunker/rag_chunks/pre_processed/docs/rst/docutils_rst_chunks
🔍 Found 60 RST file(s)
  📂 docs:
    - index.rst
    - header2.rst
    - header0.rst
    - header.rst
  📂 docs/api:
    - runtime-settings.rst
    - publisher.rst
    - transforms.rst
  📂 docs/dev:
    - policies.rst
    - website.rst
    - distributing.rst
    - runtime-settings-processing.rst
    - pysource.rst
    - testing.rst
    - todo.rst
    - hacking.rst
    - semantics.rst
    - enthought-plan.rst
    - enthought-rfp.rst
    - repository.rst
    - release.rst
  📂 docs/dev/rst:
    - alternatives.rst
    - problems.rst
  📂 docs/eps:
    - index.rst
    - ep-template.rst
    - header.rst
    - ep-001.rst
    - ep-010.rst
  📂 docs/howto:
    - rst-directives.rst
    - html-stylesheets.rst
    - i1

In [None]:
tttt

In [15]:
#!/usr/bin/env python3
"""
Tree-sitter RST Chunker

A clean, robust approach to RST chunking using tree-sitter instead of docutils.
No warning injection, clean error handling, syntax-aware chunking.
"""

import sys
import string
import secrets
import tiktoken
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple

# Optional dependency probe. RST_LANGUAGE ends up as the tree-sitter grammar
# object for RST, or None when either the packages or the grammar are
# unavailable; None makes the rest of the module use text-based chunking.
try:
    from tree_sitter_language_pack import get_language, get_parser
    from tree_sitter import Parser
except ImportError:
    print("❌ tree-sitter-language-pack not installed. Install with: pip install tree-sitter-language-pack")
    print("Will use fallback text-based chunking")
    RST_LANGUAGE = None
else:
    # Packages are importable; the RST grammar itself may still be missing.
    try:
        RST_LANGUAGE = get_language('rst')
        print("✅ Found RST language in tree-sitter-language-pack")
    except Exception as e:
        print(f"⚠️ Could not load RST from language pack: {e}")
        print("Will use fallback text-based chunking")
        RST_LANGUAGE = None


# =============================================================================
# CONFIGURATION AND CONSTANTS
# =============================================================================

# Hard ceiling: chunks whose token_count exceeds this are split further
# after grouping.
MAX_CHUNK_TOKENS = 1000
# Soft target passed to group_small_chunks() when merging small neighbours.
TARGET_TOKENS = 600


# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class RSTChunk:
    """Represents a semantic chunk of RST content.

    ``token_count`` is derived from ``content`` via count_tokens() when it
    is left at its default of 0.
    """
    start_line: int  # first source line of the chunk (chunkers here count from 1)
    end_line: int  # last source line of the chunk
    content: str  # the chunk's text
    node_type: str  # e.g. 'section', 'content_block', 'grouped_sections'
    name: str  # display name, e.g. the section title
    depth: int  # nesting depth reported by the producer of the chunk
    level: int = 0  # section level as assigned by the producer of the chunk
    token_count: int = 0  # 0 means "compute from content"

    def __post_init__(self) -> None:
        # Fill in the token count unless the caller supplied a non-zero one.
        if self.token_count == 0:
            self.token_count = count_tokens(self.content)


@dataclass
class RSTReference:
    """Represents a reference/include in RST"""
    reference_type: str  # kind of reference, e.g. the parse-tree node type
    target: str  # referenced text as it appears in the source
    start_line: int  # first source line of the reference
    end_line: int  # last source line of the reference


# =============================================================================
# TOKEN COUNTING
# =============================================================================

def count_tokens(content: str) -> int:
    """
    Return the number of tokens in ``content``.

    Uses the tiktoken encoding registered for the "gpt-4" model. If that
    fails for any reason, the count is estimated as one token per four
    characters (integer division).
    """
    try:
        token_ids = tiktoken.encoding_for_model("gpt-4").encode(content)
    except Exception:
        # Tokenizer unavailable or failed: fall back to a rough estimate.
        return len(content) // 4
    return len(token_ids)


# =============================================================================
# TREE-SITTER PARSING
# =============================================================================

def parse_rst_with_tree_sitter(content: str) -> Optional[object]:
    """
    Parse RST text with tree-sitter and return the parse tree.

    Args:
        content: RST source text; it is encoded as UTF-8 for the parser.

    Returns:
        The tree-sitter tree, or ``None`` when no RST grammar is loaded
        (RST_LANGUAGE is falsy) or when parsing raised an exception. A tree
        containing error nodes is still returned; a notice is printed.
    """
    if not RST_LANGUAGE:
        return None

    try:
        parser = Parser()
        # py-tree-sitter changed how the grammar is attached: older
        # releases expose set_language(), newer ones a writable `language`
        # attribute. Support both so the parser is not silently disabled
        # by an AttributeError on newer installs.
        if hasattr(parser, 'set_language'):
            parser.set_language(RST_LANGUAGE)
        else:
            parser.language = RST_LANGUAGE

        tree = parser.parse(content.encode('utf-8'))

        # Error nodes only affect the tree; the source text is untouched.
        if tree.root_node.has_error:
            print("⚠️ Parse tree contains error nodes (but content stays clean)")

        return tree
    except Exception as e:
        print(f"❌ Tree-sitter parsing failed: {e}")
        return None


def extract_node_text(node: object, source_bytes: bytes) -> str:
    """
    Return the source text covered by ``node``.

    The slice ``source_bytes[node.start_byte:node.end_byte]`` is decoded as
    UTF-8; bytes that cannot be decoded are dropped.
    """
    covered = source_bytes[node.start_byte:node.end_byte]
    return covered.decode('utf-8', errors='ignore')


def bytes_to_line_numbers(start_byte: int, end_byte: int, source_lines: List[str]) -> Tuple[int, int]:
    """
    Map a byte span to 1-based (start_line, end_line).

    Each line is assumed to occupy its UTF-8 length plus one byte for the
    newline. ``start_byte`` maps to the line whose byte range contains it;
    ``end_byte`` (exclusive end) maps to the line containing the byte just
    before it. Scanning stops as soon as the end line is found. Positions
    that match no line leave the corresponding result at 1.
    """
    first_line, last_line = 1, 1
    line_begin = 0

    for number, text in enumerate(source_lines, start=1):
        line_end = line_begin + len(text.encode('utf-8')) + 1  # +1 for newline

        if line_begin <= start_byte < line_end:
            first_line = number
        if line_begin < end_byte <= line_end:
            last_line = number
            break

        line_begin = line_end

    return first_line, last_line


def find_rst_sections(node: object, source_bytes: bytes, depth: int = 0) -> List[Dict[str, Any]]:
    """
    Collect section-like and directive-like nodes from a parse tree.

    The tree is walked depth-first in document order starting at ``node``.
    Nodes of type ``section``/``title``/``heading`` become ``section``
    records (level = depth + 1); nodes of type ``directive``/
    ``admonition``/``code_block`` become ``<type>_directive`` records with
    a fixed level of 8. Children of a matched node are still visited.

    Args:
        node: Root of the (sub)tree to walk.
        source_bytes: UTF-8 source the tree was parsed from.
        depth: Depth assigned to ``node``; children get depth + 1.

    Returns:
        A list of dicts with the keys ``type``, ``name``, ``start_line``,
        ``end_line``, ``content``, ``depth`` and ``level``.
    """
    # Decode and split once for the whole walk instead of once per node.
    source_lines = source_bytes.decode('utf-8', errors='ignore').split('\n')
    sections = []

    def visit(current, current_depth):
        # Look for section-like nodes
        if current.type in ['section', 'title', 'heading']:
            text = extract_node_text(current, source_bytes)
            start_line, end_line = bytes_to_line_numbers(current.start_byte, current.end_byte, source_lines)

            sections.append({
                'type': 'section',
                'name': text.strip().split('\n')[0][:50],  # First line as name
                'start_line': start_line,
                'end_line': end_line,
                'content': text,
                'depth': current_depth,
                'level': current_depth + 1
            })

        # Look for directive nodes (.. note::, .. code-block::, etc.)
        elif current.type in ['directive', 'admonition', 'code_block']:
            text = extract_node_text(current, source_bytes)
            directive_name = text.split('\n')[0].strip()
            start_line, end_line = bytes_to_line_numbers(current.start_byte, current.end_byte, source_lines)

            sections.append({
                'type': f'{current.type}_directive',
                'name': directive_name,
                'start_line': start_line,
                'end_line': end_line,
                'content': text,
                'depth': current_depth,
                'level': 8  # Lower priority for directives
            })

        for child in current.children:
            visit(child, current_depth + 1)

    visit(node, depth)
    return sections


def extract_references_tree_sitter(node: object, source_bytes: bytes) -> List[RSTReference]:
    """
    Collect reference-like nodes from a parse tree.

    The tree is walked depth-first in document order starting at ``node``.
    Every node whose type is ``reference``, ``link``, ``image`` or
    ``include`` yields one RSTReference whose ``reference_type`` is the
    node type and whose ``target`` is the node's stripped source text.

    Args:
        node: Root of the (sub)tree to walk.
        source_bytes: UTF-8 source the tree was parsed from.

    Returns:
        The references found, in document order.
    """
    # Decode and split once for the whole walk instead of once per node.
    source_lines = source_bytes.decode('utf-8', errors='ignore').split('\n')
    references = []

    def visit(current):
        if current.type in ['reference', 'link', 'image', 'include']:
            text = extract_node_text(current, source_bytes)
            start_line, end_line = bytes_to_line_numbers(current.start_byte, current.end_byte, source_lines)

            references.append(RSTReference(
                reference_type=current.type,
                target=text.strip(),
                start_line=start_line,
                end_line=end_line
            ))

        for child in current.children:
            visit(child)

    visit(node)
    return references


# =============================================================================
# FALLBACK TEXT-BASED CHUNKING
# =============================================================================

def chunk_rst_by_text_structure(content: str) -> List[Dict[str, Any]]:
    """
    Fallback chunking when tree-sitter isn't available.
    Uses text patterns to identify RST structures.

    A line is treated as a section title when the following line is
    non-empty, consists only of adornment characters and is at least 80%
    as long as the title. A line is treated as a directive when it starts
    with ``.. `` and contains ``::``. Either one closes the block collected
    so far. A section title is emitted together with its underline as a
    ``section`` record; everything else ends up in ``content_block``
    records. Blocks consisting only of whitespace are dropped.

    Args:
        content: RST source text.

    Returns:
        A list of dicts with the keys ``type``, ``name``, ``start_line``,
        ``end_line`` (both 1-based, inclusive), ``content``, ``depth`` and
        ``level``.
    """
    adornment_chars = '=-~^"\'`#*+<>'
    chunks = []
    lines = content.split('\n')
    current_chunk_lines = []
    current_start_line = 1
    skip_underline = False  # True right after a title: next line is its underline

    for i, line in enumerate(lines, 1):
        if skip_underline:
            # This line was already emitted as part of the section record.
            skip_underline = False
            continue

        line_stripped = line.strip()

        # `i` is 1-based, so lines[i] is the line that follows `line`.
        next_line = lines[i].strip() if i < len(lines) else ""
        is_section_header = bool(
            line_stripped and next_line
            and all(c in adornment_chars for c in next_line)
            and len(next_line) >= len(line_stripped) * 0.8
        )

        is_directive = line_stripped.startswith('.. ') and '::' in line_stripped

        # A section or directive closes the block collected so far.
        if (is_section_header or is_directive) and current_chunk_lines:
            chunk_content = '\n'.join(current_chunk_lines)
            if chunk_content.strip():
                chunks.append({
                    'type': 'content_block',
                    'name': f'content_block_{len(chunks) + 1}',
                    'start_line': current_start_line,
                    'end_line': i - 1,
                    'content': chunk_content,
                    'depth': 0,
                    'level': 1
                })
            current_chunk_lines = []
            current_start_line = i

        current_chunk_lines.append(line)

        if is_section_header:
            # Keep the underline with its title so the heading stays valid
            # RST and does not leak into the following content block.
            current_chunk_lines.append(lines[i])
            chunks.append({
                'type': 'section',
                'name': line_stripped,
                'start_line': current_start_line,
                'end_line': i + 1,  # Include underline
                'content': '\n'.join(current_chunk_lines),
                'depth': 0,
                'level': 1
            })
            current_chunk_lines = []
            current_start_line = i + 2
            skip_underline = True

    # Add final chunk
    if current_chunk_lines:
        chunk_content = '\n'.join(current_chunk_lines)
        if chunk_content.strip():
            chunks.append({
                'type': 'content_block',
                'name': f'content_block_{len(chunks) + 1}',
                'start_line': current_start_line,
                'end_line': len(lines),
                'content': chunk_content,
                'depth': 0,
                'level': 1
            })

    return chunks


# =============================================================================
# MAIN PROCESSING
# =============================================================================

def process_rst_content_tree_sitter(rst_content: str, file_name: str = "content.rst") -> List[RSTChunk]:
    """
    Chunk RST text, preferring tree-sitter and falling back to text rules.

    Structural units come from the tree-sitter parse tree when
    RST_LANGUAGE is set and parsing succeeds; otherwise, or when that
    yields no units, from chunk_rst_by_text_structure(). Units are wrapped
    in RSTChunk objects, small ones are merged, chunks above
    MAX_CHUNK_TOKENS are split, and the result is scanned for
    docutils-style system-message markers. Progress is printed throughout.

    Args:
        rst_content: Full RST source text.
        file_name: Name used only in progress messages.

    Returns:
        The final list of chunks.
    """
    print(f"🌳 Processing {file_name} with tree-sitter approach")

    semantic_nodes = []

    # Tree-sitter is attempted only when a grammar was loaded at import.
    tree = parse_rst_with_tree_sitter(rst_content) if RST_LANGUAGE else None
    if tree:
        source_bytes = rst_content.encode('utf-8')
        root = tree.root_node
        semantic_nodes = find_rst_sections(root, source_bytes)
        # The reference list is only used for the count printed below.
        references = extract_references_tree_sitter(root, source_bytes)
        print(f"✅ Tree-sitter found {len(semantic_nodes)} semantic units")
        print(f"📎 Found {len(references)} references")

    if not semantic_nodes:
        print("📝 Falling back to text-based structure detection")
        semantic_nodes = chunk_rst_by_text_structure(rst_content)
        print(f"✅ Text-based chunking found {len(semantic_nodes)} chunks")

    # Dict records -> RSTChunk objects.
    chunks = [
        RSTChunk(
            start_line=record['start_line'],
            end_line=record['end_line'],
            content=record['content'],
            node_type=record['type'],
            name=record['name'],
            depth=record['depth'],
            level=record['level'],
        )
        for record in semantic_nodes
    ]

    chunks = group_small_chunks(chunks, target_tokens=TARGET_TOKENS)
    print(f"Created {len(chunks)} grouped chunks")

    # Split whatever is still above the hard token limit.
    final_chunks = []
    for chunk in chunks:
        if chunk.token_count <= MAX_CHUNK_TOKENS:
            final_chunks.append(chunk)
            continue
        print(f"  📦 Sub-chunking {chunk.name} ({chunk.token_count} tokens)")
        final_chunks.extend(sub_chunk_by_bytes(chunk, rst_content))

    print(f"Final result: {len(final_chunks)} clean chunks (no warnings injected!)")

    # Count chunks that contain a docutils system-message marker.
    warning_patterns = ['(WARNING/', '(ERROR/', '(INFO/', '(SEVERE/']
    contaminated_chunks = sum(
        1
        for chunk in final_chunks
        if any(pattern in chunk.content for pattern in warning_patterns)
    )

    if contaminated_chunks == 0:
        print("✅ All chunks verified clean - no docutils warnings!")
    else:
        print(f"⚠️ {contaminated_chunks} chunks may contain warnings")

    return final_chunks


def group_small_chunks(chunks: List[RSTChunk], target_tokens: int = 600) -> List[RSTChunk]:
    """Merge consecutive small chunks into groups of roughly ``target_tokens``.

    Args:
        chunks: Chunks in document order.
        target_tokens: Token budget that, once exceeded, closes the current
            group and starts a new one.

    Returns:
        The input unchanged when it is empty; a single ``combined_document``
        chunk when the total token count is at most ``MAX_CHUNK_TOKENS``;
        otherwise a list where each entry is either an original chunk (group
        of one) or a ``grouped_sections`` chunk whose content is the group's
        contents joined by blank lines.
    """
    if not chunks:
        return chunks

    # Small documents are emitted as one chunk covering everything.
    if sum(c.token_count for c in chunks) <= MAX_CHUNK_TOKENS:
        return [RSTChunk(
            start_line=min(c.start_line for c in chunks),
            end_line=max(c.end_line for c in chunks),
            content="\n\n".join(c.content for c in chunks),
            node_type="combined_document",
            name="complete_document",
            depth=0,
            level=0
        )]

    grouped_chunks = []

    def flush(group):
        """Append ``group`` to the result, merging it when it has 2+ chunks."""
        if len(group) == 1:
            grouped_chunks.append(group[0])
            return
        # Both the in-loop flush and the final flush go through here, so the
        # line-based span fields are used consistently for every group.
        grouped_chunks.append(RSTChunk(
            start_line=min(c.start_line for c in group),
            end_line=max(c.end_line for c in group),
            content="\n\n".join(c.content for c in group),
            node_type="grouped_sections",
            name=f"sections_{group[0].name}_to_{group[-1].name}",
            depth=min(c.depth for c in group),
            level=min(c.level for c in group)
        ))

    current_group = []
    current_tokens = 0

    for chunk in chunks:
        # A chunk larger than MAX_CHUNK_TOKENS never closes the current group;
        # it is appended to it instead.
        if (current_group
                and chunk.token_count <= MAX_CHUNK_TOKENS
                and current_tokens + chunk.token_count > target_tokens):
            flush(current_group)
            current_group = [chunk]
            current_tokens = chunk.token_count
        else:
            current_group.append(chunk)
            current_tokens += chunk.token_count

    if current_group:
        flush(current_group)

    return grouped_chunks


def sub_chunk_by_bytes(chunk: RSTChunk, rst_content: str) -> List[RSTChunk]:
    """Split an oversized chunk into parts along line boundaries.

    Despite the name, splitting is done per line of ``chunk.content``: lines
    are accumulated until adding the next one would push the summed per-line
    token count above ``MAX_CHUNK_TOKENS``.

    Args:
        chunk: The chunk to split.
        rst_content: Not used by this function; kept so existing callers
            keep working.

    Returns:
        ``[chunk]`` when it already fits or when no non-blank part could be
        produced; otherwise the list of ``*_part`` chunks, numbered from 1.
    """
    if chunk.token_count <= MAX_CHUNK_TOKENS:
        return [chunk]

    sub_chunks: List[RSTChunk] = []

    def emit(part_lines, first_line, last_line):
        """Record one part unless it is whitespace-only."""
        text = '\n'.join(part_lines)
        if not text.strip():
            return
        sub_chunks.append(RSTChunk(
            start_line=first_line,
            end_line=last_line,
            content=text,
            node_type=f"{chunk.node_type}_part",
            # Parts are numbered by how many have actually been emitted.
            name=f"{chunk.name}_part_{len(sub_chunks) + 1}",
            depth=chunk.depth + 1,
            level=chunk.level
        ))

    pending = []
    pending_tokens = 0
    next_line = chunk.start_line

    for line in chunk.content.split('\n'):
        line_tokens = count_tokens(line)

        if pending and pending_tokens + line_tokens > MAX_CHUNK_TOKENS:
            emit(pending, next_line, next_line + len(pending) - 1)
            # Advance past the flushed lines even when the part was blank and
            # therefore dropped; otherwise later parts report shifted lines.
            next_line += len(pending)
            pending = [line]
            pending_tokens = line_tokens
        else:
            pending.append(line)
            pending_tokens += line_tokens

    if pending:
        # The trailing part always ends where the parent chunk ends.
        emit(pending, next_line, chunk.end_line)

    return sub_chunks if sub_chunks else [chunk]


# =============================================================================
# FILE PROCESSING AND DIRECTORY HANDLING
# =============================================================================

def process_rst_file(file_path: Path) -> List[RSTChunk]:
    """Read one RST file as UTF-8 and chunk its content.

    Progress and failures are reported on stdout. Any exception raised while
    reading or chunking is caught and reported, and an empty list is returned
    in that case.
    """
    print(f"\n🔍 Processing: {file_path.name}")

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            source_text = handle.read()

        result = process_rst_content_tree_sitter(source_text, file_path.name)

        # Only take len() when there is something to count.
        status = (
            f"✅ Generated {len(result)} chunks for {file_path.name}"
            if result
            else f"⚠️ No chunks generated for {file_path.name}"
        )
        print(status)
        return result

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")
        return []


def save_chunks_to_files(chunks: List[RSTChunk], 
                        original_file_path: Path, 
                        input_directory: Path,
                        output_base: Path,
                        references: List[RSTReference]) -> List[str]:
    """Write each chunk to its own markdown file, mirroring the source tree.

    Args:
        chunks: Chunks produced for ``original_file_path``.
        original_file_path: The RST file the chunks came from.
        input_directory: Root used to compute the file's relative location.
        output_base: Root under which the mirrored directories are created.
        references: References passed through to ``create_chunk_markdown``.

    Returns:
        Paths (as strings) of the files that were written successfully. A
        chunk whose file cannot be written is reported and skipped.
    """
    if not chunks:
        return []

    # Locate the file relative to the input root so the output mirrors it.
    try:
        rel_path = original_file_path.relative_to(input_directory)
    except ValueError:
        # File is outside the input root: fall back to its bare name. This
        # must stay a Path because ``.parent`` is read just below.
        rel_path = Path(original_file_path.name)

    output_dir = output_base / rel_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # One random suffix shared by every chunk file of this source file.
    file_unique_id = generate_unique_id()

    saved_files = []

    for i, chunk in enumerate(chunks, 1):
        chunk_filename = f"{original_file_path.name}_chunk_{i:03d}_{file_unique_id}.md"

        markdown_content = create_chunk_markdown(
            chunk,
            str(rel_path),
            references
        )

        chunk_file_path = output_dir / chunk_filename
        try:
            with open(chunk_file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
        except (OSError, UnicodeError) as e:
            # Best effort: report the failure and carry on with the rest.
            print(f"  ❌ Error saving {chunk_filename}: {e}")
        else:
            saved_files.append(str(chunk_file_path))
            print(f"  ✅ Saved: {chunk_filename}")

    return saved_files


def print_chunk_summary(chunks: List[RSTChunk], file_name: str):
    """Print a numbered, depth-indented overview of ``chunks`` to stdout.

    For every chunk this shows its name, type, level, line count, token
    count, and a single-line preview of at most the first 200 characters.
    """
    print(f"\n--- RST Chunk Summary for {file_name} ---")

    for position, item in enumerate(chunks, 1):
        pad = "  " * item.depth
        # Number of lines = number of newline separators + 1.
        line_total = item.content.count('\n') + 1

        snippet = item.content[:200].replace('\n', ' ').strip()
        if len(item.content) > 200:
            snippet = snippet + "..."

        print(f"{pad}{position}. {item.name}")
        print(f"{pad}   Type: {item.node_type} | Level: {item.level} | Lines: {line_total} | Tokens: {item.token_count}")
        print(f"{pad}   Preview: {snippet}")


# =============================================================================
# MAIN PROCESSING - SAME INTERFACE AS ORIGINAL
# =============================================================================

def main():
    """Interactive entry point: chunk every RST-looking file under a directory.

    Prompts for a source directory (defaulting to the current one), collects
    ``*.rst`` and ``*.txt`` files whose first 1000 characters contain an
    RST-like marker, asks for confirmation, then processes each file and
    writes its chunks to a sibling ``<dirname>_rst_chunks`` directory.
    Per-file failures are reported and do not stop the run.
    """
    print("🚀 RST (reStructuredText) Semantic Chunking with Tree-sitter")
    print(f"Max chunk tokens: {MAX_CHUNK_TOKENS}")
    print(f"Target tokens for grouping: {TARGET_TOKENS}")
    print("🌳 Using tree-sitter for clean, warning-free parsing")

    directory = input("\nEnter source directory path (or press Enter for current directory): ").strip()
    if not directory:
        directory = "."

    target_dir = Path(directory).resolve()
    if not target_dir.exists():
        print(f"❌ Directory not found: {directory}")
        return

    # Output lives next to the source directory, not inside it, so the
    # recursive scan below never picks up generated files.
    output_dir = target_dir.parent / f"{target_dir.name}_rst_chunks"
    output_dir.mkdir(exist_ok=True)
    print(f"📁 Output directory: {output_dir}")

    input_directory = target_dir

    candidates = []
    for ext in ['*.rst', '*.txt']:
        candidates.extend(target_dir.rglob(ext))

    # Keep only files whose beginning looks like RST.
    rst_markers = ['===', '---', '~~~', '^^^', '.. ', '::']
    rst_files = []
    for file in candidates:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                content = f.read(1000)  # Check first 1000 chars
        except (OSError, UnicodeDecodeError):
            # Unreadable or non-UTF-8 file: skip it. Deliberately not a bare
            # except, so Ctrl-C still interrupts the scan.
            continue
        if any(marker in content for marker in rst_markers):
            rst_files.append(file)

    if not rst_files:
        print(f"❌ No RST files found in {directory}")
        return

    print(f"🔍 Found {len(rst_files)} RST file(s)")

    # Group by directory for display.
    by_dir = {}
    for f in rst_files:
        dir_path = str(f.parent.relative_to(input_directory)) if f.parent != input_directory else '.'
        by_dir.setdefault(dir_path, []).append(f)

    for dir_path, files in sorted(by_dir.items()):
        print(f"  📂 {dir_path}:")
        for f in files:
            print(f"    - {f.name}")

    proceed = input(f"\nProcess all {len(rst_files)} files? (y/n): ").strip().lower()
    if proceed != 'y':
        print("❌ Processing cancelled")
        return

    total_chunks = 0
    processed_files = 0

    for file_path in rst_files:
        try:
            chunks = process_rst_file(file_path)

            if chunks:
                # References are optional metadata: if extraction fails for
                # any ordinary reason, save the chunks without them.
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                    references = extract_references_from_source(content)
                except Exception:
                    references = []

                save_chunks_to_files(
                    chunks, file_path, input_directory, output_dir, references
                )

                total_chunks += len(chunks)
                processed_files += 1

                print_chunk_summary(chunks, file_path.name)
            else:
                print(f"⚠️ No chunks generated for {file_path.name}")

        except Exception as e:
            # Top-level boundary for one file: report and move on.
            print(f"❌ Error processing {file_path}: {e}")

    print("\n🎉 Processing complete!")
    print(f"✅ Files processed: {processed_files}/{len(rst_files)}")
    print(f"📄 Total chunks created: {total_chunks}")
    print(f"📁 Output directory: {output_dir}")
    print("🌳 All chunks generated with tree-sitter (no warning contamination!)")


def extract_references_from_source(rst_content: str) -> List[RSTReference]:
    """Scan raw RST text for file-referencing directives.

    Each line is stripped; a line starting with ``..`` that contains one of
    the tracked directives (``include``, ``literalinclude``, ``image``,
    ``figure``, ``csv-table``) yields one reference. The target is the text
    after the first ``::`` on the line, and both line fields are set to the
    1-based line number. At most one reference is produced per line.
    """
    tracked_directives = ('include', 'literalinclude', 'image', 'figure', 'csv-table')
    found = []

    for line_num, raw_line in enumerate(rst_content.split('\n'), 1):
        text = raw_line.strip()
        if not text.startswith('..'):
            continue

        # First tracked directive (in list order) that appears on the line.
        kind = next((d for d in tracked_directives if f'.. {d}::' in text), None)
        if kind is None:
            continue

        found.append(RSTReference(
            reference_type=kind,
            target=text.split('::', 1)[1].strip(),
            start_line=line_num,
            end_line=line_num
        ))

    return found

def generate_unique_id(length: int = 6) -> str:
    """Return ``length`` random characters drawn from ``a-z`` and ``0-9``.

    Characters are picked with ``secrets.choice``. The result is random, so
    uniqueness is probabilistic rather than guaranteed.
    """
    charset = string.ascii_lowercase + string.digits
    picked = [secrets.choice(charset) for _ in range(length)]
    return ''.join(picked)


def create_chunk_markdown(chunk: RSTChunk, source_file_path: str, references: List[RSTReference]) -> str:
    """Render a chunk as markdown with a YAML front-matter header.

    Args:
        chunk: The chunk to render.
        source_file_path: Path recorded in the ``file_path`` field.
        references: Candidate references; only those whose ``start_line``
            falls within the chunk's line span are listed.

    Returns:
        The front matter, a heading with the chunk name, and the chunk
        content inside an ``rst`` code fence.
    """
    # Local import so this function carries its own dependency.
    import json

    def quoted(value) -> str:
        # A JSON string is also a valid YAML double-quoted scalar, so quotes,
        # backslashes and newlines in names or paths cannot break the header.
        return json.dumps(str(value), ensure_ascii=False)

    unique_id = generate_unique_id()

    chunk_references = [
        f"{ref.reference_type}: {ref.target}"
        for ref in references
        if chunk.start_line <= ref.start_line <= chunk.end_line
    ]
    # Emitted as a JSON array (valid YAML flow sequence) rather than a Python
    # list repr, whose escaping rules differ from YAML's.
    references_yaml = json.dumps(chunk_references, ensure_ascii=False)

    frontmatter = f"""---
file_path: {quoted(source_file_path)}
chunk_id: {quoted(unique_id)}
chunk_type: {quoted(chunk.node_type)}
chunk_name: {quoted(chunk.name)}
start_line: {chunk.start_line}
end_line: {chunk.end_line}
token_count: {chunk.token_count}
depth: {chunk.depth}
level: {chunk.level}
language: "rst"
references: {references_yaml}
---

# {chunk.name}

```rst
{chunk.content}
```
"""
    return frontmatter


def save_chunks_as_markdown(chunks: List[RSTChunk], output_dir: str = "rst_chunks_tree_sitter") -> None:
    """Write every chunk to ``output_dir`` as ``chunk_NNN_<id>.md``.

    Files that cannot be written are reported and skipped; a final line
    reports how many were saved.
    """
    # NOTE: a function with this same name is defined again further down in
    # this file; that later definition replaces this one at import time.
    from pathlib import Path

    destination = Path(output_dir)
    destination.mkdir(exist_ok=True)

    run_id = generate_unique_id()
    written = 0

    print(f"\n💾 Saving {len(chunks)} chunks to {destination}/")

    for index, chunk in enumerate(chunks, 1):
        filename = f"chunk_{index:03d}_{run_id}.md"
        body = create_chunk_markdown(chunk, "notebook_content.rst", [])
        target_file = destination / filename

        try:
            with open(target_file, 'w', encoding='utf-8') as out:
                out.write(body)

            print(f"  ✅ Saved: {filename}")
            written += 1

        except Exception as e:
            print(f"  ❌ Error saving {filename}: {e}")

    print(f"✅ Successfully saved {written} clean chunk files")


# =============================================================================
# OUTPUT GENERATION
# =============================================================================

# =============================================================================
# NOTEBOOK INTERFACE (for direct content processing)
# =============================================================================

def save_chunks_as_markdown(chunks: List[RSTChunk], output_dir: str = "rst_chunks_tree_sitter") -> None:
    """Save chunks as markdown files (notebook version).

    Each chunk is written to ``output_dir`` as ``chunk_NNN_<id>.md``, where
    ``<id>`` is one random suffix shared by the whole batch. The source path
    recorded in every file is ``notebook_content.rst`` and no references are
    attached.

    Args:
        chunks: Chunks to write, numbered from 1 in the given order.
        output_dir: Destination directory; created, including any missing
            parent directories, if it does not exist.

    A file that cannot be written is reported and skipped; the final line
    reports how many were saved.
    """
    from pathlib import Path

    output_path = Path(output_dir)
    # parents=True so a nested destination such as "out/chunks" works too.
    output_path.mkdir(parents=True, exist_ok=True)

    file_unique_id = generate_unique_id()
    saved_count = 0

    print(f"\n💾 Saving {len(chunks)} chunks to {output_path}/")

    for i, chunk in enumerate(chunks, 1):
        chunk_filename = f"chunk_{i:03d}_{file_unique_id}.md"
        markdown_content = create_chunk_markdown(chunk, "notebook_content.rst", [])

        chunk_file_path = output_path / chunk_filename
        try:
            with open(chunk_file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            print(f"  ✅ Saved: {chunk_filename}")
            saved_count += 1

        except Exception as e:
            # Best effort: report the failure and continue with the rest.
            print(f"  ❌ Error saving {chunk_filename}: {e}")

    print(f"✅ Successfully saved {saved_count} clean chunk files (no warnings!)")


# Entry point: run the interactive chunking workflow when this module is
# executed directly rather than imported.
if __name__ == "__main__":
    main()

✅ Found RST language in tree-sitter-language-pack
🚀 RST (reStructuredText) Semantic Chunking with Tree-sitter
Max chunk tokens: 1000
Target tokens for grouping: 600
📁 Output directory: /Users/tiyadiashok/python-projects/code_chunker/rag_chunks/pre_processed/docs/rst/docutils_rst_chunks
🔍 Found 60 RST file(s)
  📂 docs:
    - index.rst
    - header2.rst
    - header0.rst
    - header.rst
  📂 docs/api:
    - runtime-settings.rst
    - publisher.rst
    - transforms.rst
  📂 docs/dev:
    - policies.rst
    - website.rst
    - distributing.rst
    - runtime-settings-processing.rst
    - pysource.rst
    - testing.rst
    - todo.rst
    - hacking.rst
    - semantics.rst
    - enthought-plan.rst
    - enthought-rfp.rst
    - repository.rst
    - release.rst
  📂 docs/dev/rst:
    - alternatives.rst
    - problems.rst
  📂 docs/eps:
    - index.rst
    - ep-template.rst
    - header.rst
    - ep-001.rst
    - ep-010.rst
  📂 docs/howto:
    - rst-directives.rst
    - html-stylesheets.rst
    - i1