Core AST Chunking Test Program
This program implements just the essential AST-based chunking functionality for .tsx files. It will:

Parse .tsx files using Tree-sitter TypeScript parser
Identify semantic boundaries (functions, classes, components, etc.)
Create base chunks from AST nodes
Apply sub-chunking when chunks exceed token limits
Print results showing the chunking hierarchy

What's included:

Core Tree-sitter parsing for TypeScript/TSX
Basic AST boundary detection
Token counting for size management
Recursive sub-chunking logic
Simple output display

What's excluded:

Import management
Fallback chunking
Package.json analysis
Error handling
Performance optimization
File validation
Output generation

In [7]:
#!/usr/bin/env python3

import os
from pathlib import Path
from typing import List, Dict, Any
import tiktoken

# Tree-sitter setup for TSX
try:
    from tree_sitter_language_pack import get_language, get_parser
except ImportError:
    # The module imported above ships in the `tree-sitter-language-pack`
    # distribution; the hint must name that package, not a different one.
    print("Please install: pip install tree-sitter-language-pack")
    # SystemExit instead of exit(): `exit` is injected by the `site` module
    # and is not guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(1)
else:
    tsx_language = get_language('tsx')
    parser = get_parser('tsx')
    print("✅ Using TSX parser for React TypeScript files")

# Configuration
MAX_CHUNK_TOKENS = 1000  # chunks above this token count are split further
MAX_RECURSION_DEPTH = 3  # depth guard checked by sub_chunk_by_statements

# Initialize token encoder
encoder = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    """Return the number of tokens the module-level ``encoder`` produces for ``text``.

    Source files may legitimately contain strings that look like tokenizer
    special tokens (for example ``<|endoftext|>``). With tiktoken's default
    settings such text raises ``ValueError``; passing ``disallowed_special=()``
    makes the encoder treat it as ordinary text so counting never fails on
    file content.
    """
    return len(encoder.encode(text, disallowed_special=()))

class Chunk:
    """A contiguous piece of source text together with its bookkeeping data.

    The token count is computed once, at construction time, from ``content``.
    ``sub_chunks`` starts out empty and is filled in by the sub-chunking step.
    """

    def __init__(self, start_byte: int, end_byte: int, content: str, node_type: str, name: str, depth: int = 0):
        # What the chunk is.
        self.name, self.node_type, self.depth = name, node_type, depth
        # Where the chunk sits in the file.
        self.start_byte, self.end_byte = start_byte, end_byte
        # The text itself and its size in tokens.
        self.content = content
        self.token_count = count_tokens(content)
        # Smaller pieces of this chunk, if it was split.
        self.sub_chunks = []

def extract_node_name(node, source_code: str) -> str:
    """Extract a meaningful name for an AST node.

    Args:
        node: Object exposing ``type``, ``start_byte`` and ``end_byte``
            (a Tree-sitter node).
        source_code: Full text of the file the node was parsed from.

    Returns:
        The declared identifier for known declaration kinds, a prefixed name
        (``interface_X``, ``const_X``, ``function_X``), ``"default_export"``
        or ``"export_statement"`` for export statements, otherwise the first
        identifier on the node's first line, and finally
        ``"<type>_<start_byte>"`` when nothing else matches.
    """
    import re

    identifier = r'([a-zA-Z_$][a-zA-Z0-9_$]*)'

    # start_byte/end_byte are offsets into the UTF-8 encoded source, so the
    # slice has to be taken on bytes. Slicing the str directly drifts as soon
    # as a multi-byte character appears before the node.
    raw_text = source_code.encode('utf-8')[node.start_byte:node.end_byte]
    node_text = raw_text.decode('utf-8', errors='replace')

    binding_pattern = r'(?:const|let|var)\s+' + identifier + r'\s*='
    declaration_patterns = {
        'function_declaration': r'function\s+' + identifier,
        'variable_declaration': binding_pattern,
        # const/let declarations are `lexical_declaration` nodes in the
        # JavaScript/TypeScript grammars; name them like `var` declarations.
        'lexical_declaration': binding_pattern,
        'interface_declaration': r'interface\s+' + identifier,
        'type_alias_declaration': r'type\s+' + identifier,
        'class_declaration': r'class\s+' + identifier,
    }

    if node.type in declaration_patterns:
        match = re.search(declaration_patterns[node.type], node_text)
        if match:
            return match.group(1)

    elif node.type == 'export_statement':
        # Handle various export patterns; the first keyword present wins.
        for keyword in ('interface', 'const', 'function'):
            if f'export {keyword}' in node_text:
                match = re.search(r'export\s+' + keyword + r'\s+' + identifier, node_text)
                if match:
                    return f"{keyword}_{match.group(1)}"
                # Keyword present but no identifier found: use the fallback.
                break
        else:
            if 'export default' in node_text:
                return "default_export"
            return "export_statement"

    # Fallback: first identifier-like token within the first 50 characters.
    first_line = node_text.split('\n')[0][:50].strip()
    simple_match = re.search(r'\b' + identifier, first_line)
    if simple_match:
        return simple_match.group(1)

    return f"{node.type}_{node.start_byte}"

def is_top_level_semantic_node(node, parent_types: List[str]) -> bool:
    """Decide whether ``node`` should be treated as a top-level semantic unit.

    A node counts as nested when any ancestor type in ``parent_types`` is a
    function declaration or a method definition. Declarations and export
    statements qualify when they are not nested; variable declarations
    additionally need a React-style marker in their preview text.
    """
    nested = (
        'function_declaration' in parent_types
        or 'method_definition' in parent_types
    )
    kind = node.type

    # Plain declarations and export statements: only the nesting matters.
    if kind in ('function_declaration', 'class_declaration',
                'interface_declaration', 'type_alias_declaration'):
        return not nested
    if kind == 'export_statement':
        return not nested

    # Variable declarations: must be un-nested and look like a component/hook.
    if kind == 'variable_declaration' and not nested:
        preview = get_node_text_preview(node)
        markers = ['forwardRef', 'useState', 'useEffect', 'createContext']
        for marker in markers:
            if marker in preview:
                return True

    return False

def get_node_text_preview(node) -> str:
    """Return the node's source text for pattern matching, or "" if unavailable.

    The function has no access to the source string, so it reads the node's
    own ``text`` attribute (py-tree-sitter exposes the node's bytes there;
    it may be ``None``). Returning a constant empty string, as before, made
    every pattern check performed on the preview fail unconditionally.
    """
    text = getattr(node, "text", None)
    if text is None:
        return ""
    if isinstance(text, (bytes, bytearray)):
        # Node text is raw bytes; decode leniently so odd input cannot raise.
        return bytes(text).decode("utf-8", errors="replace")
    return str(text)

def find_semantic_chunks(tree, source_code: str) -> List[Dict[str, Any]]:
    """Find semantic chunks - complete, meaningful top-level code blocks.

    Args:
        tree: Parsed tree exposing ``root_node``.
        source_code: The text that was parsed (its UTF-8 encoding must be what
            the parser received, since node offsets are byte offsets).

    Returns:
        A list of dicts with keys ``node``, ``name``, ``start_byte``,
        ``end_byte``, ``type`` and ``content``, sorted by ``start_byte`` and
        with any entry fully contained in another entry removed.
    """
    # Node offsets are byte offsets into the encoded source. Slice the bytes,
    # not the str, otherwise content drifts after any multi-byte character.
    source_bytes = source_code.encode('utf-8')
    component_markers = ('forwardRef', '= (', 'useState', 'useEffect',
                         'createContext', 'makePrefixer')
    semantic_nodes = []

    def text_of(node) -> str:
        return source_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='replace')

    def record(node, node_text, type_label):
        """Append one semantic unit to the result list."""
        semantic_nodes.append({
            'node': node,
            'name': extract_node_name(node, source_code),
            'start_byte': node.start_byte,
            'end_byte': node.end_byte,
            'type': type_label,
            'content': node_text
        })

    def traverse(node, depth=0):
        # depth 0 is the root, depth 1 its direct children; only those levels
        # are considered "top-level".
        if depth <= 1:
            kind = node.type

            # Functions, interfaces and type aliases are taken whole.
            if kind in ('function_declaration', 'interface_declaration', 'type_alias_declaration'):
                record(node, text_of(node), kind)
                return  # Don't traverse children - we want the whole unit

            # Variable declarations only when they look like a component/hook.
            # NOTE(review): in the JS/TS grammars `const`/`let` appear to be
            # `lexical_declaration` nodes, so this branch likely only sees
            # `var` - confirm before widening the check.
            if kind == 'variable_declaration':
                node_text = text_of(node)
                if any(marker in node_text for marker in component_markers):
                    record(node, node_text, 'component_or_hook')
                    return

            # Export statements only when substantial (not just a re-export).
            elif kind == 'export_statement':
                node_text = text_of(node)
                if len(node_text.strip()) > 50:
                    record(node, node_text, kind)
                    return

        # Continue traversing children for other nodes
        for child in node.children:
            traverse(child, depth + 1)

    traverse(tree.root_node)

    # Sort by start position, then drop nested/overlapping entries.
    semantic_nodes.sort(key=lambda info: info['start_byte'])

    filtered_nodes = []
    for info in semantic_nodes:
        start, end = info['start_byte'], info['end_byte']

        # Skip this entry if an already-kept entry fully covers it.
        if any(kept['start_byte'] <= start and kept['end_byte'] >= end
               for kept in filtered_nodes):
            continue

        # Otherwise evict kept entries that this one fully covers.
        filtered_nodes = [
            kept for kept in filtered_nodes
            if not (start <= kept['start_byte'] and end >= kept['end_byte'])
        ]
        filtered_nodes.append(info)

    return filtered_nodes

def create_semantic_chunks(semantic_nodes: List[Dict[str, Any]]) -> List[Chunk]:
    """Turn each semantic-node record into a depth-0 ``Chunk``, keeping order."""
    return [
        Chunk(
            info['start_byte'],
            info['end_byte'],
            info['content'],
            info['type'],
            info['name'],
            0,
        )
        for info in semantic_nodes
    ]

def sub_chunk_by_statements(chunk: Chunk, tree, source_code: str, depth: int = 0) -> List[Chunk]:
    """Split an oversized chunk into consecutive line groups.

    Lines are accumulated until adding the next line would push the summed
    per-line token count over ``MAX_CHUNK_TOKENS``; each group becomes one
    sub-chunk. Whitespace-only groups are dropped.

    Args:
        chunk: The chunk to split.
        tree: Unused; kept so existing callers keep working.
        source_code: Unused; kept so existing callers keep working.
        depth: Current nesting depth; sub-chunks get ``depth + 1``.

    Returns:
        ``[chunk]`` when the chunk is small enough, the depth limit is reached
        or splitting produced at most one piece; otherwise the sub-chunks
        (also stored on ``chunk.sub_chunks``).
    """
    if depth >= MAX_RECURSION_DEPTH or chunk.token_count <= MAX_CHUNK_TOKENS:
        return [chunk]

    print(f"    Breaking down {chunk.name} ({chunk.token_count} tokens) into smaller pieces...")

    sub_chunks = []
    # Byte position (in the file) where the next line group begins. Tracking
    # it gives every sub-chunk its own span instead of all of them starting
    # at the parent's start_byte.
    next_start = chunk.start_byte

    def emit(group_lines):
        nonlocal next_start
        sub_content = '\n'.join(group_lines)
        start = next_start
        # Offsets are byte-based, so measure the encoded length.
        end = start + len(sub_content.encode('utf-8'))
        # +1 skips the newline that separated this group from the next one.
        next_start = end + 1
        if sub_content.strip():
            sub_chunks.append(Chunk(
                start_byte=start,
                end_byte=end,
                content=sub_content,
                node_type=f"{chunk.node_type}_part",
                name=f"{chunk.name}_part_{len(sub_chunks)+1}",
                depth=depth + 1
            ))

    # Simple line-based splitting for now
    current_lines = []
    current_size = 0

    for line in chunk.content.split('\n'):
        line_tokens = count_tokens(line)

        if current_lines and current_size + line_tokens > MAX_CHUNK_TOKENS:
            emit(current_lines)
            current_lines = [line]
            current_size = line_tokens
        else:
            current_lines.append(line)
            current_size += line_tokens

    # Add remaining lines
    if current_lines:
        emit(current_lines)

    chunk.sub_chunks = sub_chunks
    return sub_chunks if len(sub_chunks) > 1 else [chunk]

def process_tsx_file(file_path: Path) -> List[Chunk]:
    """Read one .tsx file, parse it, and return its final list of chunks.

    Progress is reported on stdout: the semantic units found, the chunks
    created, and any oversized chunks that were split further.
    """
    print(f"\n=== Processing: {file_path.name} ===")

    # Load the file contents as UTF-8 text.
    source_code = file_path.read_text(encoding='utf-8')
    print(f"File size: {len(source_code)} characters")

    # Parse with the Tree-sitter TSX parser; a broken parse is only reported.
    tree = parser.parse(source_code.encode('utf-8'))
    if tree.root_node.has_error:
        print("⚠️ Parse errors detected")

    # Locate the semantic units and list them.
    semantic_nodes = find_semantic_chunks(tree, source_code)
    print(f"Found {len(semantic_nodes)} semantic units")
    for info in semantic_nodes:
        text = info['content']
        preview = text[:100].replace('\n', ' ').strip()
        print(f"  - {info['type']}: {info['name']} ({count_tokens(text)} tokens)")
        print(f"    Preview: {preview}...")

    # Wrap the units in Chunk objects.
    base_chunks = create_semantic_chunks(semantic_nodes)
    print(f"Created {len(base_chunks)} semantic chunks")

    # Split whatever is still above the token limit.
    final_chunks = []
    oversized_count = 0
    for chunk in base_chunks:
        if chunk.token_count <= MAX_CHUNK_TOKENS:
            final_chunks.append(chunk)
            continue
        print(f"  Sub-chunking {chunk.name} ({chunk.token_count} tokens)")
        final_chunks.extend(sub_chunk_by_statements(chunk, tree, source_code))
        oversized_count += 1

    if oversized_count:
        print(f"Sub-chunked {oversized_count} oversized chunks")
    print(f"Final result: {len(final_chunks)} total chunks")

    return final_chunks

def print_chunk_summary(chunks: List[Chunk], file_name: str):
    """Print a numbered, depth-indented overview of ``chunks`` for one file.

    For every chunk the name, type, token count and line count are shown,
    followed by its first three lines (stripped) and a count of the rest.
    """
    print(f"\n--- Semantic Chunk Summary for {file_name} ---")

    for position, chunk in enumerate(chunks, start=1):
        pad = "  " * chunk.depth
        rows = chunk.content.split('\n')
        row_count = len(rows)

        header = (
            f"{pad}{position}. {chunk.name}",
            f"{pad}   Type: {chunk.node_type}",
            f"{pad}   Size: {chunk.token_count} tokens, {row_count} lines",
            f"{pad}   Content preview:",
        )
        for text in header:
            print(text)

        # Show first few lines of actual content
        for row in rows[:3]:
            print(f"{pad}     {row.strip()}")
        hidden = row_count - 3
        if hidden > 0:
            print(f"{pad}     ... ({hidden} more lines)")
        print()

def main():
    """Interactive entry point: chunk every .tsx file under a directory.

    Prompts for a directory (empty input means the current directory),
    processes each ``*.tsx`` file found recursively, prints a per-file chunk
    summary and finally overall statistics plus every chunk's content.
    A failure in one file is reported and does not stop the run.
    """
    print("🚀 Semantic TSX Chunking Test")
    print(f"Max chunk tokens: {MAX_CHUNK_TOKENS}")
    print(f"Max recursion depth: {MAX_RECURSION_DEPTH}")

    # Get directory from user or use current directory
    directory = input("\nEnter directory path (or press Enter for current directory): ").strip()
    if not directory:
        directory = "."

    target_dir = Path(directory)
    # is_dir() rather than exists(): a path to a regular file is not a
    # directory and should get the same message.
    if not target_dir.is_dir():
        print(f"❌ Directory not found: {directory}")
        return

    # Find all .tsx files; sorted so the processing order is reproducible.
    tsx_files = sorted(target_dir.rglob("*.tsx"))

    if not tsx_files:
        print(f"❌ No .tsx files found in {directory}")
        return

    print(f"📁 Found {len(tsx_files)} .tsx files")

    # Process each file
    all_chunks = []
    processed_count = 0
    for file_path in tsx_files:
        # Top-level boundary: report the failure and carry on with other files.
        try:
            chunks = process_tsx_file(file_path)
            all_chunks.extend(chunks)
            print_chunk_summary(chunks, file_path.name)
        except Exception as e:
            print(f"❌ Error processing {file_path}: {e}")
        else:
            processed_count += 1

    # Overall summary; only files that completed without error are counted.
    print("\n🎉 OVERALL SUMMARY")
    print(f"Files processed: {processed_count}")
    print(f"Total chunks created: {len(all_chunks)}")

    if all_chunks:
        token_counts = [c.token_count for c in all_chunks]
        avg_tokens = sum(token_counts) / len(token_counts)

        print(f"Average chunk size: {avg_tokens:.0f} tokens")
        print(f"Largest chunk: {max(token_counts)} tokens")
        print(f"Smallest chunk: {min(token_counts)} tokens")

        # Count by node type
        type_counts = {}
        for chunk in all_chunks:
            type_counts[chunk.node_type] = type_counts.get(chunk.node_type, 0) + 1

        print("\nChunk types:")
        for node_type, count in sorted(type_counts.items()):
            print(f"  {node_type}: {count}")

        print("\nChunk Data:")
        for chunk in all_chunks:
            print(f"\nChunk Name: {chunk.name}")
            print(f"\nChunk Content: {chunk.content}")
            print("\n============================")


if __name__ == "__main__":
    main()

✅ Using TSX parser for React TypeScript files
🚀 Semantic TSX Chunking Test
Max chunk tokens: 1000
Max recursion depth: 3
📁 Found 2 .tsx files

=== Processing: Accordion.tsx ===
File size: 2780 characters
Found 2 semantic units
  - export_statement: interface_AccordionProps (182 tokens)
    Preview: export interface AccordionProps   extends Omit<ComponentPropsWithoutRef<"div">, "onToggle"> {   /**...
  - export_statement: const_Accordion (353 tokens)
    Preview: export const Accordion = forwardRef<HTMLDivElement, AccordionProps>(   function Accordion(props, ref...
Created 2 semantic chunks
Final result: 2 total chunks

--- Semantic Chunk Summary for Accordion.tsx ---
1. interface_AccordionProps
   Type: export_statement
   Size: 182 tokens, 31 lines
   Content preview:
     export interface AccordionProps
     extends Omit<ComponentPropsWithoutRef<"div">, "onToggle"> {
     /**
     ... (28 more lines)

2. const_Accordion
   Type: export_statement
   Size: 353 tokens, 69 lines
   Conte

The minimal changes I made:

Added SUPPORTED_EXTENSIONS list with all 4 file types
Added get_syntax_highlighting_language() function to map extensions to proper syntax highlighting
Renamed function from process_tsx_file to process_js_ts_file
Updated file discovery to search for all supported extensions using rglob()
Added file type display in processing output
Updated console messages to reflect multi-language support

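The TSX grammar can parse all of these file types (one caveat: in plain .ts files, angle-bracket type assertions such as `<T>value` are read as JSX, so files using them parse more accurately with the dedicated `typescript` grammar):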

.tsx - Full TypeScript + JSX support
.ts - TypeScript (just ignores JSX parts)
.js - JavaScript (ignores TypeScript-specific syntax)
.jsx - JavaScript + JSX

No other changes needed - the semantic chunking logic works the same across all these file types!

In [9]:
#!/usr/bin/env python3

import os
from pathlib import Path
from typing import List, Dict, Any
import tiktoken

# Tree-sitter setup for TypeScript/JavaScript files
try:
    from tree_sitter_language_pack import get_language, get_parser
except ImportError:
    # The module imported above ships in the `tree-sitter-language-pack`
    # distribution; the hint must name that package, not a different one.
    print("Please install: pip install tree-sitter-language-pack")
    # SystemExit instead of exit(): `exit` is injected by the `site` module
    # and is not guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(1)
else:
    tsx_language = get_language('tsx')
    parser = get_parser('tsx')
    print("✅ Using TSX parser for TypeScript/JavaScript files")

# Configuration
MAX_CHUNK_TOKENS = 1000  # chunks above this token count are split further
MAX_RECURSION_DEPTH = 3  # depth guard checked by sub_chunk_by_statements

# Supported file extensions
SUPPORTED_EXTENSIONS = ['.tsx', '.ts', '.js', '.jsx']

# Initialize token encoder
encoder = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    """Return the number of tokens the module-level ``encoder`` produces for ``text``.

    Source files may legitimately contain strings that look like tokenizer
    special tokens (for example ``<|endoftext|>``). With tiktoken's default
    settings such text raises ``ValueError``; passing ``disallowed_special=()``
    makes the encoder treat it as ordinary text so counting never fails on
    file content.
    """
    return len(encoder.encode(text, disallowed_special=()))

def get_syntax_highlighting_language(file_extension: str) -> str:
    """Map a file extension to the language tag used for markdown code fences.

    Unknown extensions fall back to ``'javascript'``.
    """
    known_tags = dict([
        ('.tsx', 'tsx'),
        ('.ts', 'typescript'),
        ('.js', 'javascript'),
        ('.jsx', 'jsx'),
    ])
    try:
        return known_tags[file_extension]
    except KeyError:
        return 'javascript'

class Chunk:
    """A contiguous piece of source text together with its bookkeeping data.

    The token count is computed once, at construction time, from ``content``.
    ``sub_chunks`` starts out empty and is filled in by the sub-chunking step.
    """

    def __init__(self, start_byte: int, end_byte: int, content: str, node_type: str, name: str, depth: int = 0):
        # What the chunk is.
        self.name, self.node_type, self.depth = name, node_type, depth
        # Where the chunk sits in the file.
        self.start_byte, self.end_byte = start_byte, end_byte
        # The text itself and its size in tokens.
        self.content = content
        self.token_count = count_tokens(content)
        # Smaller pieces of this chunk, if it was split.
        self.sub_chunks = []

def extract_node_name(node, source_code: str) -> str:
    """Extract a meaningful name for an AST node.

    Args:
        node: Object exposing ``type``, ``start_byte`` and ``end_byte``
            (a Tree-sitter node).
        source_code: Full text of the file the node was parsed from.

    Returns:
        The declared identifier for known declaration kinds, a prefixed name
        (``interface_X``, ``const_X``, ``function_X``), ``"default_export"``
        or ``"export_statement"`` for export statements, otherwise the first
        identifier on the node's first line, and finally
        ``"<type>_<start_byte>"`` when nothing else matches.
    """
    import re

    identifier = r'([a-zA-Z_$][a-zA-Z0-9_$]*)'

    # start_byte/end_byte are offsets into the UTF-8 encoded source, so the
    # slice has to be taken on bytes. Slicing the str directly drifts as soon
    # as a multi-byte character appears before the node.
    raw_text = source_code.encode('utf-8')[node.start_byte:node.end_byte]
    node_text = raw_text.decode('utf-8', errors='replace')

    binding_pattern = r'(?:const|let|var)\s+' + identifier + r'\s*='
    declaration_patterns = {
        'function_declaration': r'function\s+' + identifier,
        'variable_declaration': binding_pattern,
        # const/let declarations are `lexical_declaration` nodes in the
        # JavaScript/TypeScript grammars; name them like `var` declarations.
        'lexical_declaration': binding_pattern,
        'interface_declaration': r'interface\s+' + identifier,
        'type_alias_declaration': r'type\s+' + identifier,
        'class_declaration': r'class\s+' + identifier,
    }

    if node.type in declaration_patterns:
        match = re.search(declaration_patterns[node.type], node_text)
        if match:
            return match.group(1)

    elif node.type == 'export_statement':
        # Handle various export patterns; the first keyword present wins.
        for keyword in ('interface', 'const', 'function'):
            if f'export {keyword}' in node_text:
                match = re.search(r'export\s+' + keyword + r'\s+' + identifier, node_text)
                if match:
                    return f"{keyword}_{match.group(1)}"
                # Keyword present but no identifier found: use the fallback.
                break
        else:
            if 'export default' in node_text:
                return "default_export"
            return "export_statement"

    # Fallback: first identifier-like token within the first 50 characters.
    first_line = node_text.split('\n')[0][:50].strip()
    simple_match = re.search(r'\b' + identifier, first_line)
    if simple_match:
        return simple_match.group(1)

    return f"{node.type}_{node.start_byte}"

def find_semantic_chunks(tree, source_code: str) -> List[Dict[str, Any]]:
    """Find semantic chunks - complete, meaningful top-level code blocks.

    Args:
        tree: Parsed tree exposing ``root_node``.
        source_code: The text that was parsed (its UTF-8 encoding must be what
            the parser received, since node offsets are byte offsets).

    Returns:
        A list of dicts with keys ``node``, ``name``, ``start_byte``,
        ``end_byte``, ``type`` and ``content``, sorted by ``start_byte`` and
        with any entry fully contained in another entry removed.
    """
    # Node offsets are byte offsets into the encoded source. Slice the bytes,
    # not the str, otherwise content drifts after any multi-byte character.
    source_bytes = source_code.encode('utf-8')
    component_markers = ('forwardRef', '= (', 'useState', 'useEffect',
                         'createContext', 'makePrefixer')
    semantic_nodes = []

    def text_of(node) -> str:
        return source_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='replace')

    def record(node, node_text, type_label):
        """Append one semantic unit to the result list."""
        semantic_nodes.append({
            'node': node,
            'name': extract_node_name(node, source_code),
            'start_byte': node.start_byte,
            'end_byte': node.end_byte,
            'type': type_label,
            'content': node_text
        })

    def traverse(node, depth=0):
        # depth 0 is the root, depth 1 its direct children; only those levels
        # are considered "top-level".
        if depth <= 1:
            kind = node.type

            # Functions, interfaces and type aliases (TypeScript specific) are
            # taken whole.
            if kind in ('function_declaration', 'interface_declaration', 'type_alias_declaration'):
                record(node, text_of(node), kind)
                return  # Don't traverse children - we want the whole unit

            # Variable declarations only when they look like a component/hook.
            # NOTE(review): in the JS/TS grammars `const`/`let` appear to be
            # `lexical_declaration` nodes, so this branch likely only sees
            # `var` - confirm before widening the check.
            if kind == 'variable_declaration':
                node_text = text_of(node)
                if any(marker in node_text for marker in component_markers):
                    record(node, node_text, 'component_or_hook')
                    return

            # Export statements only when substantial (not just a re-export).
            elif kind == 'export_statement':
                node_text = text_of(node)
                if len(node_text.strip()) > 50:
                    record(node, node_text, kind)
                    return

        # Continue traversing children for other nodes
        for child in node.children:
            traverse(child, depth + 1)

    traverse(tree.root_node)

    # Sort by start position, then drop nested/overlapping entries.
    semantic_nodes.sort(key=lambda info: info['start_byte'])

    filtered_nodes = []
    for info in semantic_nodes:
        start, end = info['start_byte'], info['end_byte']

        # Skip this entry if an already-kept entry fully covers it.
        if any(kept['start_byte'] <= start and kept['end_byte'] >= end
               for kept in filtered_nodes):
            continue

        # Otherwise evict kept entries that this one fully covers.
        filtered_nodes = [
            kept for kept in filtered_nodes
            if not (start <= kept['start_byte'] and end >= kept['end_byte'])
        ]
        filtered_nodes.append(info)

    return filtered_nodes

def create_semantic_chunks(semantic_nodes: List[Dict[str, Any]]) -> List[Chunk]:
    """Turn each semantic-node record into a depth-0 ``Chunk``, keeping order."""
    return [
        Chunk(
            info['start_byte'],
            info['end_byte'],
            info['content'],
            info['type'],
            info['name'],
            0,
        )
        for info in semantic_nodes
    ]

def sub_chunk_by_statements(chunk: Chunk, tree, source_code: str, depth: int = 0) -> List[Chunk]:
    """Split an oversized chunk into consecutive line groups.

    Lines are accumulated until adding the next line would push the summed
    per-line token count over ``MAX_CHUNK_TOKENS``; each group becomes one
    sub-chunk. Whitespace-only groups are dropped.

    Args:
        chunk: The chunk to split.
        tree: Unused; kept so existing callers keep working.
        source_code: Unused; kept so existing callers keep working.
        depth: Current nesting depth; sub-chunks get ``depth + 1``.

    Returns:
        ``[chunk]`` when the chunk is small enough, the depth limit is reached
        or splitting produced at most one piece; otherwise the sub-chunks
        (also stored on ``chunk.sub_chunks``).
    """
    if depth >= MAX_RECURSION_DEPTH or chunk.token_count <= MAX_CHUNK_TOKENS:
        return [chunk]

    print(f"    Breaking down {chunk.name} ({chunk.token_count} tokens) into smaller pieces...")

    sub_chunks = []
    # Byte position (in the file) where the next line group begins. Tracking
    # it gives every sub-chunk its own span instead of all of them starting
    # at the parent's start_byte.
    next_start = chunk.start_byte

    def emit(group_lines):
        nonlocal next_start
        sub_content = '\n'.join(group_lines)
        start = next_start
        # Offsets are byte-based, so measure the encoded length.
        end = start + len(sub_content.encode('utf-8'))
        # +1 skips the newline that separated this group from the next one.
        next_start = end + 1
        if sub_content.strip():
            sub_chunks.append(Chunk(
                start_byte=start,
                end_byte=end,
                content=sub_content,
                node_type=f"{chunk.node_type}_part",
                name=f"{chunk.name}_part_{len(sub_chunks)+1}",
                depth=depth + 1
            ))

    # Simple line-based splitting for now
    current_lines = []
    current_size = 0

    for line in chunk.content.split('\n'):
        line_tokens = count_tokens(line)

        if current_lines and current_size + line_tokens > MAX_CHUNK_TOKENS:
            emit(current_lines)
            current_lines = [line]
            current_size = line_tokens
        else:
            current_lines.append(line)
            current_size += line_tokens

    # Add remaining lines
    if current_lines:
        emit(current_lines)

    chunk.sub_chunks = sub_chunks
    return sub_chunks if len(sub_chunks) > 1 else [chunk]

def process_js_ts_file(file_path: Path) -> List[Chunk]:
    """Read one JavaScript/TypeScript file, parse it, and return its chunks.

    Progress is reported on stdout: file size and type, the semantic units
    found, the chunks created, and any oversized chunks that were split.
    """
    print(f"\n=== Processing: {file_path.name} ===")

    # Load the file contents as UTF-8 text.
    source_code = file_path.read_text(encoding='utf-8')
    print(f"File size: {len(source_code)} characters")
    print(f"File type: {file_path.suffix}")

    # The same TSX parser is used for every supported extension; a broken
    # parse is only reported.
    tree = parser.parse(source_code.encode('utf-8'))
    if tree.root_node.has_error:
        print("⚠️ Parse errors detected")

    # Locate the semantic units and list them.
    semantic_nodes = find_semantic_chunks(tree, source_code)
    print(f"Found {len(semantic_nodes)} semantic units")
    for info in semantic_nodes:
        text = info['content']
        preview = text[:100].replace('\n', ' ').strip()
        print(f"  - {info['type']}: {info['name']} ({count_tokens(text)} tokens)")
        print(f"    Preview: {preview}...")

    # Wrap the units in Chunk objects.
    base_chunks = create_semantic_chunks(semantic_nodes)
    print(f"Created {len(base_chunks)} semantic chunks")

    # Split whatever is still above the token limit.
    final_chunks = []
    oversized_count = 0
    for chunk in base_chunks:
        if chunk.token_count <= MAX_CHUNK_TOKENS:
            final_chunks.append(chunk)
            continue
        print(f"  Sub-chunking {chunk.name} ({chunk.token_count} tokens)")
        final_chunks.extend(sub_chunk_by_statements(chunk, tree, source_code))
        oversized_count += 1

    if oversized_count:
        print(f"Sub-chunked {oversized_count} oversized chunks")
    print(f"Final result: {len(final_chunks)} total chunks")

    return final_chunks

def print_chunk_summary(chunks: List[Chunk], file_name: str):
    """Print a numbered, depth-indented overview of ``chunks`` for one file.

    For every chunk the name, type, token count and line count are shown,
    followed by its first three lines (stripped) and a count of the rest.
    """
    print(f"\n--- Semantic Chunk Summary for {file_name} ---")

    for position, chunk in enumerate(chunks, start=1):
        pad = "  " * chunk.depth
        rows = chunk.content.split('\n')
        row_count = len(rows)

        header = (
            f"{pad}{position}. {chunk.name}",
            f"{pad}   Type: {chunk.node_type}",
            f"{pad}   Size: {chunk.token_count} tokens, {row_count} lines",
            f"{pad}   Content preview:",
        )
        for text in header:
            print(text)

        # Show first few lines of actual content
        for row in rows[:3]:
            print(f"{pad}     {row.strip()}")
        hidden = row_count - 3
        if hidden > 0:
            print(f"{pad}     ... ({hidden} more lines)")
        print()

def main():
    """Prompt for a directory, chunk every supported file in it, and report.

    Prints the configuration, a per-extension file count, a per-file chunk
    summary, and finally aggregate statistics over all chunks produced.
    A failure in one file is reported and does not stop the run.
    """
    extension_list = ', '.join(SUPPORTED_EXTENSIONS)

    print("🚀 Semantic JavaScript/TypeScript Chunking Test")
    print(f"Max chunk tokens: {MAX_CHUNK_TOKENS}")
    print(f"Max recursion depth: {MAX_RECURSION_DEPTH}")
    print(f"Supported extensions: {extension_list}")

    # An empty answer means "use the current directory".
    directory = input("\nEnter directory path (or press Enter for current directory): ").strip() or "."

    target_dir = Path(directory)
    if not target_dir.exists():
        print(f"❌ Directory not found: {directory}")
        return

    # Collect files extension by extension, searching recursively.
    all_files = [
        found
        for ext in SUPPORTED_EXTENSIONS
        for found in target_dir.rglob(f"*{ext}")
    ]
    if not all_files:
        print(f"❌ No supported files found in {directory}")
        print(f"Looking for: {extension_list}")
        return

    print(f"📁 Found {len(all_files)} supported files:")
    by_ext = {}
    for found in all_files:
        by_ext.setdefault(found.suffix, 0)
        by_ext[found.suffix] += 1
    for ext, count in sorted(by_ext.items()):
        print(f"  {ext}: {count} files")

    # Chunk each file; errors are reported per file and the loop continues.
    all_chunks = []
    for file_path in all_files:
        try:
            chunks = process_js_ts_file(file_path)
            all_chunks.extend(chunks)
            print_chunk_summary(chunks, file_path.name)
        except Exception as e:
            print(f"❌ Error processing {file_path}: {e}")

    print(f"\n🎉 OVERALL SUMMARY")
    print(f"Files processed: {len(all_files)}")
    print(f"Total chunks created: {len(all_chunks)}")

    if not all_chunks:
        return

    sizes = [c.token_count for c in all_chunks]
    avg_tokens = sum(sizes) / len(sizes)
    print(f"Average chunk size: {avg_tokens:.0f} tokens")
    print(f"Largest chunk: {max(sizes)} tokens")
    print(f"Smallest chunk: {min(sizes)} tokens")

    # Tally how many chunks of each node type were produced.
    type_counts = {}
    for chunk in all_chunks:
        type_counts.setdefault(chunk.node_type, 0)
        type_counts[chunk.node_type] += 1

    print(f"\nChunk types:")
    for node_type, count in sorted(type_counts.items()):
        print(f"  {node_type}: {count}")


if __name__ == "__main__":
    main()

✅ Using TSX parser for TypeScript/JavaScript files
🚀 Semantic JavaScript/TypeScript Chunking Test
Max chunk tokens: 1000
Max recursion depth: 3
Supported extensions: .tsx, .ts, .js, .jsx
📁 Found 7 supported files:
  .ts: 2 files
  .tsx: 5 files

=== Processing: AccordionPanel.tsx ===
File size: 1754 characters
File type: .tsx
Found 2 semantic units
  - export_statement: interface_AccordionPanelProps (36 tokens)
    Preview: export interface AccordionPanelProps extends ComponentPropsWithoutRef<"div"> {   /**    * The conten...
  - export_statement: const_AccordionPanel (269 tokens)
    Preview: export const AccordionPanel = forwardRef<HTMLDivElement, AccordionPanelProps>(   function AccordionP...
Created 2 semantic chunks
Final result: 2 total chunks

--- Semantic Chunk Summary for AccordionPanel.tsx ---
1. interface_AccordionPanelProps
   Type: export_statement
   Size: 36 tokens, 6 lines
   Content preview:
     export interface AccordionPanelProps extends ComponentPropsWithoutRef<"di

Minimal changes made:

Added 2 new parsers: css_parser and html_parser, alongside the existing tsx_parser (3 parsers in total)
Added PARSER_MAP: Maps file extensions to appropriate parsers
Extended syntax highlighting: Added CSS, SCSS, HTML to language map
Added CSS/HTML semantic logic:

CSS: rule_set, at_rule, keyframes_statement
HTML: element, script_element, style_element (substantial ones only)


Added extraction functions: extract_css_name() and extract_html_name() for meaningful names
Updated main function: Renamed to "Multi-Language" and added file type breakdown

The semantic boundaries chosen:

CSS/SCSS: CSS rules, @media queries, @keyframes animations
HTML: Major elements, script tags, style tags (ignoring small elements)

Now it handles all 7 file types with appropriate semantic chunking for each!

In [16]:
#!/usr/bin/env python3

import os
from pathlib import Path
from typing import List, Dict, Any
import tiktoken

# Tree-sitter setup for multiple languages
try:
    from tree_sitter_language_pack import get_language, get_parser
except ImportError:
    # The module tree_sitter_language_pack is distributed on PyPI as
    # "tree-sitter-language-pack"; "tree-sitter-languages" is a different
    # package and would not satisfy this import.
    print("Please install: pip install tree-sitter-language-pack")
    # exit() is only injected by the site module; SystemExit always works.
    raise SystemExit(1)

# Only the import is guarded above, so a failure while loading a grammar
# surfaces as its own error instead of a misleading install hint.
tsx_language = get_language('tsx')
tsx_parser = get_parser('tsx')
css_language = get_language('css')
css_parser = get_parser('css')
html_language = get_language('html')
html_parser = get_parser('html')
print("✅ Using TSX, CSS, and HTML parsers")

# Configuration: chunks above MAX_CHUNK_TOKENS are split further, and
# MAX_RECURSION_DEPTH bounds how deep that splitting may go.
MAX_CHUNK_TOKENS = 1000
MAX_RECURSION_DEPTH = 3

# Parser to use for each supported file extension. The TSX parser is used
# for all four JavaScript/TypeScript extensions, and the CSS parser for SCSS.
PARSER_MAP = {
    '.tsx': tsx_parser,
    '.ts': tsx_parser,
    '.js': tsx_parser,
    '.jsx': tsx_parser,
    '.css': css_parser,
    '.scss': css_parser,
    '.html': html_parser,
}

# Supported extensions, in the insertion order of PARSER_MAP.
SUPPORTED_EXTENSIONS = list(PARSER_MAP)

# Initialize token encoder
encoder = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level ``encoder`` produces for *text*.

    Source files can legitimately contain strings that look like tiktoken
    special tokens (for example ``<|endoftext|>`` inside a string literal).
    With tiktoken's default ``disallowed_special="all"`` such input raises
    ``ValueError``; passing an empty collection makes the encoder treat the
    text as ordinary characters, so counting never fails on file content.
    """
    return len(encoder.encode(text, disallowed_special=()))

def get_syntax_highlighting_language(file_extension: str) -> str:
    """Map a file extension to a markdown code-fence language tag.

    The lookup is an exact, case-sensitive match on extensions such as
    ``'.tsx'``; anything unrecognised yields ``'text'``.
    """
    fence_languages = dict([
        ('.tsx', 'tsx'),
        ('.ts', 'typescript'),
        ('.js', 'javascript'),
        ('.jsx', 'jsx'),
        ('.css', 'css'),
        ('.scss', 'scss'),
        ('.html', 'html'),
    ])
    try:
        return fence_languages[file_extension]
    except KeyError:
        return 'text'

class Chunk:
    """One piece of source text produced by the chunker.

    Holds the text itself, the byte range it was taken from, a node type
    and display name, its nesting depth, and a token count computed from
    the content at construction time.
    """

    def __init__(self, start_byte: int, end_byte: int, content: str, node_type: str, name: str, depth: int = 0):
        # Where the chunk comes from.
        self.start_byte: int = start_byte
        self.end_byte: int = end_byte
        # What the chunk contains and how it is labelled.
        self.content: str = content
        self.node_type: str = node_type
        self.name: str = name
        self.depth: int = depth
        # Size is measured once, up front, via count_tokens().
        self.token_count: int = count_tokens(content)
        # Filled in later if the chunk is split into smaller parts.
        self.sub_chunks: List["Chunk"] = []

def extract_node_name(node, source_code: str) -> str:
    """Derive a short, meaningful name for a JS/TS AST node.

    Args:
        node: Object exposing ``type``, ``start_byte`` and ``end_byte``.
        source_code: The full source text the node was parsed from.

    Returns:
        The declared identifier for function / variable / interface / type /
        class declarations; a prefixed name (``interface_X``, ``const_X``,
        ``function_X``), ``"default_export"`` or ``"export_statement"`` for
        export statements; otherwise the first identifier on the node's first
        line, or ``"<type>_<start_byte>"`` when nothing matches.
    """
    import re

    # The node offsets are byte positions in the UTF-8 encoded source (the
    # file is parsed from source_code.encode('utf-8')). Slicing the str with
    # them is only correct for pure-ASCII input, so slice the bytes otherwise.
    if source_code.isascii():
        node_text = source_code[node.start_byte:node.end_byte]
    else:
        node_text = source_code.encode('utf-8')[node.start_byte:node.end_byte].decode(
            'utf-8', errors='replace'
        )

    identifier = r'([a-zA-Z_$][a-zA-Z0-9_$]*)'
    declaration_patterns = {
        'function_declaration': rf'function\s+{identifier}',
        # `var` parses as variable_declaration, `const`/`let` as
        # lexical_declaration; both carry the name in the same position.
        'variable_declaration': rf'(?:const|let|var)\s+{identifier}\s*=',
        'lexical_declaration': rf'(?:const|let|var)\s+{identifier}\s*=',
        'interface_declaration': rf'interface\s+{identifier}',
        'type_alias_declaration': rf'type\s+{identifier}',
        'class_declaration': rf'class\s+{identifier}',
    }

    pattern = declaration_patterns.get(node.type)
    if pattern is not None:
        match = re.search(pattern, node_text)
        if match:
            return match.group(1)

    elif node.type == 'export_statement':
        # The first export form found in the text wins, checked in this order.
        for keyword in ('interface', 'const', 'function'):
            if f'export {keyword}' in node_text:
                match = re.search(rf'export\s+{keyword}\s+{identifier}', node_text)
                if match:
                    return f"{keyword}_{match.group(1)}"
                # Keyword present but no name found: use the generic fallback.
                break
        else:
            if 'export default' in node_text:
                return "default_export"
            return "export_statement"

    # Fallback: first identifier within the first 50 characters of the node.
    first_line = node_text.split('\n')[0][:50].strip()
    simple_match = re.search(r'\b([a-zA-Z_$][a-zA-Z0-9_$]*)', first_line)
    if simple_match:
        return simple_match.group(1)

    return f"{node.type}_{node.start_byte}"

def find_semantic_chunks(tree, source_code: str, file_extension: str) -> List[Dict[str, Any]]:
    """Collect the top-level semantic units of a parsed file.

    Args:
        tree: Parsed tree exposing ``root_node``; nodes expose ``type``,
            ``children``, ``start_byte`` and ``end_byte``.
        source_code: The text that was parsed.
        file_extension: Extension such as ``'.tsx'``; selects the rule set.

    Returns:
        A list of dicts with keys ``node``, ``name``, ``start_byte``,
        ``end_byte``, ``type`` and ``content``, ordered by ``start_byte``
        with nested duplicates removed.

    Rules by file type:
        * CSS/SCSS: ``rule_set``, ``at_rule``, ``keyframes_statement`` no more
          than one level below the root.
        * HTML: ``element``, ``script_element``, ``style_element`` no more
          than two levels below the root and longer than 100 characters.
        * Everything else (JS/TS): imports, function / class / interface /
          type-alias declarations, variable declarations that look like
          components or hooks, and exports longer than 50 characters, all no
          more than one level below the root.
    """
    semantic_nodes = []

    # Node offsets are byte positions in the UTF-8 encoded source, so a str
    # may only be sliced with them directly when it is pure ASCII.
    source_bytes = None if source_code.isascii() else source_code.encode('utf-8')

    def text_of(node) -> str:
        if source_bytes is None:
            return source_code[node.start_byte:node.end_byte]
        return source_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='replace')

    def record(node, name, node_type=None, text=None):
        semantic_nodes.append({
            'node': node, 'name': name, 'start_byte': node.start_byte,
            'end_byte': node.end_byte, 'type': node_type or node.type,
            'content': text_of(node) if text is None else text
        })

    is_css = file_extension in ['.css', '.scss']
    is_html = file_extension == '.html'
    # No rule matches below this depth, so the walk never needs to go deeper.
    max_depth = 2 if is_html else 1

    component_markers = ['forwardRef', '= (', 'useState', 'useEffect', 'createContext', 'makePrefixer']

    def try_record(node, depth) -> bool:
        """Record *node* if it is a semantic boundary; True means 'recorded'."""
        kind = node.type

        if is_css:
            if kind in ['rule_set', 'at_rule', 'keyframes_statement'] and depth <= 1:
                record(node, extract_css_name(node, source_code))
                return True
            return False

        if is_html:
            if kind in ['element', 'script_element', 'style_element'] and depth <= 2:
                text = text_of(node)
                if len(text.strip()) > 100:  # Only substantial elements
                    record(node, extract_html_name(node, source_code), text=text)
                    return True
            return False

        # JavaScript / TypeScript
        if depth > 1:
            return False

        if kind in ['import_statement', 'import_declaration']:
            name = extract_node_name(node, source_code)
            record(node, f"import_{name}", node_type='import_statement')
            return True

        # class_declaration is included so that non-exported top-level
        # classes are captured as well.
        if kind in ['function_declaration', 'class_declaration',
                    'interface_declaration', 'type_alias_declaration']:
            record(node, extract_node_name(node, source_code))
            return True

        # `var` parses as variable_declaration and `const`/`let` as
        # lexical_declaration, so both are checked for component/hook markers.
        if kind in ['variable_declaration', 'lexical_declaration']:
            text = text_of(node)
            if any(marker in text for marker in component_markers):
                record(node, extract_node_name(node, source_code),
                       node_type='component_or_hook', text=text)
                return True
            return False

        if kind == 'export_statement':
            text = text_of(node)
            if len(text.strip()) > 50:
                record(node, extract_node_name(node, source_code), text=text)
                return True

        return False

    def traverse(node, depth=0):
        if try_record(node, depth):
            return  # A recorded node is kept whole; its children are skipped.
        if depth < max_depth:
            for child in node.children:
                traverse(child, depth + 1)

    traverse(tree.root_node)

    # Sort by start position and remove overlaps
    semantic_nodes.sort(key=lambda x: x['start_byte'])

    # Defensive pass: drop any unit contained in another, keeping the outer one.
    filtered_nodes = []
    for candidate in semantic_nodes:
        if any(kept['start_byte'] <= candidate['start_byte'] and
               kept['end_byte'] >= candidate['end_byte']
               for kept in filtered_nodes):
            continue
        filtered_nodes = [
            kept for kept in filtered_nodes
            if not (candidate['start_byte'] <= kept['start_byte'] and
                    candidate['end_byte'] >= kept['end_byte'])
        ]
        filtered_nodes.append(candidate)

    return filtered_nodes

def extract_css_name(node, source_code: str) -> str:
    """Derive a short name for a CSS/SCSS AST node from its first line.

    Args:
        node: Object exposing ``type``, ``start_byte`` and ``end_byte``.
        source_code: The full stylesheet text the node was parsed from.

    Returns:
        ``rule_<selector>`` (selector cut to 30 characters, spaces replaced
        by underscores) for rule sets, ``at_<name>`` for at-rules,
        ``keyframes_<name>`` for keyframes, otherwise ``<type>_<start_byte>``.
    """
    import re

    # The node offsets are byte positions in the UTF-8 encoded source; slice
    # the encoded bytes whenever the text is not pure ASCII.
    if source_code.isascii():
        node_text = source_code[node.start_byte:node.end_byte]
    else:
        node_text = source_code.encode('utf-8')[node.start_byte:node.end_byte].decode(
            'utf-8', errors='replace'
        )
    first_line = node_text.split('\n')[0].strip()

    if node.type == 'rule_set':
        # Everything before the opening brace is the selector.
        match = re.search(r'^([^{]+)', first_line)
        if match:
            selector = match.group(1).strip()
            return f"rule_{selector[:30].replace(' ', '_')}"
    elif node.type == 'at_rule':
        match = re.search(r'@(\w+)', first_line)
        if match:
            return f"at_{match.group(1)}"
    elif node.type == 'keyframes_statement':
        match = re.search(r'@keyframes\s+(\w+)', first_line)
        if match:
            return f"keyframes_{match.group(1)}"

    return f"{node.type}_{node.start_byte}"

def extract_html_name(node, source_code: str) -> str:
    """Derive a short name for an HTML AST node from its first line.

    Args:
        node: Object exposing ``type``, ``start_byte`` and ``end_byte``.
        source_code: The full HTML text the node was parsed from.

    Returns:
        ``tag#id`` when the opening tag has an id, ``tag.firstclass`` when it
        has a non-empty class, the bare tag name otherwise, or
        ``<type>_<start_byte>`` when no opening tag is found on the first line.
    """
    import re

    # The node offsets are byte positions in the UTF-8 encoded source; slice
    # the encoded bytes whenever the text is not pure ASCII.
    if source_code.isascii():
        node_text = source_code[node.start_byte:node.end_byte]
    else:
        node_text = source_code.encode('utf-8')[node.start_byte:node.end_byte].decode(
            'utf-8', errors='replace'
        )
    first_line = node_text.split('\n')[0].strip()

    # Extract tag name and important attributes
    if node.type in ['element', 'script_element', 'style_element']:
        match = re.search(r'<(\w+)(?:\s+[^>]*)?>', first_line)
        if match:
            tag = match.group(1)
            id_match = re.search(r'id=["\']([^"\']+)["\']', first_line)
            if id_match:
                return f"{tag}#{id_match.group(1)}"

            class_match = re.search(r'class=["\']([^"\']+)["\']', first_line)
            # A whitespace-only class attribute yields no class names; treat
            # it like a missing attribute instead of indexing an empty list.
            class_names = class_match.group(1).split() if class_match else []
            if class_names:
                return f"{tag}.{class_names[0]}"
            return tag

    return f"{node.type}_{node.start_byte}"

def create_semantic_chunks(semantic_nodes: List[Dict[str, Any]]) -> List[Chunk]:
    """Turn each semantic-node record into a depth-0 ``Chunk``.

    Every record must provide the keys ``start_byte``, ``end_byte``,
    ``content``, ``type`` and ``name``; output order follows input order.
    """
    return [
        Chunk(
            start_byte=record['start_byte'],
            end_byte=record['end_byte'],
            content=record['content'],
            node_type=record['type'],
            name=record['name'],
            depth=0,
        )
        for record in semantic_nodes
    ]

def group_small_chunks(chunks: List[Chunk], target_tokens: int = 600, file_extension: str = '') -> List[Chunk]:
    """Merge small chunks so the result has reasonably sized pieces.

    For JS/TS extensions: if the whole file fits in ``MAX_CHUNK_TOKENS`` it
    becomes a single ``complete_module`` chunk. Otherwise import statements
    are merged into one ``imports_group`` chunk (when that fits) and placed
    first, followed by the remaining chunks grouped by
    ``group_chunks_by_size``. Any other extension goes straight to
    ``group_chunks_by_size``.
    """
    if not chunks:
        return chunks

    if file_extension not in ['.tsx', '.ts', '.jsx', '.js']:
        # For non-JS files, use original logic
        return group_chunks_by_size(chunks, target_tokens, file_extension)

    # Partition into imports and everything else, preserving order.
    imports, others = [], []
    for piece in chunks:
        bucket = imports if piece.node_type == 'import_statement' else others
        bucket.append(piece)

    # Small file: one chunk for the whole module.
    if sum(piece.token_count for piece in chunks) <= MAX_CHUNK_TOKENS:
        whole_module = Chunk(
            start_byte=chunks[0].start_byte,
            end_byte=chunks[-1].end_byte,
            content='\n\n'.join(piece.content for piece in chunks),
            node_type='complete_module',
            name=f"complete_module_{len(chunks)}_parts",
            depth=0
        )
        return [whole_module]

    # Larger file: collapse the imports into one chunk when they fit.
    if imports and sum(piece.token_count for piece in imports) <= MAX_CHUNK_TOKENS:
        merged_imports = Chunk(
            start_byte=imports[0].start_byte,
            end_byte=imports[-1].end_byte,
            content='\n'.join(piece.content for piece in imports),
            node_type='imports_group',
            name=f"imports_{len(imports)}_statements",
            depth=0
        )
        imports = [merged_imports]

    return imports + group_chunks_by_size(others, target_tokens, file_extension)

def group_chunks_by_size(chunks: List[Chunk], target_tokens: int = 600, file_extension: str = '') -> List[Chunk]:
    """Pack consecutive chunks into groups of roughly ``target_tokens``.

    If there is only one chunk, or any chunk already exceeds the target, no
    packing is done: the chunks are returned as they are, except that two or
    more chunks whose combined size fits ``MAX_CHUNK_TOKENS`` are merged into
    one. Otherwise chunks are accumulated in order, and a group is closed
    whenever adding the next chunk would exceed the target.
    """
    if not chunks:
        return chunks

    is_css = file_extension in ['.css', '.scss']
    is_script = file_extension in ['.tsx', '.ts', '.jsx', '.js']
    is_html = file_extension == '.html'

    def whole_file_name(count):
        if is_css:
            return f"css_styles_{count}_rules"
        if is_script:
            return f"file_module_{count}_exports"
        if is_html:
            return f"html_content_{count}_elements"
        return f"file_content_{count}_parts"

    def partial_group_name(index, count):
        if is_css:
            return f"css_group_{index}_{count}_rules"
        if is_script:
            return f"code_group_{index}_{count}_parts"
        if is_html:
            return f"html_group_{index}_{count}_elements"
        return f"group_{index}_{count}_parts"

    def merge(members, name):
        # Members are joined with a blank line between them.
        return Chunk(
            start_byte=members[0].start_byte,
            end_byte=members[-1].end_byte,
            content='\n\n'.join(m.content for m in members),
            node_type='grouped_content',
            name=name,
            depth=0
        )

    # Skip grouping if we already have reasonably sized chunks
    if len(chunks) == 1 or any(c.token_count > target_tokens for c in chunks):
        fits_together = sum(c.token_count for c in chunks) <= MAX_CHUNK_TOKENS
        if fits_together and len(chunks) > 1:
            return [merge(chunks, whole_file_name(len(chunks)))]
        return chunks

    grouped_chunks = []
    pending = []
    pending_tokens = 0

    for chunk in chunks:
        would_overflow = pending_tokens + chunk.token_count > target_tokens
        if would_overflow and pending:
            # Close the current group and start a new one with this chunk.
            grouped_chunks.append(
                merge(pending, partial_group_name(len(grouped_chunks) + 1, len(pending)))
            )
            pending = [chunk]
            pending_tokens = chunk.token_count
        else:
            pending.append(chunk)
            pending_tokens += chunk.token_count

    if pending:
        # A lone leftover above half the target is kept unwrapped.
        if len(pending) == 1 and pending_tokens > target_tokens // 2:
            grouped_chunks.append(pending[0])
        else:
            grouped_chunks.append(
                merge(pending, partial_group_name(len(grouped_chunks) + 1, len(pending)))
            )

    return grouped_chunks

def sub_chunk_by_statements(chunk: Chunk, tree, source_code: str, depth: int = 0) -> List[Chunk]:
    """Split an oversized chunk into line-based parts under the token limit.

    Args:
        chunk: The chunk to split.
        tree: Unused; kept so existing callers keep working.
        source_code: Unused; kept so existing callers keep working.
        depth: Current nesting depth; parts are created at ``depth + 1``.

    Returns:
        ``[chunk]`` unchanged when it already fits, when ``depth`` has reached
        ``MAX_RECURSION_DEPTH``, or when splitting yields fewer than two parts;
        otherwise the list of parts. The parts are also stored on
        ``chunk.sub_chunks``.

    Note:
        Lines are never split, so a single line above ``MAX_CHUNK_TOKENS``
        still produces an oversized part.
    """
    if depth >= MAX_RECURSION_DEPTH or chunk.token_count <= MAX_CHUNK_TOKENS:
        return [chunk]

    print(f"    Breaking down {chunk.name} ({chunk.token_count} tokens) into smaller pieces...")

    # Greedily pack whole lines into pieces that stay within the limit.
    pieces = []
    current_lines = []
    current_size = 0
    for line in chunk.content.split('\n'):
        line_tokens = count_tokens(line)
        if current_size + line_tokens > MAX_CHUNK_TOKENS and current_lines:
            pieces.append('\n'.join(current_lines))
            current_lines = [line]
            current_size = line_tokens
        else:
            current_lines.append(line)
            current_size += line_tokens
    if current_lines:
        pieces.append('\n'.join(current_lines))

    # Give every part its own byte range by advancing a running offset with
    # the UTF-8 size of each piece. The ranges are exact when chunk.content
    # is the verbatim source span, and approximate for merged chunks whose
    # content was re-joined with different separators.
    sub_chunks = []
    offset = chunk.start_byte
    last_index = len(pieces) - 1
    for index, sub_content in enumerate(pieces):
        piece_size = len(sub_content.encode('utf-8'))
        if sub_content.strip():
            part_start = min(offset, chunk.end_byte)
            if index == last_index:
                part_end = chunk.end_byte
            else:
                part_end = min(offset + piece_size, chunk.end_byte)
            sub_chunks.append(Chunk(
                start_byte=part_start,
                end_byte=part_end,
                content=sub_content,
                node_type=f"{chunk.node_type}_part",
                name=f"{chunk.name}_part_{len(sub_chunks)+1}",
                depth=depth + 1
            ))
        # +1 accounts for the newline that separated this piece from the next.
        offset += piece_size + 1

    chunk.sub_chunks = sub_chunks
    return sub_chunks if len(sub_chunks) > 1 else [chunk]

def process_file(file_path: Path) -> List[Chunk]:
    """Read, parse and chunk one file, logging progress to stdout.

    The parser is chosen from ``PARSER_MAP`` by file suffix; an unknown
    suffix yields an empty list. Semantic units are turned into chunks,
    small chunks are grouped, and any chunk still above
    ``MAX_CHUNK_TOKENS`` is replaced by its sub-chunks.
    """
    suffix = file_path.suffix
    print(f"\n=== Processing: {file_path.name} ===")

    source_code = file_path.read_text(encoding='utf-8')

    print(f"File size: {len(source_code)} characters")
    print(f"File type: {suffix}")

    parser = PARSER_MAP.get(suffix)
    if not parser:
        print(f"❌ No parser available for {suffix}")
        return []

    # The parser works on bytes, so the text is encoded first.
    tree = parser.parse(source_code.encode('utf-8'))
    if tree.root_node.has_error:
        print("⚠️ Parse errors detected")

    semantic_nodes = find_semantic_chunks(tree, source_code, suffix)
    print(f"Found {len(semantic_nodes)} semantic units")

    # List every unit with its size and a one-line preview.
    for unit in semantic_nodes:
        text = unit['content']
        preview = text[:100].replace('\n', ' ').strip()
        print(f"  - {unit['type']}: {unit['name']} ({count_tokens(text)} tokens)")
        print(f"    Preview: {preview}...")

    base_chunks = group_small_chunks(
        create_semantic_chunks(semantic_nodes),
        target_tokens=600,
        file_extension=suffix,
    )
    print(f"Created {len(base_chunks)} semantic chunks")

    # Replace every oversized chunk by its sub-chunks.
    final_chunks = []
    oversized_count = 0
    for chunk in base_chunks:
        if chunk.token_count <= MAX_CHUNK_TOKENS:
            final_chunks.append(chunk)
            continue
        print(f"  Sub-chunking {chunk.name} ({chunk.token_count} tokens)")
        final_chunks.extend(sub_chunk_by_statements(chunk, tree, source_code))
        oversized_count += 1

    if oversized_count > 0:
        print(f"Sub-chunked {oversized_count} oversized chunks")
    print(f"Final result: {len(final_chunks)} total chunks")

    return final_chunks

def print_chunk_summary(chunks: List[Chunk], file_name: str):
    """Print a numbered overview of the chunks produced for one file.

    For each chunk: name, node type, token count, line count and up to three
    stripped preview lines, followed by a count of the lines not shown. The
    whole entry is indented by two spaces per ``chunk.depth`` level.
    """
    print(f"\n--- Semantic Chunk Summary for {file_name} ---")

    for number, chunk in enumerate(chunks, start=1):
        margin = chunk.depth * "  "
        content_rows = chunk.content.split('\n')
        shown_rows = content_rows[:3]
        remaining = len(content_rows) - len(shown_rows)

        print(f"{margin}{number}. {chunk.name}")
        print(f"{margin}   Type: {chunk.node_type}")
        print(f"{margin}   Size: {chunk.token_count} tokens, {len(content_rows)} lines")
        print(f"{margin}   Content preview:")

        for row in shown_rows:
            print(f"{margin}     {row.strip()}")
        if remaining > 0:
            print(f"{margin}     ... ({remaining} more lines)")
        print()

def main():
    """Prompt for a directory, chunk every supported file in it, and report.

    Prints the configuration, file counts per extension and per directory,
    a chunk summary for each file, and aggregate statistics. A failure in
    one file is reported and counted, and does not stop the run.
    """
    print("🚀 Multi-Language Semantic Chunking Test")
    print(f"Max chunk tokens: {MAX_CHUNK_TOKENS}")
    print(f"Max recursion depth: {MAX_RECURSION_DEPTH}")
    print(f"Supported extensions: {', '.join(SUPPORTED_EXTENSIONS)}")

    # Get directory from user or use current directory
    directory = input("\nEnter directory path (or press Enter for current directory): ").strip()
    if not directory:
        directory = "."

    target_dir = Path(directory)
    # A path to a regular file is not a usable search root either.
    if not target_dir.is_dir():
        print(f"❌ Directory not found: {directory}")
        return

    # Find all supported files (directories that merely end in a supported
    # extension are skipped).
    all_files = []
    for ext in SUPPORTED_EXTENSIONS:
        all_files.extend(p for p in target_dir.rglob(f"*{ext}") if p.is_file())

    if not all_files:
        print(f"❌ No supported files found in {directory}")
        print(f"Looking for: {', '.join(SUPPORTED_EXTENSIONS)}")
        return

    print(f"📁 Found {len(all_files)} supported files:")
    by_ext = {}
    by_dir = {}
    for f in all_files:
        by_ext[f.suffix] = by_ext.get(f.suffix, 0) + 1
        # Keyed by the relative parent Path; Path('.') is the top level, so a
        # real sub-directory can never be mistaken for it.
        parent = f.relative_to(target_dir).parent
        by_dir[parent] = by_dir.get(parent, 0) + 1

    for ext, count in sorted(by_ext.items()):
        print(f"  {ext}: {count} files")

    # Show directory structure if there are subdirectories
    top_level = Path('.')
    if len(by_dir) > 1 or top_level not in by_dir:
        print(f"\n📂 Directory distribution:")
        for parent, count in sorted(by_dir.items()):
            if parent == top_level:
                print(f"  ./: {count} files")
            else:
                # Indent one extra level per path component below the first.
                indent = "  " + "  " * (len(parent.parts) - 1)
                print(f"{indent}{parent.as_posix()}/: {count} files")

    # Process each file, recording real per-extension chunk counts.
    all_chunks = []
    chunks_by_ext = {}
    failed_files = 0
    for file_path in all_files:
        try:
            chunks = process_file(file_path)
            all_chunks.extend(chunks)
            chunks_by_ext[file_path.suffix] = chunks_by_ext.get(file_path.suffix, 0) + len(chunks)
            print_chunk_summary(chunks, file_path.name)
        except Exception as e:
            failed_files += 1
            print(f"❌ Error processing {file_path}: {e}")

    # Overall summary
    print(f"\n🎉 OVERALL SUMMARY")
    print(f"Files processed: {len(all_files) - failed_files}")
    if failed_files:
        print(f"Files failed: {failed_files}")
    print(f"Total chunks created: {len(all_chunks)}")

    if all_chunks:
        sizes = [c.token_count for c in all_chunks]
        avg_tokens = sum(sizes) / len(sizes)

        print(f"Average chunk size: {avg_tokens:.0f} tokens")
        print(f"Largest chunk: {max(sizes)} tokens")
        print(f"Smallest chunk: {min(sizes)} tokens")

        # Count by node type
        type_counts = {}
        for chunk in all_chunks:
            type_counts[chunk.node_type] = type_counts.get(chunk.node_type, 0) + 1

        print(f"\nChunk types:")
        for node_type, count in sorted(type_counts.items()):
            print(f"  {node_type}: {count}")

        # Actual number of chunks produced from files of each extension.
        print(f"\nChunks by file type:")
        for ext in sorted(by_ext):
            print(f"  {ext}: {chunks_by_ext.get(ext, 0)} chunks")


if __name__ == "__main__":
    main()

✅ Using TSX, CSS, and HTML parsers
🚀 Multi-Language Semantic Chunking Test
Max chunk tokens: 1000
Max recursion depth: 3
Supported extensions: .tsx, .ts, .js, .jsx, .css, .scss, .html


📁 Found 358 supported files:
  .css: 93 files
  .ts: 129 files
  .tsx: 136 files

📂 Directory distribution:
  src/: 3 files
    src/accordion/: 10 files
    src/aria-announcer/: 5 files
    src/avatar/: 4 files
    src/badge/: 3 files
    src/banner/: 7 files
    src/border-item/: 3 files
    src/border-layout/: 2 files
    src/breakpoints/: 3 files
    src/button/: 4 files
    src/card/: 3 files
    src/checkbox/: 7 files
      src/checkbox/internal/: 2 files
    src/combo-box/: 4 files
    src/dialog/: 12 files
    src/divider/: 3 files
    src/drawer/: 5 files
    src/dropdown/: 3 files
    src/file-drop-zone/: 5 files
      src/file-drop-zone/internal/: 1 files
    src/flex-item/: 3 files
    src/flex-layout/: 4 files
    src/flow-layout/: 2 files
    src/form-field/: 7 files
    src/form-field-context/: 3 files
    src/grid-item/: 3 files
    src/grid-layout/: 3 files
    src/input/: 3 files
    src/interactable-card/: 7 files
    src/link/: 4 files
    src/link-card/: 3 files
   

Minimal changes made:

Added file saving functions:

generate_unique_id(): Creates 6-character random IDs
create_chunk_filename(): Formats filenames like Accordion.tsx_chunk_001_a1s2d3.md
create_chunk_markdown(): Creates markdown with YAML frontmatter and code blocks
save_chunks_to_files(): Saves chunks maintaining directory structure


Enhanced YAML frontmatter with:

source_file: Relative path from input directory
chunk_index: Chunk number (1, 2, 3...)
chunk_type: Type of chunk (function, import, etc.)
chunk_name: Meaningful name extracted
token_count: Number of tokens


Directory structure preservation:

Creates {input_dir}_chunks parallel to input directory
Maintains same subdirectory structure
Uses relative paths from input directory


Proper markdown formatting:

Code blocks with correct language syntax highlighting
YAML frontmatter with --- delimiters



Now if you have project/src/components/Button.tsx, it will create:

project_chunks/src/components/Button.tsx_chunk_001_a1s2d3.md
With proper TSX syntax highlighting and metadata!

In [21]:
#!/usr/bin/env python3

import os
import secrets
import string
from pathlib import Path
from typing import List, Dict, Any
import tiktoken

# Tree-sitter setup for multiple languages.
# Languages and parsers come from the third-party module `tree_sitter_language_pack`;
# one language object and one parser are created each for TSX, CSS and HTML.
try:
    from tree_sitter_language_pack import get_language, get_parser
    tsx_language = get_language('tsx')
    tsx_parser = get_parser('tsx')
    css_language = get_language('css')
    css_parser = get_parser('css')
    html_language = get_language('html')
    html_parser = get_parser('html')
    print("✅ Using TSX, CSS, and HTML parsers")
except ImportError:
    # Point the user at the distribution that provides the module imported above
    # (the previous hint named a different package).
    print("Please install: pip install tree-sitter-language-pack")
    # Raise SystemExit directly: the exit() helper is injected by the `site` module for
    # interactive use and is not guaranteed to exist in every interpreter.
    raise SystemExit(1)

# Size limits used by the chunking pipeline
MAX_RECURSION_DEPTH = 3
MAX_CHUNK_TOKENS = 1000

# Parser used for each supported file extension.
# All JavaScript/TypeScript flavours share the TSX parser; SCSS shares the CSS parser.
PARSER_MAP = {
    '.tsx': tsx_parser,
    '.ts': tsx_parser,
    '.js': tsx_parser,
    '.jsx': tsx_parser,
    '.css': css_parser,
    '.scss': css_parser,
    '.html': html_parser,
}

# The supported extensions are exactly the keys of PARSER_MAP, in declaration order
SUPPORTED_EXTENSIONS = list(PARSER_MAP)

# Tokenizer used for all token counting
encoder = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    """Return the number of tokens the module-level ``encoder`` produces for ``text``.

    Source files can contain text that looks like a tokenizer special token
    (for example ``<|endoftext|>`` inside a string literal or comment).  With its
    default arguments ``encode`` raises ``ValueError`` on such input, so
    ``disallowed_special=()`` is passed to have that text tokenized as ordinary
    characters.  Counts for text without special-token literals are unchanged.
    """
    return len(encoder.encode(text, disallowed_special=()))

def get_syntax_highlighting_language(file_extension: str) -> str:
    """Map a file extension (with leading dot) to a markdown code-fence language.

    Unknown extensions map to ``'text'``.
    """
    # For these extensions the fence label is simply the extension without its dot.
    if file_extension in ('.tsx', '.jsx', '.css', '.scss', '.html'):
        return file_extension[1:]
    if file_extension == '.ts':
        return 'typescript'
    if file_extension == '.js':
        return 'javascript'
    return 'text'

class Chunk:
    """A piece of source text together with the metadata used to describe it.

    ``token_count`` is computed from ``content`` at construction time and
    ``sub_chunks`` starts out empty.
    """

    def __init__(self, start_byte: int, end_byte: int, content: str, node_type: str, name: str, depth: int = 0):
        # What the chunk is
        self.name = name
        self.node_type = node_type
        self.depth = depth
        # Where it was taken from
        self.start_byte, self.end_byte = start_byte, end_byte
        # The text itself and its size in tokens
        self.content = content
        self.token_count = count_tokens(content)
        # Filled in later if the chunk gets split into smaller parts
        self.sub_chunks = []

def extract_node_name(node, source_code: str) -> str:
    """Extract a meaningful name from a JavaScript/TypeScript AST node.

    Args:
        node: Object exposing ``type``, ``start_byte`` and ``end_byte``.
        source_code: Full text of the file the node belongs to.

    Returns:
        The declared identifier for function, variable, interface, type-alias and
        class declarations; a ``interface_``/``const_``/``function_`` prefixed
        identifier, ``"default_export"`` or ``"export_statement"`` for export
        statements; otherwise the first identifier-like word on the node's first
        line, or ``"<type>_<start_byte>"`` when nothing else matches.
    """
    import re

    # start_byte/end_byte are offsets into the UTF-8 encoded source, so slice the
    # encoded bytes. Slicing the str directly drifts after any non-ASCII character.
    node_text = source_code.encode('utf-8')[node.start_byte:node.end_byte].decode('utf-8', errors='replace')
    lines = node_text.split('\n')

    # Try different patterns based on node type
    if node.type == 'function_declaration':
        match = re.search(r'function\s+([a-zA-Z_$][a-zA-Z0-9_$]*)', node_text)
        if match:
            return match.group(1)

    elif node.type in ('variable_declaration', 'lexical_declaration'):
        # `var` parses as variable_declaration, `const`/`let` as lexical_declaration;
        # both are handled so `const Component = forwardRef(...)` yields "Component".
        match = re.search(r'(?:const|let|var)\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\s*=', node_text)
        if match:
            return match.group(1)

    elif node.type == 'interface_declaration':
        match = re.search(r'interface\s+([a-zA-Z_$][a-zA-Z0-9_$]*)', node_text)
        if match:
            return match.group(1)

    elif node.type == 'type_alias_declaration':
        match = re.search(r'type\s+([a-zA-Z_$][a-zA-Z0-9_$]*)', node_text)
        if match:
            return match.group(1)

    elif node.type == 'class_declaration':
        match = re.search(r'class\s+([a-zA-Z_$][a-zA-Z0-9_$]*)', node_text)
        if match:
            return match.group(1)

    elif node.type == 'export_statement':
        # Handle various export patterns; the exported kind is kept as a name prefix
        if 'export interface' in node_text:
            match = re.search(r'export\s+interface\s+([a-zA-Z_$][a-zA-Z0-9_$]*)', node_text)
            if match:
                return f"interface_{match.group(1)}"
        elif 'export const' in node_text:
            match = re.search(r'export\s+const\s+([a-zA-Z_$][a-zA-Z0-9_$]*)', node_text)
            if match:
                return f"const_{match.group(1)}"
        elif 'export function' in node_text:
            match = re.search(r'export\s+function\s+([a-zA-Z_$][a-zA-Z0-9_$]*)', node_text)
            if match:
                return f"function_{match.group(1)}"
        elif 'export default' in node_text:
            return "default_export"
        else:
            return "export_statement"

    # Fallback: first identifier-like word within the first 50 characters of the node
    first_line = lines[0][:50].strip()
    simple_match = re.search(r'\b([a-zA-Z_$][a-zA-Z0-9_$]*)', first_line)
    if simple_match:
        return simple_match.group(1)

    # Last resort: a synthetic name built from the node type and its position
    return f"{node.type}_{node.start_byte}"

def find_semantic_chunks(tree, source_code: str, file_extension: str) -> List[Dict[str, Any]]:
    """Find semantic chunks - complete, meaningful code blocks.

    Walks the syntax tree from ``tree.root_node`` and records the nodes that form
    a unit for the given file type.  A recorded node's children are not visited.

    Args:
        tree: Parsed tree exposing ``root_node``; nodes expose ``type``,
            ``start_byte``, ``end_byte`` and ``children``.
        source_code: Full text of the parsed file.
        file_extension: Extension with leading dot; selects the CSS/SCSS, HTML or
            JavaScript/TypeScript rules.

    Returns:
        Dicts with keys ``node``, ``name``, ``start_byte``, ``end_byte``, ``type``
        and ``content``, sorted by ``start_byte`` with contained entries removed.
    """
    # Node offsets are byte positions in the UTF-8 encoded source. Slice the encoded
    # bytes, not the str, or every node after a non-ASCII character is misaligned.
    source_bytes = source_code.encode('utf-8')
    semantic_nodes = []

    def text_of(node) -> str:
        """Return the source text covered by ``node``."""
        return source_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='replace')

    def record(node, name, chunk_type, node_text):
        """Store one semantic unit."""
        semantic_nodes.append({
            'node': node, 'name': name, 'start_byte': node.start_byte,
            'end_byte': node.end_byte, 'type': chunk_type, 'content': node_text
        })

    def traverse(node, depth=0):
        # depth is the number of ancestors: 0 for the root node, 1 for its children

        # Handle different file types
        if file_extension in ['.css', '.scss']:
            # CSS/SCSS semantic boundaries: rules and at-rules at the top of the sheet
            if node.type in ['rule_set', 'at_rule', 'keyframes_statement'] and depth <= 1:
                record(node, extract_css_name(node, source_code), node.type, text_of(node))
                return

        elif file_extension == '.html':
            # HTML semantic boundaries: elements close to the document root
            if node.type in ['element', 'script_element', 'style_element'] and depth <= 2:
                name = extract_html_name(node, source_code)
                node_text = text_of(node)
                if len(node_text.strip()) > 100:  # Only substantial elements
                    record(node, name, node.type, node_text)
                    return
                # Small elements are not recorded; their children are still visited below

        else:
            # JavaScript/TypeScript semantic boundaries (top-level statements only)
            top_level = depth <= 1

            if node.type in ['import_statement', 'import_declaration'] and top_level:
                name = extract_node_name(node, source_code)
                record(node, f"import_{name}", 'import_statement', text_of(node))
                return

            elif node.type == 'function_declaration' and top_level:
                record(node, extract_node_name(node, source_code), node.type, text_of(node))
                return

            elif node.type in ['interface_declaration', 'type_alias_declaration'] and top_level:
                record(node, extract_node_name(node, source_code), node.type, text_of(node))
                return

            elif node.type in ['variable_declaration', 'lexical_declaration'] and top_level:
                # `const`/`let` statements are lexical_declaration nodes and only `var`
                # is variable_declaration, so both are accepted. Without the former,
                # non-exported `const X = forwardRef(...)` would never be recorded.
                node_text = text_of(node)
                if any(pattern in node_text for pattern in ['forwardRef', '= (' , 'useState', 'useEffect', 'createContext', 'makePrefixer']):
                    record(node, extract_node_name(node, source_code), 'component_or_hook', node_text)
                    return

            elif node.type == 'export_statement' and top_level:
                name = extract_node_name(node, source_code)
                node_text = text_of(node)
                if len(node_text.strip()) > 50:
                    record(node, name, node.type, node_text)
                    return

        # Continue traversing children for other nodes
        for child in node.children:
            traverse(child, depth + 1)

    traverse(tree.root_node)

    # Sort by start position and remove overlaps
    semantic_nodes.sort(key=lambda x: x['start_byte'])

    # Remove nested/overlapping nodes: an entry contained in an already kept entry is
    # skipped, and kept entries contained in a new entry are replaced by it.
    filtered_nodes = []
    for node in semantic_nodes:
        is_contained = False
        for existing in filtered_nodes:
            if (existing['start_byte'] <= node['start_byte'] and
                existing['end_byte'] >= node['end_byte']):
                is_contained = True
                break

        if not is_contained:
            to_remove = []
            for i, existing in enumerate(filtered_nodes):
                if (node['start_byte'] <= existing['start_byte'] and
                    node['end_byte'] >= existing['end_byte']):
                    to_remove.append(i)

            # Pop from the back so earlier indices stay valid
            for i in reversed(to_remove):
                filtered_nodes.pop(i)

            filtered_nodes.append(node)

    return filtered_nodes

def extract_css_name(node, source_code: str) -> str:
    """Extract a meaningful name from a CSS/SCSS node.

    Args:
        node: Object exposing ``type``, ``start_byte`` and ``end_byte``.
        source_code: Full text of the stylesheet the node belongs to.

    Returns:
        ``rule_<selector>`` for rule sets (selector cut to 30 characters, spaces
        replaced by underscores), ``at_<name>`` for at-rules,
        ``keyframes_<name>`` for keyframes, otherwise ``"<type>_<start_byte>"``.
    """
    import re

    # start_byte/end_byte are offsets into the UTF-8 encoded source, so slice the
    # encoded bytes. Slicing the str directly drifts after any non-ASCII character.
    node_text = source_code.encode('utf-8')[node.start_byte:node.end_byte].decode('utf-8', errors='replace')
    # Only the first line of the node is inspected
    first_line = node_text.split('\n')[0].strip()

    if node.type == 'rule_set':
        # Extract selector: everything before the opening brace
        match = re.search(r'^([^{]+)', first_line)
        if match:
            selector = match.group(1).strip()
            return f"rule_{selector[:30].replace(' ', '_')}"
    elif node.type == 'at_rule':
        # Extract @rule name
        match = re.search(r'@(\w+)', first_line)
        if match:
            return f"at_{match.group(1)}"
    elif node.type == 'keyframes_statement':
        match = re.search(r'@keyframes\s+(\w+)', first_line)
        if match:
            return f"keyframes_{match.group(1)}"

    # Nothing recognisable: fall back to a synthetic name
    return f"{node.type}_{node.start_byte}"

def extract_html_name(node, source_code: str) -> str:
    """Extract a meaningful name from an HTML node.

    Args:
        node: Object exposing ``type``, ``start_byte`` and ``end_byte``.
        source_code: Full text of the document the node belongs to.

    Returns:
        ``tag#id`` when the opening tag on the node's first line has an id,
        ``tag.firstClass`` when it only has a class, the bare tag name otherwise,
        or ``"<type>_<start_byte>"`` when no opening tag is found.
    """
    import re

    # start_byte/end_byte are offsets into the UTF-8 encoded source, so slice the
    # encoded bytes. Slicing the str directly drifts after any non-ASCII character.
    node_text = source_code.encode('utf-8')[node.start_byte:node.end_byte].decode('utf-8', errors='replace')
    # Only the first line of the node is inspected
    first_line = node_text.split('\n')[0].strip()

    # Extract tag name and important attributes
    if node.type in ['element', 'script_element', 'style_element']:
        match = re.search(r'<(\w+)(?:\s+[^>]*)?>', first_line)
        if match:
            tag = match.group(1)
            # Look for id or class; an id takes precedence over a class
            id_match = re.search(r'id=["\']([^"\']+)["\']', first_line)
            class_match = re.search(r'class=["\']([^"\']+)["\']', first_line)

            if id_match:
                return f"{tag}#{id_match.group(1)}"
            elif class_match:
                # Use only the first of possibly several class names
                return f"{tag}.{class_match.group(1).split()[0]}"
            else:
                return tag

    # Nothing recognisable: fall back to a synthetic name
    return f"{node.type}_{node.start_byte}"

def create_semantic_chunks(semantic_nodes: List[Dict[str, Any]]) -> List[Chunk]:
    """Turn each semantic-node dict into a depth-0 ``Chunk``, preserving order."""
    return [
        Chunk(
            start_byte=info['start_byte'],
            end_byte=info['end_byte'],
            content=info['content'],
            node_type=info['type'],
            name=info['name'],
            depth=0,
        )
        for info in semantic_nodes
    ]

def group_small_chunks(chunks: List[Chunk], target_tokens: int = 600, file_extension: str = '') -> List[Chunk]:
    """Merge small chunks so that the resulting chunks have a reasonable size.

    For JS/TS files: if the token counts of all chunks sum to at most
    MAX_CHUNK_TOKENS, everything becomes one 'complete_module' chunk.  Otherwise
    the import statements are merged into one 'imports_group' chunk (when their
    token counts sum to at most the limit) and the remaining chunks are handed to
    ``group_chunks_by_size``.  Other file types go straight to
    ``group_chunks_by_size``.
    """
    if not chunks:
        return chunks

    # Non-JS/TS files: size-based grouping only
    if file_extension not in ['.tsx', '.ts', '.jsx', '.js']:
        return group_chunks_by_size(chunks, target_tokens, file_extension)

    # Whole file fits in one chunk: emit it as a single module chunk
    if sum(c.token_count for c in chunks) <= MAX_CHUNK_TOKENS:
        whole_module = Chunk(
            start_byte=chunks[0].start_byte,
            end_byte=chunks[-1].end_byte,
            content='\n\n'.join(c.content for c in chunks),
            node_type='complete_module',
            name=f"complete_module_{len(chunks)}_parts",
            depth=0
        )
        return [whole_module]

    # Too large for one chunk: split imports from everything else
    imports = []
    remainder = []
    for c in chunks:
        if c.node_type == 'import_statement':
            imports.append(c)
        else:
            remainder.append(c)

    # Collapse the imports into one chunk when they fit; otherwise keep them as they are
    if imports and sum(c.token_count for c in imports) <= MAX_CHUNK_TOKENS:
        merged_imports = Chunk(
            start_byte=imports[0].start_byte,
            end_byte=imports[-1].end_byte,
            content='\n'.join(c.content for c in imports),
            node_type='imports_group',
            name=f"imports_{len(imports)}_statements",
            depth=0
        )
        imports = [merged_imports]

    # Imports first, then the size-grouped rest
    return imports + group_chunks_by_size(remainder, target_tokens, file_extension)

def group_chunks_by_size(chunks: List[Chunk], target_tokens: int = 600, file_extension: str = '') -> List[Chunk]:
    """Pack consecutive chunks into 'grouped_content' chunks of about ``target_tokens``.

    If there is only one chunk, or any chunk already exceeds ``target_tokens``, no
    packing happens: the chunks are merged into a single chunk when there are
    several and their token counts sum to at most MAX_CHUNK_TOKENS, otherwise they
    are returned unchanged.
    """
    if not chunks:
        return chunks

    is_css = file_extension in ['.css', '.scss']
    is_js = file_extension in ['.tsx', '.ts', '.jsx', '.js']
    is_html = file_extension == '.html'

    def merge(members, label):
        # Build one grouped chunk from consecutive members
        return Chunk(
            start_byte=members[0].start_byte,
            end_byte=members[-1].end_byte,
            content='\n\n'.join(m.content for m in members),
            node_type='grouped_content',
            name=label,
            depth=0
        )

    def numbered_label(position, size):
        # Name for the position-th group holding `size` members
        if is_css:
            return f"css_group_{position}_{size}_rules"
        if is_js:
            return f"code_group_{position}_{size}_parts"
        if is_html:
            return f"html_group_{position}_{size}_elements"
        return f"group_{position}_{size}_parts"

    # Skip packing if we already have reasonably sized chunks
    if len(chunks) == 1 or any(c.token_count > target_tokens for c in chunks):
        fits_in_one = sum(c.token_count for c in chunks) <= MAX_CHUNK_TOKENS
        if not (fits_in_one and len(chunks) > 1):
            return chunks

        # Several chunks that together stay under the limit: combine them all
        count = len(chunks)
        if is_css:
            whole_label = f"css_styles_{count}_rules"
        elif is_js:
            whole_label = f"file_module_{count}_exports"
        elif is_html:
            whole_label = f"html_content_{count}_elements"
        else:
            whole_label = f"file_content_{count}_parts"
        return [merge(chunks, whole_label)]

    # Greedy packing: close the pending group once the next chunk would overshoot
    packed = []
    pending = []
    pending_tokens = 0

    for chunk in chunks:
        if pending and pending_tokens + chunk.token_count > target_tokens:
            packed.append(merge(pending, numbered_label(len(packed) + 1, len(pending))))
            pending = [chunk]
            pending_tokens = chunk.token_count
        else:
            pending.append(chunk)
            pending_tokens += chunk.token_count

    # Whatever is left over after the loop
    if pending:
        if len(pending) == 1 and pending_tokens > target_tokens // 2:
            # A lone chunk above half the target is kept as it is
            packed.append(pending[0])
        else:
            packed.append(merge(pending, numbered_label(len(packed) + 1, len(pending))))

    return packed

def sub_chunk_by_statements(chunk: Chunk, tree, source_code: str, depth: int = 0) -> List[Chunk]:
    """Split an oversized chunk into consecutive runs of lines.

    Lines are accumulated until adding the next one would push the sum of
    per-line token counts over MAX_CHUNK_TOKENS; each run becomes a sub-chunk.
    Runs that contain only whitespace are dropped.

    Args:
        chunk: The chunk to split.
        tree: Unused; kept so existing callers keep working.
        source_code: Unused; kept so existing callers keep working.
        depth: Depth of ``chunk``; sub-chunks get ``depth + 1``.

    Returns:
        ``[chunk]`` if it is within the limit, if ``depth`` has reached
        MAX_RECURSION_DEPTH, or if splitting produced fewer than two parts;
        otherwise the list of sub-chunks (also stored on ``chunk.sub_chunks``).

    Sub-chunk byte ranges are ``chunk.start_byte`` plus the UTF-8 byte position of
    the run inside ``chunk.content``.  Previously every part reported the parent's
    start and a character-based end.  The ranges are exact when the content is one
    contiguous slice of the file and approximate for chunks joined from several
    nodes.
    """
    if depth >= MAX_RECURSION_DEPTH or chunk.token_count <= MAX_CHUNK_TOKENS:
        return [chunk]

    print(f"    Breaking down {chunk.name} ({chunk.token_count} tokens) into smaller pieces...")

    sub_chunks = []

    def emit(run_lines, offset):
        """Create a sub-chunk for ``run_lines`` and return the offset of the next run."""
        sub_content = '\n'.join(run_lines)
        size = len(sub_content.encode('utf-8'))
        if sub_content.strip():
            sub_chunks.append(Chunk(
                start_byte=chunk.start_byte + offset,
                end_byte=chunk.start_byte + offset + size,
                content=sub_content,
                node_type=f"{chunk.node_type}_part",
                name=f"{chunk.name}_part_{len(sub_chunks)+1}",
                depth=depth + 1
            ))
        # +1 skips the newline that separated this run from the next line
        return offset + size + 1

    # Simple line-based splitting for now
    current_lines = []
    current_size = 0
    run_offset = 0  # byte offset inside chunk.content where the pending run starts

    for line in chunk.content.split('\n'):
        line_tokens = count_tokens(line)

        if current_size + line_tokens > MAX_CHUNK_TOKENS and current_lines:
            # The pending run is full: emit it and start a new run with this line
            run_offset = emit(current_lines, run_offset)
            current_lines = [line]
            current_size = line_tokens
        else:
            current_lines.append(line)
            current_size += line_tokens

    # Add remaining lines
    if current_lines:
        emit(current_lines, run_offset)

    chunk.sub_chunks = sub_chunks
    return sub_chunks if len(sub_chunks) > 1 else [chunk]

def process_file(file_path: Path) -> List[Chunk]:
    """Chunk a single source file and report progress on stdout.

    The file is read as UTF-8, parsed with the parser registered for its suffix,
    reduced to semantic units, grouped, and any chunk above MAX_CHUNK_TOKENS is
    split further.  Returns an empty list when no parser is registered for the
    suffix.
    """
    print(f"\n=== Processing: {file_path.name} ===")

    # Load the source text
    source_code = file_path.read_text(encoding='utf-8')
    extension = file_path.suffix

    print(f"File size: {len(source_code)} characters")
    print(f"File type: {extension}")

    # Pick the parser registered for this extension
    parser = PARSER_MAP.get(extension)
    if not parser:
        print(f"❌ No parser available for {extension}")
        return []

    # Parse; a tree with errors is still processed, only a warning is printed
    tree = parser.parse(source_code.encode('utf-8'))
    if tree.root_node.has_error:
        print("⚠️ Parse errors detected")

    # Locate the semantic units and list them
    semantic_nodes = find_semantic_chunks(tree, source_code, extension)
    print(f"Found {len(semantic_nodes)} semantic units")

    for unit in semantic_nodes:
        unit_text = unit['content']
        preview = unit_text[:100].replace('\n', ' ').strip()
        print(f"  - {unit['type']}: {unit['name']} ({count_tokens(unit_text)} tokens)")
        print(f"    Preview: {preview}...")

    # Build chunks and merge the small ones
    base_chunks = group_small_chunks(
        create_semantic_chunks(semantic_nodes),
        target_tokens=600,
        file_extension=extension,
    )
    print(f"Created {len(base_chunks)} semantic chunks")

    # Split whatever is still above the token limit
    final_chunks = []
    oversized_count = 0

    for chunk in base_chunks:
        if chunk.token_count <= MAX_CHUNK_TOKENS:
            final_chunks.append(chunk)
            continue

        print(f"  Sub-chunking {chunk.name} ({chunk.token_count} tokens)")
        final_chunks.extend(sub_chunk_by_statements(chunk, tree, source_code))
        oversized_count += 1

    if oversized_count > 0:
        print(f"Sub-chunked {oversized_count} oversized chunks")
    print(f"Final result: {len(final_chunks)} total chunks")

    return final_chunks

def generate_unique_id(length: int = 6) -> str:
    """Return ``length`` random characters drawn from lowercase letters and digits."""
    pool = string.ascii_lowercase + string.digits
    picked = [secrets.choice(pool) for _ in range(length)]
    return ''.join(picked)

def create_chunk_filename(original_filename: str, chunk_number: int, unique_id: str) -> str:
    """Build the markdown file name for a chunk, e.g. Accordion.tsx_chunk_001_a1s2d3.md.

    The chunk number is zero-padded to at least three digits.
    """
    padded_number = format(chunk_number, '03d')
    return original_filename + "_chunk_" + padded_number + "_" + unique_id + ".md"

def get_markdown_language(file_extension: str) -> str:
    """Return the code-fence language label for a file extension ('text' if unknown)."""
    fence_labels = (
        ('.tsx', 'tsx'),
        ('.ts', 'typescript'),
        ('.js', 'javascript'),
        ('.jsx', 'jsx'),
        ('.css', 'css'),
        ('.scss', 'scss'),
        ('.html', 'html'),
    )
    for extension, label in fence_labels:
        if extension == file_extension:
            return label
    return 'text'

def create_chunk_markdown(chunk: Chunk, source_file_path: str, file_extension: str) -> str:
    """Create markdown content with YAML frontmatter.

    Args:
        chunk: Chunk to render; ``node_type``, ``name``, ``token_count`` and
            ``content`` are read, plus ``index`` when present (defaults to 1).
        source_file_path: Path written to the ``source_file`` frontmatter field.
        file_extension: Extension with leading dot; selects the fence language.

    Returns:
        The markdown document: frontmatter, a heading with the chunk name, and the
        chunk content in a fenced code block.
    """
    import re

    language = get_markdown_language(file_extension)

    # A fenced block ends at the first fence at least as long as the opening one, so
    # content containing ``` (e.g. code samples in doc comments) would close the
    # block early. Use a fence one backtick longer than the longest run in the content.
    longest_run = max((len(run) for run in re.findall(r'`+', chunk.content)), default=0)
    fence = '`' * max(3, longest_run + 1)

    markdown_content = f"""---
source_file: {source_file_path}
chunk_index: {getattr(chunk, 'index', 1)}
chunk_type: {chunk.node_type}
chunk_name: {chunk.name}
token_count: {chunk.token_count}
---

# {chunk.name}

{fence}{language}
{chunk.content}
{fence}
"""
    return markdown_content

def save_chunks_to_files(chunks: List["Chunk"],
                        original_file_path: Path,
                        input_directory: Path,
                        output_base: Path) -> List[str]:
    """Save chunks as markdown files maintaining directory structure.

    Args:
        chunks: Chunks of one source file, in output order.  Each chunk gets an
            ``index`` attribute (1-based) as a side effect.
        original_file_path: Path of the source file the chunks came from.
        input_directory: Root directory of the scan; the output mirrors the file's
            location relative to it.
        output_base: Root directory for the generated markdown files.

    Returns:
        Paths (as strings) of the files that were written successfully.
    """
    if not chunks:
        return []

    # Calculate relative path from input directory
    try:
        rel_path = original_file_path.relative_to(input_directory)
    except ValueError:
        # If file is not under input directory, use just the filename.
        # Wrapped in Path because `.parent` is needed below (a plain str has none).
        rel_path = Path(original_file_path.name)

    # Create output directory structure
    output_dir = output_base / rel_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # One random id shared by all chunk files of this source file
    file_unique_id = generate_unique_id()

    saved_files = []

    # Add chunk index to each chunk and save
    for i, chunk in enumerate(chunks, 1):
        chunk.index = i  # read back by create_chunk_markdown for the frontmatter

        # Create chunk filename
        chunk_filename = create_chunk_filename(
            original_file_path.name,
            i,
            file_unique_id
        )

        # Create markdown content
        markdown_content = create_chunk_markdown(
            chunk,
            str(rel_path),
            original_file_path.suffix
        )

        # Write to file; a failed write is reported and the remaining chunks are still saved
        chunk_file_path = output_dir / chunk_filename
        try:
            with open(chunk_file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            saved_files.append(str(chunk_file_path))
            print(f"  ✅ Saved: {chunk_filename}")

        except (OSError, UnicodeError) as e:
            print(f"  ❌ Error saving {chunk_filename}: {e}")

    return saved_files


def print_chunk_summary(chunks: List["Chunk"], file_name: str) -> None:
    """Print detailed summary of chunks.

    For every chunk this prints its name, type, token and line counts, and the
    first three lines of its content, indented by the chunk's depth.

    Args:
        chunks: Chunks to describe.
        file_name: Name shown in the summary header.
    """
    print(f"\n--- Semantic Chunk Summary for {file_name} ---")

    for i, chunk in enumerate(chunks, 1):
        indent = "  " * chunk.depth
        content_lines_list = chunk.content.split('\n')
        content_lines = len(content_lines_list)

        print(f"{indent}{i}. {chunk.name}")
        print(f"{indent}   Type: {chunk.node_type}")
        print(f"{indent}   Size: {chunk.token_count} tokens, {content_lines} lines")
        print(f"{indent}   Content preview:")

        # Show first few lines of actual content
        for line in content_lines_list[:3]:
            print(f"{indent}     {line.strip()}")
        if len(content_lines_list) > 3:
            print(f"{indent}     ... ({len(content_lines_list) - 3} more lines)")
        print()

def main():
    """Main function to test semantic chunking.

    Asks for a directory, finds every file with a supported extension below it,
    chunks each file, prints a per-file summary, writes the chunks as markdown
    into a sibling ``<directory name>_chunks`` directory, and prints overall
    statistics.
    """
    print("🚀 Multi-Language Semantic Chunking Test")
    print(f"Max chunk tokens: {MAX_CHUNK_TOKENS}")
    print(f"Max recursion depth: {MAX_RECURSION_DEPTH}")
    print(f"Supported extensions: {', '.join(SUPPORTED_EXTENSIONS)}")

    # Get directory from user or use current directory
    directory = input("\nEnter directory path (or press Enter for current directory): ").strip()
    if not directory:
        directory = "."

    # Resolve to an absolute path: Path('.') has an empty name and is its own parent,
    # so the output directory below would otherwise be "_chunks" inside the scanned
    # directory rather than "<name>_chunks" next to it.
    target_dir = Path(directory).resolve()
    # is_dir() also rejects a path that exists but is a regular file
    if not target_dir.is_dir():
        print(f"❌ Directory not found: {directory}")
        return

    # Find all supported files
    all_files = []
    for ext in SUPPORTED_EXTENSIONS:
        files = list(target_dir.rglob(f"*{ext}"))
        all_files.extend(files)

    if not all_files:
        print(f"❌ No supported files found in {directory}")
        print(f"Looking for: {', '.join(SUPPORTED_EXTENSIONS)}")
        return

    print(f"📁 Found {len(all_files)} supported files:")
    # Group by extension and by containing directory for the summary
    by_ext = {}
    by_dir = {}
    for f in all_files:
        ext = f.suffix
        by_ext[ext] = by_ext.get(ext, 0) + 1

        # Files directly in the target directory are counted under 'root'
        rel_path = f.relative_to(target_dir)
        dir_path = str(rel_path.parent) if rel_path.parent != Path('.') else 'root'
        by_dir[dir_path] = by_dir.get(dir_path, 0) + 1

    for ext, count in sorted(by_ext.items()):
        print(f"  {ext}: {count} files")

    # Show directory structure if there are subdirectories
    if len(by_dir) > 1 or 'root' not in by_dir:
        print(f"\n📂 Directory distribution:")
        for dir_path, count in sorted(by_dir.items()):
            if dir_path == 'root':
                print(f"  ./: {count} files")
            else:
                # Indent by nesting level, counting whichever path separator is present
                depth = dir_path.count('/') if '/' in dir_path else dir_path.count('\\')
                indent = "  " + "  " * depth
                print(f"{indent}{dir_path}/: {count} files")

    # Process each file
    all_chunks = []
    all_saved_files = []

    # Output goes to a sibling of the input directory
    output_directory = target_dir.parent / f"{target_dir.name}_chunks"
    print(f"\n📁 Output directory: {output_directory}")

    for file_path in all_files:
        try:
            chunks = process_file(file_path)
            all_chunks.extend(chunks)
            print_chunk_summary(chunks, file_path.name)

            # Save chunks to files
            saved_files = save_chunks_to_files(chunks, file_path, target_dir, output_directory)
            all_saved_files.extend(saved_files)

        except Exception as e:
            # Top-level boundary: report the failure and continue with the next file
            print(f"❌ Error processing {file_path}: {e}")

    # Overall summary
    print(f"\n🎉 OVERALL SUMMARY")
    print(f"Files processed: {len(all_files)}")
    print(f"Total chunks created: {len(all_chunks)}")
    print(f"Total chunk files saved: {len(all_saved_files)}")
    print(f"Output directory: {output_directory}")

    if all_chunks:
        avg_tokens = sum(c.token_count for c in all_chunks) / len(all_chunks)
        max_tokens = max(c.token_count for c in all_chunks)
        min_tokens = min(c.token_count for c in all_chunks)

        print(f"Average chunk size: {avg_tokens:.0f} tokens")
        print(f"Largest chunk: {max_tokens} tokens")
        print(f"Smallest chunk: {min_tokens} tokens")

        # Count by node type
        type_counts = {}
        for chunk in all_chunks:
            type_counts[chunk.node_type] = type_counts.get(chunk.node_type, 0) + 1

        print(f"\nChunk types:")
        for node_type, count in sorted(type_counts.items()):
            print(f"  {node_type}: {count}")

        # Count by file type
        print(f"\nChunks by file type:")
        file_type_chunks = {}
        # Chunks do not record their source file type, so this is only an estimate:
        # the total is split in proportion to the number of files per extension.
        for ext, count in sorted(by_ext.items()):
            proportion = count / len(all_files)
            estimated_chunks = int(len(all_chunks) * proportion)
            file_type_chunks[ext] = estimated_chunks
            print(f"  {ext}: ~{estimated_chunks} chunks")

if __name__ == "__main__":
    main()

✅ Using TSX, CSS, and HTML parsers
🚀 Multi-Language Semantic Chunking Test
Max chunk tokens: 1000
Max recursion depth: 3
Supported extensions: .tsx, .ts, .js, .jsx, .css, .scss, .html
📁 Found 358 supported files:
  .css: 93 files
  .ts: 129 files
  .tsx: 136 files

📂 Directory distribution:
  src/: 3 files
    src/accordion/: 10 files
    src/aria-announcer/: 5 files
    src/avatar/: 4 files
    src/badge/: 3 files
    src/banner/: 7 files
    src/border-item/: 3 files
    src/border-layout/: 2 files
    src/breakpoints/: 3 files
    src/button/: 4 files
    src/card/: 3 files
    src/checkbox/: 7 files
      src/checkbox/internal/: 2 files
    src/combo-box/: 4 files
    src/dialog/: 12 files
    src/divider/: 3 files
    src/drawer/: 5 files
    src/dropdown/: 3 files
    src/file-drop-zone/: 5 files
      src/file-drop-zone/internal/: 1 files
    src/flex-item/: 3 files
    src/flex-layout/: 4 files
    src/flow-layout/: 2 files
    src/form-field/: 7 files
    src/form-field-contex

Minimal changes made:

Added .d.ts to supported extensions and parser mapping
Added get_file_extension() function that properly detects .d.ts files (checks if filename ends with .d.ts before falling back to .suffix)
Updated syntax highlighting to map .d.ts to typescript
Updated all extension checks to use the new get_file_extension() function

Now .d.ts files will be:

✅ Discovered during file scanning (as they end with .ts)
✅ Properly identified as .d.ts type
✅ Parsed with TypeScript parser
✅ Highlighted as typescript in markdown
✅ Saved with correct filenames like declarations.d.ts_chunk_001_a1b2c3.md

The issue was that Path.suffix only returns the last extension (.ts) but we needed to detect the full .d.ts pattern!

In [24]:
#!/usr/bin/env python3

import os
import secrets
import string
from pathlib import Path
from typing import List, Dict, Any
import tiktoken

# Tree-sitter setup for multiple languages
try:
    from tree_sitter_language_pack import get_language, get_parser
    tsx_language = get_language('tsx')
    tsx_parser = get_parser('tsx')
    css_language = get_language('css')
    css_parser = get_parser('css')
    html_language = get_language('html')
    html_parser = get_parser('html')
    print("✅ Using TSX, CSS, and HTML parsers")
except ImportError:
    # The module imported above is published on PyPI as
    # "tree-sitter-language-pack"; "tree-sitter-languages" is a different
    # distribution and installing it would not make this import succeed.
    print("Please install: pip install tree-sitter-language-pack")
    # `exit` is a helper injected by the `site` module and is not guaranteed
    # to exist in every interpreter, so raise SystemExit directly.
    raise SystemExit(1)

# Configuration
# Chunks above this token count are handed to sub-chunking; groups of small
# chunks whose combined size stays at or below it may be merged.
MAX_CHUNK_TOKENS = 1000
# Sub-chunking returns its input untouched once this depth is reached.
MAX_RECURSION_DEPTH = 3

# Supported file extensions and their parsers
# SUPPORTED_EXTENSIONS drives file discovery; PARSER_MAP selects the parser by
# extension. Every JavaScript/TypeScript flavour goes through the TSX parser
# and SCSS goes through the plain CSS parser.
# NOTE(review): constructs that are valid only in non-TSX TypeScript or only
# in SCSS may surface as parse errors with these grammars - confirm.
SUPPORTED_EXTENSIONS = ['.tsx', '.ts', '.js', '.jsx', '.css', '.scss', '.html']
PARSER_MAP = {
    '.tsx': tsx_parser,
    '.ts': tsx_parser,
    '.js': tsx_parser,
    '.jsx': tsx_parser,
    '.css': css_parser,
    '.scss': css_parser,
    '.html': html_parser
}

# Initialize token encoder
# Shared module-level encoder used by count_tokens().
encoder = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    """Return the number of tokens the module-level encoder produces for *text*.

    Args:
        text: Arbitrary source text; it may contain any character sequence.

    Returns:
        The length of the encoded token list.
    """
    # By default tiktoken raises ValueError when the input contains the
    # literal spelling of a special token (e.g. "<|endoftext|>"). Source
    # files are arbitrary text, so treat such sequences as ordinary text.
    return len(encoder.encode(text, disallowed_special=()))

def get_syntax_highlighting_language(file_extension: str) -> str:
    """Map a file extension (leading dot included) to a markdown fence language.

    Extensions that are not recognised map to ``'text'``.
    """
    # Extensions whose fence language differs from the extension itself.
    renamed = {'.ts': 'typescript', '.js': 'javascript'}
    # Extensions whose fence language is simply the extension minus the dot.
    verbatim = ('.tsx', '.jsx', '.css', '.scss', '.html')

    if file_extension in renamed:
        return renamed[file_extension]
    if file_extension in verbatim:
        return file_extension[1:]
    return 'text'

class Chunk:
    """A piece of source text together with its position and chunking metadata."""

    def __init__(self, start_byte: int, end_byte: int, content: str, node_type: str, name: str, depth: int = 0):
        # Position of the chunk as reported by whoever created it.
        self.start_byte, self.end_byte = start_byte, end_byte
        # The text this chunk carries.
        self.content = content
        # Labels describing what the chunk is and how deep it was split.
        self.node_type, self.name, self.depth = node_type, name, depth
        # Size is measured once, at construction time, from the content.
        self.token_count = count_tokens(content)
        # Filled in later when the chunk gets split into smaller pieces.
        self.sub_chunks = []

def extract_node_name(node, source_code: str) -> str:
    """Derive a human-readable name for a JavaScript/TypeScript AST node.

    Args:
        node: Object exposing ``type``, ``start_byte`` and ``end_byte``.
            The offsets are treated as byte offsets into the UTF-8 encoding
            of *source_code*.
        source_code: Full text of the file the node was parsed from.

    Returns:
        The declared identifier for recognised declaration nodes (exports get
        an ``interface_``/``const_``/``function_`` prefix, module declarations
        a ``module_`` prefix). Otherwise the first identifier on the node's
        first line, or ``"<type>_<start_byte>"`` when none is found.
    """
    import re

    identifier = r'([a-zA-Z_$][a-zA-Z0-9_$]*)'

    # Slicing the str with byte offsets drifts as soon as a multi-byte
    # character precedes the node, so slice the encoded bytes instead.
    # Pure-ASCII sources take the cheap path.
    if source_code.isascii():
        node_text = source_code[node.start_byte:node.end_byte]
    else:
        node_text = source_code.encode('utf-8')[node.start_byte:node.end_byte].decode('utf-8', errors='replace')

    declaration = r'(?:const|let|var)\s+' + identifier + r'\s*='
    single_pattern = {
        'function_declaration': r'function\s+' + identifier,
        'variable_declaration': declaration,
        # `const` / `let` statements are reported as lexical_declaration by
        # the JavaScript/TypeScript grammars; only `var` is
        # variable_declaration.
        'lexical_declaration': declaration,
        'interface_declaration': r'interface\s+' + identifier,
        'type_alias_declaration': r'type\s+' + identifier,
        'class_declaration': r'class\s+' + identifier,
    }

    node_type = node.type
    if node_type in single_pattern:
        match = re.search(single_pattern[node_type], node_text)
        if match:
            return match.group(1)

    elif node_type in ('module_declaration', 'ambient_declaration'):
        # `declare module "name"` first, then a bare `module Name`.
        match = re.search(r'declare\s+module\s+["\']([^"\']+)["\']', node_text)
        if match:
            return f"module_{match.group(1).replace('*', 'wildcard').replace('/', '_')}"
        match = re.search(r'module\s+' + identifier, node_text)
        if match:
            return f"module_{match.group(1)}"
        return "declare_module"

    elif node_type == 'export_statement':
        export_kinds = (
            ('export interface', r'export\s+interface\s+', 'interface'),
            ('export const', r'export\s+const\s+', 'const'),
            ('export function', r'export\s+function\s+', 'function'),
        )
        for marker, prefix_pattern, label in export_kinds:
            if marker in node_text:
                match = re.search(prefix_pattern + identifier, node_text)
                if match:
                    return f"{label}_{match.group(1)}"
                # Only the first marker present is tried; on a regex miss
                # fall through to the generic fallback below.
                break
        else:
            return "default_export" if 'export default' in node_text else "export_statement"

    # Fallback: first identifier within the first 50 characters of line one.
    first_line = node_text.split('\n')[0][:50].strip()
    simple_match = re.search(r'\b' + identifier, first_line)
    if simple_match:
        return simple_match.group(1)

    return f"{node.type}_{node.start_byte}"

def find_semantic_chunks(tree, source_code: str, file_extension: str) -> List[Dict[str, Any]]:
    """Collect the top-level semantic units of a parsed file.

    Args:
        tree: Parsed tree exposing ``root_node``; nodes expose ``type``,
            ``children``, ``start_byte`` and ``end_byte``.
        source_code: The text that was parsed. Node offsets are interpreted
            as byte offsets into its UTF-8 encoding.
        file_extension: Selects the rule set: ``.css``/``.scss``, ``.html``,
            or anything else for the JavaScript/TypeScript rules.

    Returns:
        Records with the keys ``node``, ``name``, ``start_byte``,
        ``end_byte``, ``type`` and ``content``, sorted by ``start_byte``,
        with any record fully contained in another one removed.
    """
    # Slicing the str with byte offsets drifts after the first multi-byte
    # character, so encode once and slice the bytes.
    source_bytes = source_code.encode('utf-8')

    def text_of(node) -> str:
        return source_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='replace')

    semantic_nodes = []

    def record(node, name, node_type, content):
        semantic_nodes.append({
            'node': node, 'name': name, 'start_byte': node.start_byte,
            'end_byte': node.end_byte, 'type': node_type, 'content': content
        })

    def match_stylesheet(node) -> bool:
        # CSS/SCSS semantic boundaries.
        if node.type in ('rule_set', 'at_rule', 'keyframes_statement'):
            record(node, extract_css_name(node, source_code), node.type, text_of(node))
            return True
        return False

    def match_markup(node) -> bool:
        # HTML semantic boundaries - only substantial elements.
        if node.type in ('element', 'script_element', 'style_element'):
            content = text_of(node)
            if len(content.strip()) > 100:
                record(node, extract_html_name(node, source_code), node.type, content)
                return True
        return False

    script_declarations = (
        'function_declaration', 'interface_declaration', 'type_alias_declaration',
        'module_declaration', 'ambient_declaration',
        # The name extractor already understands classes; collect them too.
        'class_declaration',
    )
    component_markers = ['forwardRef', '= (' , 'useState', 'useEffect', 'createContext', 'makePrefixer']

    def match_script(node) -> bool:
        # JavaScript/TypeScript semantic boundaries.
        kind = node.type
        if kind in ('import_statement', 'import_declaration'):
            record(node, f"import_{extract_node_name(node, source_code)}", 'import_statement', text_of(node))
            return True
        if kind in script_declarations:
            record(node, extract_node_name(node, source_code), kind, text_of(node))
            return True
        # `const`/`let` are lexical_declaration in the JS/TS grammars; only
        # `var` is variable_declaration.
        if kind in ('variable_declaration', 'lexical_declaration'):
            content = text_of(node)
            if any(marker in content for marker in component_markers):
                record(node, extract_node_name(node, source_code), 'component_or_hook', content)
                return True
            return False
        if kind == 'export_statement':
            content = text_of(node)
            if len(content.strip()) > 50:
                record(node, extract_node_name(node, source_code), kind, content)
                return True
        return False

    # Each rule set only accepts nodes down to a fixed depth (root = 0), so
    # there is no point in walking the tree below that depth.
    if file_extension in ('.css', '.scss'):
        matcher, max_depth = match_stylesheet, 1
    elif file_extension == '.html':
        matcher, max_depth = match_markup, 2
    else:
        matcher, max_depth = match_script, 1

    # Explicit stack instead of recursion; a matched node is not descended
    # into, so collected nodes never nest through this walk.
    pending = [(tree.root_node, 0)]
    while pending:
        node, depth = pending.pop()
        if matcher(node):
            continue
        if depth < max_depth:
            pending.extend((child, depth + 1) for child in reversed(node.children))

    # Sort by start; on equal starts put the longer span first so that a
    # containing record is always seen before the records it contains.
    semantic_nodes.sort(key=lambda item: (item['start_byte'], -item['end_byte']))

    # Remove nested records: with the ordering above, a record is contained
    # in an earlier kept one exactly when it ends at or before the furthest
    # end seen so far.
    filtered_nodes = []
    covered_until = -1
    for item in semantic_nodes:
        if item['end_byte'] <= covered_until:
            continue
        filtered_nodes.append(item)
        covered_until = item['end_byte']

    return filtered_nodes

def extract_css_name(node, source_code: str) -> str:
    """Derive a readable name for a CSS/SCSS AST node.

    Args:
        node: Object exposing ``type``, ``start_byte`` and ``end_byte``.
            The offsets are treated as byte offsets into the UTF-8 encoding
            of *source_code*.
        source_code: Full stylesheet text.

    Returns:
        ``rule_<selector>`` (selector cut to 30 characters, spaces replaced
        by underscores), ``at_<name>`` or ``keyframes_<name>``; otherwise
        ``"<type>_<start_byte>"``.
    """
    import re

    # Slice the encoded bytes: str indices drift from byte offsets as soon
    # as a multi-byte character precedes the node.
    if source_code.isascii():
        node_text = source_code[node.start_byte:node.end_byte]
    else:
        node_text = source_code.encode('utf-8')[node.start_byte:node.end_byte].decode('utf-8', errors='replace')
    first_line = node_text.split('\n')[0].strip()

    if node.type == 'rule_set':
        # Everything before the opening brace is the selector.
        match = re.search(r'^([^{]+)', first_line)
        if match:
            selector = match.group(1).strip()
            return f"rule_{selector[:30].replace(' ', '_')}"
    elif node.type == 'at_rule':
        # CSS identifiers may contain hyphens (e.g. @font-face).
        match = re.search(r'@([\w-]+)', first_line)
        if match:
            return f"at_{match.group(1)}"
    elif node.type == 'keyframes_statement':
        # Accept an optional vendor prefix and hyphenated animation names.
        match = re.search(r'@(?:-\w+-)?keyframes\s+([\w-]+)', first_line)
        if match:
            return f"keyframes_{match.group(1)}"

    return f"{node.type}_{node.start_byte}"

def extract_html_name(node, source_code: str) -> str:
    """Derive a readable name for an HTML AST node.

    Args:
        node: Object exposing ``type``, ``start_byte`` and ``end_byte``.
            The offsets are treated as byte offsets into the UTF-8 encoding
            of *source_code*.
        source_code: Full HTML text.

    Returns:
        ``tag#id`` when the opening tag has an ``id``, else ``tag.class``
        using its first class name, else ``tag``. Returns
        ``"<type>_<start_byte>"`` for other node types or when no tag name
        is found.
    """
    import re

    # Slice the encoded bytes: str indices drift from byte offsets as soon
    # as a multi-byte character precedes the node.
    if source_code.isascii():
        node_text = source_code[node.start_byte:node.end_byte]
    else:
        node_text = source_code.encode('utf-8')[node.start_byte:node.end_byte].decode('utf-8', errors='replace')

    if node.type in ['element', 'script_element', 'style_element']:
        # Capture the tag name (hyphens allowed for custom elements) and the
        # attribute text of the opening tag only. The attribute part may span
        # several lines and may contain '>' inside quoted values.
        opening = re.search(r'<([\w-]+)((?:"[^"]*"|\'[^\']*\'|[^>"\'])*)', node_text)
        if opening:
            tag, attributes = opening.group(1), opening.group(2)
            # The look-behind keeps `data-id=` / `aria-class=` style
            # attributes from being mistaken for `id=` / `class=`.
            id_match = re.search(r'(?<![\w-])id\s*=\s*["\']([^"\']+)["\']', attributes)
            class_match = re.search(r'(?<![\w-])class\s*=\s*["\']([^"\']+)["\']', attributes)

            if id_match:
                return f"{tag}#{id_match.group(1)}"
            # A whitespace-only class value yields no names; fall back to tag.
            class_names = class_match.group(1).split() if class_match else []
            if class_names:
                return f"{tag}.{class_names[0]}"
            return tag

    return f"{node.type}_{node.start_byte}"

def create_semantic_chunks(semantic_nodes: List[Dict[str, Any]]) -> List[Chunk]:
    """Wrap every semantic-node record in a depth-0 ``Chunk``, keeping input order."""
    return [
        Chunk(
            info['start_byte'],
            info['end_byte'],
            info['content'],
            info['type'],
            info['name'],
            0,
        )
        for info in semantic_nodes
    ]

def group_small_chunks(chunks: List[Chunk], target_tokens: int = 600, file_extension: str = '') -> List[Chunk]:
    """Merge small chunks into reasonably sized ones.

    JavaScript/TypeScript files get special handling: a file whose chunks
    total at most MAX_CHUNK_TOKENS becomes one ``complete_module`` chunk;
    otherwise imports are bundled (when they fit) and placed ahead of the
    size-grouped remainder. Every other file type goes straight to
    ``group_chunks_by_size``.
    """
    if not chunks:
        return chunks

    # Non-JS files use the plain size-based grouping.
    if file_extension not in ['.tsx', '.ts', '.jsx', '.js']:
        return group_chunks_by_size(chunks, target_tokens, file_extension)

    # Whole file fits in one chunk: emit it as a single module.
    if sum(c.token_count for c in chunks) <= MAX_CHUNK_TOKENS:
        return [Chunk(
            start_byte=chunks[0].start_byte,
            end_byte=chunks[-1].end_byte,
            content='\n\n'.join(c.content for c in chunks),
            node_type='complete_module',
            name=f"complete_module_{len(chunks)}_parts",
            depth=0
        )]

    # Too large for one chunk: split imports from everything else.
    imports, remainder = [], []
    for c in chunks:
        bucket = imports if c.node_type == 'import_statement' else remainder
        bucket.append(c)

    # Bundle all imports into one chunk when they fit under the limit.
    if imports and sum(c.token_count for c in imports) <= MAX_CHUNK_TOKENS:
        bundled = Chunk(
            start_byte=imports[0].start_byte,
            end_byte=imports[-1].end_byte,
            content='\n'.join(c.content for c in imports),
            node_type='imports_group',
            name=f"imports_{len(imports)}_statements",
            depth=0
        )
        imports = [bundled]

    # Imports first, then the size-grouped remainder.
    return imports + group_chunks_by_size(remainder, target_tokens, file_extension)

def _merged_chunk_name(file_extension: str, member_count: int, group_number=None) -> str:
    """Build the label of a merged chunk; ``group_number=None`` means a whole-file merge."""
    if file_extension in ['.css', '.scss']:
        family = 'css'
    elif file_extension in ['.tsx', '.ts', '.jsx', '.js']:
        family = 'code'
    elif file_extension == '.html':
        family = 'html'
    else:
        family = 'other'

    if group_number is None:
        prefix, unit = {
            'css': ('css_styles', 'rules'),
            'code': ('file_module', 'exports'),
            'html': ('html_content', 'elements'),
            'other': ('file_content', 'parts'),
        }[family]
        return f"{prefix}_{member_count}_{unit}"

    prefix, unit = {
        'css': ('css_group', 'rules'),
        'code': ('code_group', 'parts'),
        'html': ('html_group', 'elements'),
        'other': ('group', 'parts'),
    }[family]
    return f"{prefix}_{group_number}_{member_count}_{unit}"


def _merge_chunks(members, name: str):
    """Join the members' contents with blank lines into one ``grouped_content`` chunk."""
    return Chunk(
        start_byte=members[0].start_byte,
        end_byte=members[-1].end_byte,
        content='\n\n'.join(c.content for c in members),
        node_type='grouped_content',
        name=name,
        depth=0
    )


def group_chunks_by_size(chunks: List[Chunk], target_tokens: int = 600, file_extension: str = '') -> List[Chunk]:
    """Greedily pack consecutive chunks into groups of about *target_tokens*.

    Args:
        chunks: Chunks in document order.
        target_tokens: Soft size target for a group.
        file_extension: Only used to pick the naming scheme of merged chunks.

    Returns:
        The input list itself when it is empty or when grouping is skipped,
        otherwise a new list of merged (and possibly some original) chunks.
    """
    if not chunks:
        return chunks

    # Skip grouping if we already have reasonably sized chunks.
    # NOTE(review): a single chunk above target_tokens disables grouping for
    # the whole list - confirm that `any` (rather than `all`) is intended.
    if len(chunks) == 1 or any(c.token_count > target_tokens for c in chunks):
        # Several chunks that together still fit under the hard limit are
        # merged into one whole-file chunk.
        if len(chunks) > 1 and sum(c.token_count for c in chunks) <= MAX_CHUNK_TOKENS:
            return [_merge_chunks(chunks, _merged_chunk_name(file_extension, len(chunks)))]
        return chunks

    grouped_chunks = []
    current_group = []
    current_tokens = 0

    for chunk in chunks:
        if current_group and current_tokens + chunk.token_count > target_tokens:
            # Adding this chunk would overshoot: close the running group.
            label = _merged_chunk_name(file_extension, len(current_group), len(grouped_chunks) + 1)
            grouped_chunks.append(_merge_chunks(current_group, label))
            current_group = [chunk]
            current_tokens = chunk.token_count
        else:
            current_group.append(chunk)
            current_tokens += chunk.token_count

    # Flush whatever is left.
    if current_group:
        if len(current_group) == 1 and current_tokens > target_tokens // 2:
            # A lone, reasonably large trailing chunk keeps its identity.
            grouped_chunks.append(current_group[0])
        else:
            label = _merged_chunk_name(file_extension, len(current_group), len(grouped_chunks) + 1)
            grouped_chunks.append(_merge_chunks(current_group, label))

    return grouped_chunks

def sub_chunk_by_statements(chunk: Chunk, tree, source_code: str, depth: int = 0) -> List[Chunk]:
    """Split an oversized chunk into line-based pieces under MAX_CHUNK_TOKENS.

    Args:
        chunk: The chunk to split. Its ``sub_chunks`` attribute is set to the
            produced pieces.
        tree: Unused; kept so existing callers keep working.
        source_code: Unused; kept so existing callers keep working.
        depth: Current split depth; pieces are created at ``depth + 1``.

    Returns:
        ``[chunk]`` when it already fits, the depth limit is reached, or the
        split produced fewer than two pieces; otherwise the list of pieces.
    """
    if depth >= MAX_RECURSION_DEPTH or chunk.token_count <= MAX_CHUNK_TOKENS:
        return [chunk]

    print(f"    Breaking down {chunk.name} ({chunk.token_count} tokens) into smaller pieces...")

    sub_chunks = []
    # Bytes of chunk.content already handed out, separators included, so each
    # piece gets its own byte range instead of the parent's start offset.
    consumed_bytes = 0

    def emit(piece_lines, is_last=False):
        nonlocal consumed_bytes
        sub_content = '\n'.join(piece_lines)
        piece_bytes = len(sub_content.encode('utf-8'))
        # Offsets are exact when chunk.content is the verbatim source slice.
        # For merged chunks the content is re-joined text, so they are only
        # an approximation and are clamped to the parent's range.
        piece_start = min(chunk.start_byte + consumed_bytes, chunk.end_byte)
        piece_end = chunk.end_byte if is_last else min(piece_start + piece_bytes, chunk.end_byte)
        consumed_bytes += piece_bytes + 1  # +1 for the newline between pieces
        if sub_content.strip():
            sub_chunks.append(Chunk(
                start_byte=piece_start,
                end_byte=piece_end,
                content=sub_content,
                node_type=f"{chunk.node_type}_part",
                name=f"{chunk.name}_part_{len(sub_chunks)+1}",
                depth=depth + 1
            ))

    # Simple line-based splitting for now
    current_lines = []
    current_size = 0

    for line in chunk.content.split('\n'):
        line_tokens = count_tokens(line)

        if current_lines and current_size + line_tokens > MAX_CHUNK_TOKENS:
            emit(current_lines)
            current_lines = [line]
            current_size = line_tokens
        else:
            current_lines.append(line)
            current_size += line_tokens

    # Add remaining lines
    if current_lines:
        emit(current_lines, is_last=True)

    chunk.sub_chunks = sub_chunks
    return sub_chunks if len(sub_chunks) > 1 else [chunk]

def process_file(file_path: Path) -> List[Chunk]:
    """Parse one file and return its final list of chunks.

    The file is read as UTF-8, parsed with the parser registered for its
    suffix, split into semantic units, grouped, and any chunk still above
    MAX_CHUNK_TOKENS is sub-chunked. Progress is printed along the way.

    Args:
        file_path: File to process.

    Returns:
        The resulting chunks, or an empty list when no parser is registered
        for the file's suffix.
    """
    print(f"\n=== Processing: {file_path.name} ===")

    # Read file
    source_code = file_path.read_text(encoding='utf-8')

    print(f"File size: {len(source_code)} characters")
    print(f"File type: {file_path.suffix}")

    # Get appropriate parser
    parser = PARSER_MAP.get(file_path.suffix)
    if parser is None:
        print(f"❌ No parser available for {file_path.suffix}")
        return []

    # Parse with appropriate parser
    tree = parser.parse(source_code.encode('utf-8'))

    if tree.root_node.has_error:
        print("⚠️ Parse errors detected")

    # Find semantic chunks
    semantic_nodes = find_semantic_chunks(tree, source_code, file_path.suffix)
    print(f"Found {len(semantic_nodes)} semantic units")

    # Build the chunks first: each Chunk measures its own token count, so the
    # report below can reuse it instead of tokenizing every unit twice.
    base_chunks = create_semantic_chunks(semantic_nodes)

    # Show what we found
    for node, chunk in zip(semantic_nodes, base_chunks):
        preview = node['content'][:100].replace('\n', ' ').strip()
        print(f"  - {node['type']}: {node['name']} ({chunk.token_count} tokens)")
        print(f"    Preview: {preview}...")

    # Group small chunks for all file types
    base_chunks = group_small_chunks(base_chunks, target_tokens=600, file_extension=file_path.suffix)

    print(f"Created {len(base_chunks)} semantic chunks")

    # Apply sub-chunking for oversized chunks
    final_chunks = []
    oversized_count = 0

    for chunk in base_chunks:
        if chunk.token_count > MAX_CHUNK_TOKENS:
            print(f"  Sub-chunking {chunk.name} ({chunk.token_count} tokens)")
            final_chunks.extend(sub_chunk_by_statements(chunk, tree, source_code))
            oversized_count += 1
        else:
            final_chunks.append(chunk)

    if oversized_count > 0:
        print(f"Sub-chunked {oversized_count} oversized chunks")
    print(f"Final result: {len(final_chunks)} total chunks")

    return final_chunks

def generate_unique_id(length: int = 6) -> str:
    """Return *length* random characters drawn from lowercase letters and digits."""
    pool = string.ascii_lowercase + string.digits
    picked = []
    for _ in range(length):
        picked.append(secrets.choice(pool))
    return ''.join(picked)

def create_chunk_filename(original_filename: str, chunk_number: int, unique_id: str) -> str:
    """Build a chunk file name such as ``Accordion.tsx_chunk_001_a1s2d3.md``.

    The chunk number is zero-padded to at least three digits.
    """
    template = "{}_chunk_{:03d}_{}.md"
    return template.format(original_filename, chunk_number, unique_id)

def get_markdown_language(file_extension: str) -> str:
    """Return the code-fence language for a file extension, or ``'text'`` if unknown."""
    fence_language = {
        '.tsx': 'tsx',
        '.ts': 'typescript',
        '.js': 'javascript',
        '.jsx': 'jsx',
        '.css': 'css',
        '.scss': 'scss',
        '.html': 'html',
    }
    try:
        return fence_language[file_extension]
    except KeyError:
        # Unknown extensions are rendered as plain text.
        return 'text'

def create_chunk_markdown(chunk: Chunk, source_file_path: str, file_extension: str) -> str:
    """Render a chunk as markdown: YAML front matter, a heading, and a fenced code block.

    Args:
        chunk: Chunk to render. Its ``index`` attribute is used when present,
            otherwise 1 is written as the chunk index.
        source_file_path: Path written to the ``source_file`` field.
        file_extension: Selects the fence language.

    Returns:
        The complete markdown document as a string.
    """
    import re

    language = get_markdown_language(file_extension)

    # A code fence is closed by a backtick run at least as long as the one
    # that opened it. Make the fence longer than the longest backtick run in
    # the content so the content cannot end the block early. Content without
    # such runs still gets the usual three-backtick fence.
    longest_run = max((len(run) for run in re.findall(r'`+', chunk.content)), default=0)
    fence = '`' * max(3, longest_run + 1)

    markdown_content = f"""---
source_file: {source_file_path}
chunk_index: {getattr(chunk, 'index', 1)}
chunk_type: {chunk.node_type}
chunk_name: {chunk.name}
token_count: {chunk.token_count}
---

# {chunk.name}

{fence}{language}
{chunk.content}
{fence}
"""
    return markdown_content

def save_chunks_to_files(chunks: List[Chunk], 
                        original_file_path: Path, 
                        input_directory: Path,
                        output_base: Path) -> List[str]:
    """Write each chunk as a markdown file, mirroring the input directory layout.

    Args:
        chunks: Chunks to save; each gets an ``index`` attribute (1-based).
        original_file_path: Source file the chunks came from.
        input_directory: Root the relative output path is computed against.
        output_base: Directory under which the mirrored structure is created.

    Returns:
        Paths (as strings) of the files that were written successfully.
        Files that fail to write are reported on stdout and skipped.
    """
    if not chunks:
        return []

    # Calculate relative path from input directory
    try:
        rel_path = original_file_path.relative_to(input_directory)
    except ValueError:
        # File is not under the input directory: use just the filename.
        # It must stay a Path because `.parent` is read below; a plain str
        # here raised AttributeError.
        rel_path = Path(original_file_path.name)

    # Create output directory structure
    output_dir = output_base / rel_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # One ID shared by all chunk files of this source file
    file_unique_id = generate_unique_id()

    saved_files = []

    for i, chunk in enumerate(chunks, 1):
        chunk.index = i  # read back by the markdown renderer

        chunk_filename = create_chunk_filename(
            original_file_path.name,
            i,
            file_unique_id
        )

        markdown_content = create_chunk_markdown(
            chunk,
            str(rel_path),
            original_file_path.suffix
        )

        chunk_file_path = output_dir / chunk_filename
        try:
            with open(chunk_file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
        except (OSError, UnicodeError) as e:
            # Best effort: report the failure and carry on with the rest.
            print(f"  ❌ Error saving {chunk_filename}: {e}")
        else:
            saved_files.append(str(chunk_file_path))
            print(f"  ✅ Saved: {chunk_filename}")

    return saved_files
def print_chunk_summary(chunks: List[Chunk], file_name: str):
    """Print name, type, size and a three-line preview for every chunk.

    Output is indented by two spaces per level of ``chunk.depth``.
    """
    print(f"\n--- Semantic Chunk Summary for {file_name} ---")

    position = 0
    for chunk in chunks:
        position += 1
        pad = chunk.depth * "  "
        body_lines = chunk.content.split('\n')
        total_lines = len(body_lines)

        print(f"{pad}{position}. {chunk.name}")
        print(f"{pad}   Type: {chunk.node_type}")
        print(f"{pad}   Size: {chunk.token_count} tokens, {total_lines} lines")
        print(f"{pad}   Content preview:")

        # Preview: the first three lines, stripped of surrounding whitespace.
        for shown in body_lines[:3]:
            print(f"{pad}     {shown.strip()}")
        hidden = total_lines - 3
        if hidden > 0:
            print(f"{pad}     ... ({hidden} more lines)")
        print()

def main():
    """Interactively chunk every supported file under a directory.

    Prompts for a directory (a blank answer means the current directory),
    lists the supported files found beneath it, then for each file runs
    ``process_file``, prints its chunk summary and saves the chunks with
    ``save_chunks_to_files``. Aggregate statistics are printed at the end.

    An exception raised while handling one file is reported and the run
    continues with the next file. Returns ``None``; all results are printed
    or written to the output directory, which is a sibling of the target
    directory named ``<target name>_chunks``.
    """
    # Function-scope import so this block does not depend on an import
    # elsewhere in the file.
    from collections import Counter

    print("🚀 Multi-Language Semantic Chunking Test")
    print(f"Max chunk tokens: {MAX_CHUNK_TOKENS}")
    print(f"Max recursion depth: {MAX_RECURSION_DEPTH}")
    print(f"Supported extensions: {', '.join(SUPPORTED_EXTENSIONS)}")

    # Get directory from user or use current directory
    directory = input("\nEnter directory path (or press Enter for current directory): ").strip()
    if not directory:
        directory = "."

    target_dir = Path(directory)
    # is_dir() also rejects a path that exists but is a regular file.
    if not target_dir.is_dir():
        print(f"❌ Directory not found: {directory}")
        return

    # Find all supported files. Directories whose names happen to end in a
    # supported extension are skipped, and dict.fromkeys() drops duplicates
    # (possible if two extension patterns overlap) while keeping order.
    matches = []
    for ext in SUPPORTED_EXTENSIONS:
        matches.extend(p for p in target_dir.rglob(f"*{ext}") if p.is_file())
    all_files = list(dict.fromkeys(matches))

    if not all_files:
        print(f"❌ No supported files found in {directory}")
        print(f"Looking for: {', '.join(SUPPORTED_EXTENSIONS)}")
        return

    print(f"📁 Found {len(all_files)} supported files:")
    # Group by extension and by containing directory for the summary.
    by_ext = Counter(f.suffix for f in all_files)
    # Keys are paths relative to target_dir; Path('.') stands for the target
    # directory itself, so it cannot collide with a real directory name.
    by_dir = Counter(f.relative_to(target_dir).parent for f in all_files)

    for ext, count in sorted(by_ext.items()):
        print(f"  {ext}: {count} files")

    # Show directory structure if there are subdirectories
    top_level = Path('.')
    if len(by_dir) > 1 or top_level not in by_dir:
        print("\n📂 Directory distribution:")
        # Paths sort component-wise, so children are listed after their parent.
        for rel_dir, count in sorted(by_dir.items()):
            if rel_dir == top_level:
                print(f"  ./: {count} files")
            else:
                # One indent level per path component, separator-agnostic.
                indent = "  " * len(rel_dir.parts)
                print(f"{indent}{rel_dir}/: {count} files")

    # Create output directory path. Resolve first: Path('.') has an empty
    # name, which would otherwise yield a folder called "_chunks" inside the
    # directory being scanned.
    resolved_target = target_dir.resolve()
    output_directory = resolved_target.parent / f"{resolved_target.name}_chunks"
    print(f"\n📁 Output directory: {output_directory}")

    # Process each file
    all_chunks = []
    all_saved_files = []
    chunks_by_ext = Counter()
    processed_count = 0

    for file_path in all_files:
        # Top-level boundary: report any failure and move on to the next file.
        try:
            chunks = process_file(file_path)
            all_chunks.extend(chunks)
            chunks_by_ext[file_path.suffix] += len(chunks)
            print_chunk_summary(chunks, file_path.name)

            # Save chunks to files
            saved_files = save_chunks_to_files(chunks, file_path, target_dir, output_directory)
            all_saved_files.extend(saved_files)
            processed_count += 1

        except Exception as e:
            print(f"❌ Error processing {file_path}: {e}")

    # Overall summary
    failed_count = len(all_files) - processed_count
    print("\n🎉 OVERALL SUMMARY")
    print(f"Files processed: {processed_count}")
    if failed_count:
        print(f"Files failed: {failed_count}")
    print(f"Total chunks created: {len(all_chunks)}")
    print(f"Total chunk files saved: {len(all_saved_files)}")
    print(f"Output directory: {output_directory}")

    if not all_chunks:
        return

    token_counts = [c.token_count for c in all_chunks]
    avg_tokens = sum(token_counts) / len(token_counts)

    print(f"Average chunk size: {avg_tokens:.0f} tokens")
    print(f"Largest chunk: {max(token_counts)} tokens")
    print(f"Smallest chunk: {min(token_counts)} tokens")

    # Count by node type
    type_counts = Counter(chunk.node_type for chunk in all_chunks)
    print("\nChunk types:")
    for node_type, count in sorted(type_counts.items()):
        print(f"  {node_type}: {count}")

    # Count by file type: exact totals recorded while processing. Extensions
    # whose files produced no chunks are still listed, with 0.
    print("\nChunks by file type:")
    for ext in sorted(by_ext):
        print(f"  {ext}: {chunks_by_ext[ext]} chunks")

# Run the interactive chunking workflow only when this file is executed as a
# script; importing it as a module does not call main().
if __name__ == "__main__":
    main()

✅ Using TSX, CSS, and HTML parsers
🚀 Multi-Language Semantic Chunking Test
Max chunk tokens: 1000
Max recursion depth: 3
Supported extensions: .tsx, .ts, .js, .jsx, .css, .scss, .html
📁 Found 358 supported files:
  .css: 93 files
  .ts: 129 files
  .tsx: 136 files

📂 Directory distribution:
  src/: 3 files
    src/accordion/: 10 files
    src/aria-announcer/: 5 files
    src/avatar/: 4 files
    src/badge/: 3 files
    src/banner/: 7 files
    src/border-item/: 3 files
    src/border-layout/: 2 files
    src/breakpoints/: 3 files
    src/button/: 4 files
    src/card/: 3 files
    src/checkbox/: 7 files
      src/checkbox/internal/: 2 files
    src/combo-box/: 4 files
    src/dialog/: 12 files
    src/divider/: 3 files
    src/drawer/: 5 files
    src/dropdown/: 3 files
    src/file-drop-zone/: 5 files
      src/file-drop-zone/internal/: 1 files
    src/flex-item/: 3 files
    src/flex-layout/: 4 files
    src/flow-layout/: 2 files
    src/form-field/: 7 files
    src/form-field-contex