In [1]:
import re
from typing import List, Dict, Optional
from dataclasses import dataclass
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pymupdf  # PyMuPDF for PDF processing

In [2]:
@dataclass
class TableChunk:
    """Represents a database table chunk with metadata.

    NOTE(review): nothing in the visible notebook code constructs this class;
    the chunker builds langchain ``Document`` objects directly. Confirm whether
    this container is still needed.
    """
    table_name: str  # name of the table the chunk describes
    content: str  # text of the chunk
    page_numbers: List[int]  # presumably the 1-based PDF page numbers the chunker records - TODO confirm
    # Section label. For reference, the chunker in this notebook tags its
    # Document metadata with 'table_description', 'overview' and
    # 'relationships_summary'.
    section_type: str

class NorthwindSchemaChunker:
    """
    Splits the Northwind Database Schema PDF into semantic chunks
    focused on individual database tables for RAG applications.

    ``chunk_pdf`` is the entry point. It returns, in order: an overview chunk
    (when overview text was found), one chunk per detected table section, and
    a chunk for the "Summary of Table Relationships" section (when present).
    """

    def __init__(self):
        # Patterns to identify table sections
        self.table_patterns = {
            'table_header': re.compile(r'^([A-Z][a-zA-Z_]+)\s*$', re.MULTILINE),
            'description_start': re.compile(r'Description:\s*', re.IGNORECASE),
            'columns_start': re.compile(r'Columns:\s*', re.IGNORECASE),
            'primary_key': re.compile(r'Primary Key:\s*', re.IGNORECASE),
            'foreign_keys': re.compile(r'Foreign Keys:\s*', re.IGNORECASE),
            'relationships': re.compile(r'Relationships:\s*', re.IGNORECASE)
        }

        # Known table names from the schema
        self.known_tables = {
            'Orders', 'Order_Details', 'Customers', 'Employees', 'Products',
            'Categories', 'Suppliers', 'Shippers', 'Regions', 'Territories',
            'EmployeeTerritories', 'CustomerDemographics', 'CustomerCustomerDemo',
            'US_States'
        }

    def extract_text_from_pdf(self, pdf_path: str) -> List[Dict]:
        """Extract the text of every page of the PDF.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            One dict per page, in page order, with keys ``'page_number'``
            (1-based) and ``'content'`` (the page's extracted text).
        """
        # The context manager closes the document even when loading or
        # extracting a page raises.
        with pymupdf.open(pdf_path) as doc:
            return [
                {
                    'page_number': page_num + 1,
                    'content': doc.load_page(page_num).get_text()
                }
                for page_num in range(len(doc))
            ]

    def identify_table_sections(self, text: str) -> List[Dict]:
        """Line-by-line fallback that splits ``text`` into table sections.

        A stripped line starts a new section when it equals one of
        ``self.known_tables`` and is either the first line, preceded by a
        blank line, or preceded by a line containing 'relationships',
        'table.' or 'table)'. Every following line belongs to that section
        until the next header. Text before the first header is discarded.

        Returns:
            Dicts with ``'table_name'`` and ``'content'`` (stripped lines
            joined with newlines), in document order.
        """
        header_context = ('relationships', 'table.', 'table)')
        sections = []
        current_table = None
        current_lines = []
        previous_raw = None  # unstripped previous line; None before the first line

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            starts_section = line in self.known_tables and (
                previous_raw is None
                or previous_raw.strip() == ''
                or any(marker in previous_raw for marker in header_context)
            )

            if starts_section:
                # Close the section that was being collected, if any
                if current_table is not None:
                    sections.append({
                        'table_name': current_table,
                        'content': '\n'.join(current_lines).strip()
                    })
                current_table = line
                current_lines = [line]
            elif current_table is not None:
                current_lines.append(line)

            previous_raw = raw_line

        # Don't forget the last section
        if current_table is not None:
            sections.append({
                'table_name': current_table,
                'content': '\n'.join(current_lines).strip()
            })

        return sections

    def create_overview_chunk(self, pages_content: List[Dict]) -> "Document":
        """Create an overview chunk from the first two pages.

        A page is included when its lower-cased text contains 'overview',
        'entity-relationship' or 'northwind database schema'.

        Returns:
            A Document with ``section_type`` 'overview'. Its ``page_content``
            is empty when neither page qualifies.
        """
        keywords = ('overview', 'entity-relationship', 'northwind database schema')
        text_parts = []
        overview_pages = []

        for page in pages_content[:2]:
            lowered = page['content'].lower()
            if any(keyword in lowered for keyword in keywords):
                text_parts.append(page['content'] + "\n\n")
                overview_pages.append(page['page_number'])

        # Collapse runs of blank lines
        overview_text = re.sub(r'\n{3,}', '\n\n', ''.join(text_parts))

        return Document(
            page_content=overview_text.strip(),
            metadata={
                'section_type': 'overview',
                'table_name': 'schema_overview',
                'pages': overview_pages,
                'chunk_type': 'database_schema_overview'
            }
        )

    def create_table_chunks(self, pages_content: List[Dict]) -> List["Document"]:
        """Create individual chunks for each database table.

        All pages are concatenated (each followed by a blank line) and passed
        to ``extract_table_sections_advanced``. When a section carries
        ``'start'``/``'end'`` offsets, its pages are the ones whose text
        overlaps that span; otherwise the word-overlap heuristic of
        ``find_content_pages`` is used.

        Returns:
            One Document per section, with ``section_type``
            'table_description', the lower-cased table name in
            ``table_name`` and the original name in ``table_name_display``.
        """
        # Concatenate the pages, remembering the character span of each page's
        # own text (the "\n\n" separator is excluded from the span).
        page_spans = []  # (page_number, start, end) with end exclusive
        text_parts = []
        offset = 0
        for page in pages_content:
            page_text = page['content']
            page_spans.append((page['page_number'], offset, offset + len(page_text)))
            text_parts.append(page_text + "\n\n")
            offset += len(page_text) + 2
        full_text = ''.join(text_parts)

        chunks = []
        for section in self.extract_table_sections_advanced(full_text, pages_content):
            start = section.get('start')
            end = section.get('end')
            if start is not None and end is not None:
                section_pages = self._pages_for_span(start, end, page_spans)
            else:
                # Sections from the line-by-line fallback carry no offsets
                section_pages = self.find_content_pages(section['content'], pages_content)

            chunks.append(Document(
                page_content=self.clean_table_content(section['content']),
                metadata={
                    'section_type': 'table_description',
                    'table_name': section['table_name'].lower(),
                    'pages': section_pages,
                    'chunk_type': 'database_table',
                    'table_name_display': section['table_name']
                }
            ))

        return chunks

    @staticmethod
    def _pages_for_span(start: int, end: int, page_spans: List[tuple]) -> List[int]:
        """Return the page numbers whose (start, end) span overlaps [start, end)."""
        return [
            number
            for number, page_start, page_end in page_spans
            if start < page_end and end > page_start
        ]

    def extract_table_sections_advanced(self, full_text: str, pages_content: List[Dict]) -> List[Dict]:
        """Extract table sections with a regular expression.

        A section starts at a line holding only a known table name, must reach
        a ``Description:`` label before any other table-name line, and runs up
        to the next table-name line, the "Summary of Table Relationships"
        heading, or the end of the text. Matching is case-insensitive.

        Args:
            full_text: Text to search.
            pages_content: Unused; kept so existing callers keep working.

        Returns:
            Dicts with ``'table_name'`` (spelled as in ``known_tables``),
            ``'content'`` (stripped section text) and ``'start'``/``'end'``
            offsets of that content within ``full_text``. When the pattern
            finds nothing, the result of ``identify_table_sections`` is
            returned instead (those dicts have no offsets).
        """
        if not self.known_tables:
            return []

        # Sets iterate in arbitrary order: sort (longest first) and escape so
        # the alternation is deterministic and safe for any table name.
        names = '|'.join(
            re.escape(name)
            for name in sorted(self.known_tables, key=lambda name: (-len(name), name))
        )
        header_line = r'^[ \t]*(?:' + names + r')\s*\n'
        table_pattern = re.compile(
            r'^[ \t]*(' + names + r')\s*\n'
            # The gap before "Description:" may not step over another header
            # line, otherwise a bare name (e.g. in a table list) would swallow
            # the following table's section.
            r'(?:(?!' + header_line + r').)*?Description:'
            r'.*?(?=' + header_line + r'|^[ \t]*Summary of Table Relationships|\Z)',
            re.DOTALL | re.IGNORECASE | re.MULTILINE
        )
        canonical_names = {name.lower(): name for name in self.known_tables}

        sections = []
        for match in table_pattern.finditer(full_text):
            raw_section = match.group(0)
            content = raw_section.strip()
            # Offset of the stripped content inside full_text
            start = match.start() + (len(raw_section) - len(raw_section.lstrip()))
            matched_name = match.group(1)
            sections.append({
                'table_name': canonical_names.get(matched_name.lower(), matched_name),
                'content': content,
                'start': start,
                'end': start + len(content)
            })

        # If regex approach doesn't work well, fall back to line-by-line parsing
        if not sections:
            sections = self.identify_table_sections(full_text)

        return sections

    def clean_table_content(self, content: str) -> str:
        """Clean and format table content for better RAG performance.

        Puts a newline before each section label (Description:, Columns:,
        Primary Key:, Foreign Keys:, Relationships:), collapses runs of three
        or more newlines and runs of spaces, and rewrites leading bullet
        characters as "- ".
        """
        # Insert the label newlines first so the blank-line collapse below
        # also covers the newlines added here.
        content = re.sub(r'(Description:|Columns:|Primary Key:|Foreign Keys:|Relationships:)',
                         r'\n\1', content)

        # Remove excessive whitespace
        content = re.sub(r'\n{3,}', '\n\n', content)
        content = re.sub(r' {2,}', ' ', content)

        # Clean up bullet points and formatting
        content = re.sub(r'^\s*[•·]\s*', '- ', content, flags=re.MULTILINE)

        return content.strip()

    def find_content_pages(self, content: str, pages_content: List[Dict]) -> List[int]:
        """Heuristically find which pages contain the given content.

        The first 20 whitespace-separated words of ``content`` (lower-cased,
        de-duplicated) are compared with each page's words. A page is reported
        when it shares more than 30% of them.

        Returns:
            Matching page numbers in page order; empty when ``content`` has no
            words.
        """
        probe_words = set(content.lower().split()[:20])
        if not probe_words:
            return []

        threshold = len(probe_words) * 0.3
        return [
            page['page_number']
            for page in pages_content
            if len(probe_words & set(page['content'].lower().split())) > threshold
        ]

    def create_relationships_chunk(self, pages_content: List[Dict]) -> Optional["Document"]:
        """Create a chunk for the relationships summary section.

        Uses the first page containing 'Summary of Table Relationships' and
        takes the text from that heading to the end of that page.

        Returns:
            The Document, or None when no page contains the heading.
        """
        heading = 'Summary of Table Relationships'
        for page in pages_content:
            start_idx = page['content'].find(heading)
            if start_idx != -1:
                return Document(
                    page_content=page['content'][start_idx:],
                    metadata={
                        'section_type': 'relationships_summary',
                        'table_name': 'all_tables',
                        'pages': [page['page_number']],
                        'chunk_type': 'database_relationships'
                    }
                )
        return None

    def chunk_pdf(self, pdf_path: str) -> List["Document"]:
        """Main method to chunk the PDF into table-focused documents.

        Args:
            pdf_path: Path of the schema PDF.

        Returns:
            The overview chunk (omitted when no overview text was found), the
            table chunks, then the relationships chunk (when present).
        """
        # Extract text from PDF
        pages_content = self.extract_text_from_pdf(pdf_path)

        chunks = []

        # Create overview chunk; an empty one has nothing to embed, so skip it
        overview_chunk = self.create_overview_chunk(pages_content)
        if overview_chunk.page_content:
            chunks.append(overview_chunk)

        # Create individual table chunks
        chunks.extend(self.create_table_chunks(pages_content))

        # Create relationships summary chunk
        relationships_chunk = self.create_relationships_chunk(pages_content)
        if relationships_chunk:
            chunks.append(relationships_chunk)

        return chunks

# Usage example and helper functions
def prepare_chunks_for_vector_store(chunks: List["Document"],
                                   max_chunk_size: int = 2000,
                                   chunk_overlap: int = 200) -> List["Document"]:
    """
    Prepare chunks for vector store by ensuring they're not too large.

    Chunks whose text is empty or whitespace-only are dropped. Chunks of at
    most ``max_chunk_size`` characters are passed through unchanged. Larger
    chunks are split with ``RecursiveCharacterTextSplitter``; every piece keeps
    a copy of the parent's metadata plus a ``'sub_chunk'`` index.

    Args:
        chunks: Documents to prepare.
        max_chunk_size: Maximum number of characters per returned chunk.
        chunk_overlap: Characters shared between consecutive pieces of a split
            chunk. If it exceeds ``max_chunk_size`` it is reduced to half of
            ``max_chunk_size``, because the splitter rejects an overlap larger
            than the chunk size.

    Returns:
        The prepared documents, in input order.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive or ``chunk_overlap``
            is negative.
    """
    if max_chunk_size <= 0:
        raise ValueError(f"max_chunk_size must be positive, got {max_chunk_size}")
    if chunk_overlap < 0:
        raise ValueError(f"chunk_overlap must not be negative, got {chunk_overlap}")

    effective_overlap = chunk_overlap
    if effective_overlap > max_chunk_size:
        effective_overlap = max_chunk_size // 2

    text_splitter = None  # built only if some chunk actually needs splitting
    final_chunks = []

    for chunk in chunks:
        if not chunk.page_content.strip():
            # Nothing to embed
            continue

        if len(chunk.page_content) <= max_chunk_size:
            final_chunks.append(chunk)
            continue

        if text_splitter is None:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=max_chunk_size,
                chunk_overlap=effective_overlap,
                length_function=len,
                separators=["\n\n", "\n", " ", ""]
            )

        # split_documents already copies the parent's metadata onto each piece,
        # so only the position within the parent has to be added.
        for i, sub_chunk in enumerate(text_splitter.split_documents([chunk])):
            sub_chunk.metadata['sub_chunk'] = i
            final_chunks.append(sub_chunk)

    return final_chunks

def main(pdf_path: str) -> List[Document]:
    """
    Chunk the Northwind schema PDF and report what was produced.

    Args:
        pdf_path: Path to the Northwind schema PDF file

    Returns:
        List of Document objects ready for vector database ingestion
    """
    # Table-focused chunks, then size-limited for the vector store
    raw_chunks = NorthwindSchemaChunker().chunk_pdf(pdf_path)
    prepared = prepare_chunks_for_vector_store(raw_chunks)

    # One summary line per chunk: table name, section type, text length
    print(f"Created {len(prepared)} chunks:")
    for doc in prepared:
        meta = doc.metadata
        print(f"- {meta['table_name']} ({meta['section_type']}) "
              f"- {len(doc.page_content)} chars")

    return prepared


In [3]:
# Basic usage: chunk the schema PDF with a fresh chunker
chunker = NorthwindSchemaChunker()
chunks = chunker.chunk_pdf(pdf_path="data/Northwind_Traders_Database_Overview.pdf")

# Prepare for vector store: split anything longer than 2000 characters
final_chunks = prepare_chunks_for_vector_store(chunks=chunks, max_chunk_size=2000)

In [4]:
# Number of chunks that will be sent to the vector store.
# NOTE(review): the output recorded with this notebook is 2. Since chunk_pdf
# always emits an overview chunk, that leaves at most one per-table chunk -
# verify that the PDF's table headings match what the chunker looks for.
len(final_chunks)

2