# Parser and Chunker for financial related PDFS

In [1]:
%load_ext autoreload
%autoreload 2

# Import necessary modules
import sys
from pathlib import Path
import logging

# Add the project root to Python path so we can import our modules
project_root = Path.cwd().parent
sys.path.append(str(project_root))

# Configure logging with INFO prefix
logging.basicConfig(
    level=logging.INFO,
    format='INFO %(message)s'
)
logger = logging.getLogger(__name__)


import pdfplumber
import pandas as pd
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any
import re
from dataclasses import dataclass
import json

@dataclass
class DocumentChunk:
    content: str
    chunk_type: str  # 'text', 'table', 'financial_statement'
    metadata: Dict[str, Any]
    page_number: int
    bbox: tuple  # bounding box coordinates

class KleisterCharityParser:
    def __init__(self):
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        
        # Financial patterns for charity documents
        self.financial_patterns = {
            'income': r'(?:total\s+)?income[s]?\s*[:£]?\s*(\d{1,3}(?:,\d{3})*)',
            'expenditure': r'(?:total\s+)?(?:expenditure|spending|expenses?)\s*[:£]?\s*(\d{1,3}(?:,\d{3})*)',
            'assets': r'(?:total\s+)?assets?\s*[:£]?\s*(\d{1,3}(?:,\d{3})*)',
            'charity_number': r'(?:charity\s+)?(?:registration\s+)?(?:number|no\.?)\s*[:]\s*(\d{6,8})',
        }
        
        # Document section patterns
        self.section_patterns = {
            'trustees_report': r'(?:trustees?\s*report|report\s+of\s+the\s+trustees)',
            'financial_statements': r'(?:financial\s+statements?|statement\s+of\s+financial)',
            'income_statement': r'(?:statement\s+of\s+financial\s+activities|income\s+and\s+expenditure)',
            'balance_sheet': r'(?:balance\s+sheet|statement\s+of\s+financial\s+position)',
            'cash_flow': r'(?:cash\s+flow\s+statement|statement\s+of\s+cash\s+flows)',
            'notes': r'(?:notes?\s+to\s+the\s+financial\s+statements?)'
        }

    def parse_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Parse charity PDF into structured chunks"""
        chunks = []
        
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):

           
                logger.info("Page %d", page_num)

                # Extract text and tables
                page_text = page.extract_text()
                # tables = page.extract_tables()
                tables = self._extract_tables_robust(page)
                
                if page_text:
                    logger.info("└─ Text found")

                    # Section-aware text chunking
                    text_chunks = self._chunk_text_by_sections(
                        page_text, page_num
                    )
                    chunks.extend(text_chunks)
                
                # Process tables separately
                for table_idx, table in enumerate(tables):

                    if table:
                        logger.info("Table detected!")
                        table_chunk = self._process_table(
                            table, page_num, table_idx, page.bbox
                        )
                        if table_chunk:
                            chunks.append(table_chunk)
        
        return chunks

    def _chunk_text_by_sections(self, text: str, page_num: int) -> List[DocumentChunk]:
        """Chunk text based on document structure and semantic similarity"""
        chunks = []
        
        # 1. Identify document sections
        current_section = self._identify_section(text)

        logger.info(f"Current Section, {current_section}")
        
        # 2. Split into paragraphs, preserving structure
        # Creates a list of strings
        paragraphs = self._split_into_paragraphs(text)

        logger.info(f"Paragraphs, {len(paragraphs)}")
        
        # 3. Group paragraphs into chunks (semantic + size-based)
        current_chunk = ""
        current_entities = {}
        
        for para in paragraphs:
            # Check if adding this paragraph exceeds token limit
            potential_chunk = current_chunk + "\n" + para
            
            if self._count_tokens(potential_chunk) > 800 or self._is_section_break(para):
                if current_chunk.strip():
                    # Create chunk with extracted entities
                    entities = self._extract_entities(current_chunk)
                    
                    chunks.append(DocumentChunk(
                        content=current_chunk.strip(),
                        chunk_type='text',
                        metadata={
                            'section': current_section,
                            'entities': entities,
                            'document_type': 'charity_report'
                        },
                        page_number=page_num,
                        bbox=(0, 0, 0, 0)  # Full page for text
                    ))
                
                current_chunk = para
                current_section = self._identify_section(para)
            else:
                current_chunk = potential_chunk
        
        # Add final chunk
        if current_chunk.strip():
            entities = self._extract_entities(current_chunk)
            chunks.append(DocumentChunk(
                content=current_chunk.strip(),
                chunk_type='text',
                metadata={
                    'section': current_section,
                    'entities': entities,
                    'document_type': 'charity_report'
                },
                page_number=page_num,
                bbox=(0, 0, 0, 0)
            ))
        
        return chunks

    def _process_table(self, table: List[List], page_num: int, 
                      table_idx: int, page_bbox: tuple) -> DocumentChunk:
        """Process financial tables with structure preservation"""
        
        if not table or len(table) < 2:
            return None
            
        # Convert to DataFrame for easier processing
        df = pd.DataFrame(table[1:], columns=table[0])
        
        # Clean the table
        df = df.dropna(how='all').fillna('')
        
        # Identify table type
        table_type = self._classify_table(df)
        
        # Extract financial data
        financial_data = self._extract_financial_data_from_table(df)
        
        # Create structured representation
        table_content = {
            'table_type': table_type,
            'headers': table[0] if table else [],
            'data': df.to_dict('records'),
            'financial_summary': financial_data
        }
        
        # Create natural language description for vector search
        nl_description = self._table_to_natural_language(df, table_type)
        
        return DocumentChunk(
            content=nl_description,
            chunk_type='table',
            metadata={
                'table_data': table_content,
                'table_type': table_type,
                'financial_entities': financial_data,
                'structured_data': True
            },
            page_number=page_num,
            bbox=page_bbox
        )

    def _extract_tables_robust(self, page) -> List[List[List]]:
        """Try multiple table extraction strategies"""
        tables = []
        
        # Strategy 1: Default pdfplumber extraction
        default_tables = page.extract_tables()
        if default_tables:
            tables.extend([t for t in default_tables if t])
        
        # Strategy 2: More aggressive table settings
        try:
            aggressive_tables = page.extract_tables({
                "vertical_strategy": "lines_strict",
                "horizontal_strategy": "lines_strict",
                "intersection_tolerance": 3,
            })
            if aggressive_tables:
                # Add tables that weren't found by default method
                for table in aggressive_tables:
                    if table and table not in tables:
                        tables.append(table)
        except:
            pass
        
        # Strategy 3: Text-based table detection for space-separated data
        try:
            text_tables = self._detect_text_tables(page)
            tables.extend(text_tables)
        except:
            pass
        
        return tables

    def _detect_text_tables(self, page) -> List[List[List]]:
        """Detect tables in text-based layouts (space-separated columns)"""
        text = page.extract_text()
        if not text:
            return []
        
        lines = text.split('\n')
        tables = []
        current_table = []
        
        for line in lines:
            # Look for lines that look like table rows
            # Multiple numbers/currency amounts suggest a table row
            if self._looks_like_table_row(line):
                # Split by multiple spaces (column separator)
                columns = re.split(r'\s{3,}', line.strip())
                if len(columns) >= 2:  # At least 2 columns
                    current_table.append(columns)
            else:
                # End of table
                if len(current_table) >= 2:  # At least 2 rows
                    tables.append(current_table)
                current_table = []
        
        # Don't forget the last table
        if len(current_table) >= 2:
            tables.append(current_table)
        
        return tables

    def _looks_like_table_row(self, line: str) -> bool:
        """Check if a line looks like a table row"""
        # Look for patterns that suggest tabular data
        indicators = [
            len(re.findall(r'£\d+', line)) >= 2,  # Multiple currency amounts
            len(re.findall(r'\d+%', line)) >= 1,  # Percentages
            len(re.findall(r'\d{1,3}(?:,\d{3})*', line)) >= 3,  # Multiple large numbers
            bool(re.search(r'\s{5,}', line))  # Wide spacing (column gaps)
        ]
        
        return sum(indicators) >= 2

    def _extract_entities(self, text: str) -> Dict[str, Any]:
        """Extract financial entities using regex patterns"""
        entities = {}
        
        for entity_type, pattern in self.financial_patterns.items():
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                entities[entity_type] = matches
        
        return entities

    def _identify_section(self, text: str) -> str:
        """Identify document section type"""
        text_lower = text.lower()
        
        for section, pattern in self.section_patterns.items():
            if re.search(pattern, text_lower):
                return section
        
        return 'general'

    def _classify_table(self, df: pd.DataFrame) -> str:
        """Classify table type based on headers and content"""
        headers = [str(col).lower() for col in df.columns]
        
        if any('income' in h or 'revenue' in h for h in headers):
            return 'income_statement'
        elif any('asset' in h or 'liability' in h for h in headers):
            return 'balance_sheet'
        elif any('cash' in h or 'flow' in h for h in headers):
            return 'cash_flow'
        else:
            return 'general_financial'

    def _extract_financial_data_from_table(self, df: pd.DataFrame) -> Dict[str, float]:
        """Extract key financial metrics from table"""
        financial_data = {}
        
        # Look for numeric columns
        numeric_cols = df.select_dtypes(include=['number']).columns
        
        for col in numeric_cols:
            values = df[col].dropna()
            if not values.empty:
                financial_data[f"{col}_total"] = float(values.sum())
                financial_data[f"{col}_max"] = float(values.max())
        
        return financial_data

    def _table_to_natural_language(self, df: pd.DataFrame, table_type: str) -> str:
        """Convert table to natural language description for vector search"""
        
        description = f"This is a {table_type} table with the following information:\n"
        
        # Add column headers
        description += f"Columns: {', '.join(df.columns)}\n"
        
        # Add key insights
        numeric_cols = df.select_dtypes(include=['number']).columns
        for col in numeric_cols[:3]:  # Limit to top 3 numeric columns
            total = df[col].sum()
            description += f"Total {col}: £{total:,.2f}\n"
        
        # Add first few rows as examples
        description += "Sample data:\n"
        for i, row in df.head(3).iterrows():
            row_desc = ", ".join([f"{col}: {val}" for col, val in row.items() if str(val).strip()])
            description += f"- {row_desc}\n"
        
        return description


    def _clean_pdf_text(self, text: str) -> str:
        """Clean PDF text formatting issues"""
        
        # Step 1: Fix broken words across lines
        # Look for lowercase letter + newline + lowercase letter (likely broken word)
        text = re.sub(r'([a-z])\n([a-z])', r'\1\2', text)
        
        # Step 2: Fix missing spaces after periods/punctuation
        text = re.sub(r'([.!?])\n([A-Z])', r'\1 \2', text)
        
        # Step 3: Join lines that are clearly continuation of sentences
        # (lowercase after newline suggests continuation)
        text = re.sub(r'\n([a-z])', r' \1', text)
        
        # Step 4: Preserve actual paragraph breaks
        # Keep newlines before capital letters that start new sentences
        text = re.sub(r'\n([A-Z][a-z])', r'\n\n\1', text)
        
        # Step 5: Clean up excessive whitespace
        text = re.sub(r'\n{3,}', '\n\n', text)  # Max 2 consecutive newlines
        text = re.sub(r' {2,}', ' ', text)      # Max 1 space between words
        
        return text


    
    def _smart_paragraph_split(self, paragraph: str) -> List[str]:
        """Split overly long paragraphs using semantic cues"""
        
        if len(paragraph) < 500:  # Short enough already
            return [paragraph]
        
        # Look for natural break points
        sentences = re.split(r'([.!?]+\s+)', paragraph)
        
        paragraphs = []
        current_para = ""
        
        for i in range(0, len(sentences), 2):  # Step by 2 (sentence + delimiter)
            sentence = sentences[i]
            delimiter = sentences[i + 1] if i + 1 < len(sentences) else ""
            
            # Check if this sentence suggests a topic change
            topic_change_indicators = [
                r'^(However|Furthermore|Moreover|Additionally|In contrast)',
                r'^(The charity|The company|The organization)',
                r'^(During the year|In \d{4}|For the period)',
                r'^[A-Z][a-z]+ (activities|performance|report|statement)'
            ]
            
            is_topic_change = any(re.search(pattern, sentence.strip(), re.IGNORECASE) 
                                for pattern in topic_change_indicators)
            
            if is_topic_change and current_para and len(current_para) > 200:
                # Start new paragraph
                paragraphs.append(current_para.strip())
                current_para = sentence + delimiter
            else:
                current_para += sentence + delimiter
        
        # Add the final paragraph
        if current_para.strip():
            paragraphs.append(current_para.strip())
        
        return paragraphs

    # def _split_into_paragraphs(self, text: str) -> List[str]:
    #     """Split text into paragraphs while preserving structure"""
    #     # Split by double newlines, but also by section headers
    #     paragraphs = re.split(r'\n\s*\n', text)
    #     return [p.strip() for p in paragraphs if p.strip()]


    def _split_into_paragraphs(self, text: str) -> List[str]:
        """Split text into paragraphs while preserving structure and cleaning formatting"""
        
        # Step 1: Clean up the text formatting
        cleaned_text = self._clean_pdf_text(text)
        
        # Step 2: Split into paragraphs using multiple strategies
        paragraphs = []
        
        # Strategy 1: Split by actual paragraph breaks (double newlines)
        by_double_newline = re.split(r'\n\s*\n', cleaned_text)
        
        # Strategy 2: Split by sentence patterns + indentation
        for para in by_double_newline:
            # Further split long paragraphs that might contain multiple logical paragraphs
            sub_paragraphs = self._smart_paragraph_split(para.strip())
            paragraphs.extend(sub_paragraphs)
        
        # Filter out very short paragraphs (likely formatting artifacts)
        return [p.strip() for p in paragraphs if p.strip() and len(p.strip()) > 20]

    def _count_tokens(self, text: str) -> int:
        """Approximate token count"""
        return len(text.split())

    def _is_section_break(self, paragraph: str) -> bool:
        """Check if paragraph indicates a section break"""
        return any(re.search(pattern, paragraph, re.IGNORECASE) 
                  for pattern in self.section_patterns.values())

# Usage example
def process_kleister_dataset():
    parser = KleisterCharityParser()
    
    pdf_path = "../knowledge_base/data/raw/kleister_charity_pdfs/0d45add2d94d80a0eb85e41e22aa43a0.pdf"

    # Process a single document
    chunks = parser.parse_document(pdf_path)
    
    # Convert to format suitable for vector DB
    vector_ready_chunks = []
    for chunk in chunks:
        vector_ready_chunks.append({
            'content': chunk.content,
            'metadata': {
                **chunk.metadata,
                'page_number': chunk.page_number,
                'chunk_type': chunk.chunk_type
            },
            'embedding': None  # Will be generated by your vector DB
        })
    
    return vector_ready_chunks



vector_ready_chunks = process_kleister_dataset()

INFO Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO Use pytorch device: cpu
INFO Page 0
INFO └─ Text found
INFO Current Section, financial_statements
INFO Paragraphs, 2
INFO Page 1
INFO └─ Text found
INFO Current Section, financial_statements
INFO Paragraphs, 5
INFO Page 2
INFO └─ Text found
INFO Current Section, general
INFO Paragraphs, 17
INFO Page 3
INFO └─ Text found
INFO Current Section, general
INFO Paragraphs, 17
INFO Page 4
INFO └─ Text found
INFO Current Section, general
INFO Paragraphs, 4
INFO Page 5
INFO └─ Text found
INFO Current Section, general
INFO Paragraphs, 7
INFO Page 6
INFO └─ Text found
INFO Current Section, general
INFO Paragraphs, 4
INFO Page 7
INFO └─ Text found
INFO Current Section, general
INFO Paragraphs, 6
INFO Page 8
INFO └─ Text found
INFO Current Section, general
INFO Paragraphs, 11
INFO Page 9
INFO └─ Text found
INFO Current Section, financial_statements
INFO Paragraphs, 10
INFO Page 10
INFO └─ Text found
INFO Current Section, financial_state

In [None]:
%whos


Variable                   Type                    Data/Info
------------------------------------------------------------
Any                        _SpecialForm            typing.Any
Dict                       _SpecialGenericAlias    typing.Dict
DocumentChunk              type                    <class '__main__.DocumentChunk'>
KleisterCharityParser      type                    <class '__main__.KleisterCharityParser'>
List                       _SpecialGenericAlias    typing.List
Path                       type                    <class 'pathlib.Path'>
SentenceTransformer        type                    <class 'sentence_transfor<...>mer.SentenceTransformer'>
dataclass                  function                <function dataclass at 0x103a68ee0>
json                       module                  <module 'json' from '/Lib<...>hon3.9/json/__init__.py'>
pd                         module                  <module 'pandas' from '/U<...>ages/pandas/__init__.py'>
pdfplumber                 modul

In [7]:
# Helper function to inspect variables
from IPython.display import display
import pandas as pd

def inspect_variable(var):
    """Display detailed information about a variable"""
    if isinstance(var, list):
        print(f"List with {len(var)} items")
        if len(var) > 0:
            print("\nFirst item type:", type(var[0]).__name__)
            print("\nFirst 3 items:")
            for i, item in enumerate(var[:3]):
                print(f"\n[{i}]:", item)
    elif isinstance(var, dict):
        print(f"Dictionary with {len(var)} keys")
        print("\nKeys:", list(var.keys()))
    elif isinstance(var, pd.DataFrame):
        print("DataFrame Summary:")
        print("\nShape:", var.shape)
        print("\nColumns:", var.columns.tolist())
        display(var.head())
    else:
        print("Type:", type(var).__name__)
        print("\nValue:", var)

# Example usage:
inspect_variable(vector_ready_chunks)


List with 27 items

First item type: dict

First 3 items:

[0]: {'content': "Registered number: 02736320\nCharity number: 5013557\nACTION ON PRE-ECLAMPSIA LIMITED\n(A company limited by guarantee)\nUNAUDITED\nTRUSTEES' REPORT AND FINANCIAL STATEMENTS\nFOR THE YEAR ENDED 31 DECEMBER 2016", 'metadata': {'section': 'financial_statements', 'entities': {'charity_number': ['02736320', '5013557']}, 'document_type': 'charity_report', 'page_number': 0, 'chunk_type': 'text'}, 'embedding': None}

[1]: {'content': "ACTION ON PRE-ECLAMPSIA LIMITED\n(A company limited by guarantee)\nCONTENTS\nPage\nReference and administrative details ofthe charity, its trustees and advisers\nTrustees' report 2-9\nIndependent examiner's report 10-11\nStatement offinancial activities 12\nBalance sheet\n13\nNotes to the financial statements 14-25", 'metadata': {'section': 'financial_statements', 'entities': {}, 'document_type': 'charity_report', 'page_number': 1, 'chunk_type': 'text'}, 'embedding': None}

[2]: {'conte

In [4]:
print(type(vector_ready_chunks))
print(len(vector_ready_chunks))
print(vector_ready_chunks[3])

for chunk in vector_ready_chunks:
    print(chunk)

<class 'list'>
48
{'content': 'Notes to the financial statements 14-25', 'metadata': {'section': 'financial_statements', 'entities': {}, 'document_type': 'charity_report', 'page_number': 1, 'chunk_type': 'text'}, 'embedding': None}
{'content': 'Registered number: 02736320', 'metadata': {'section': 'financial_statements', 'entities': {'charity_number': ['02736320']}, 'document_type': 'charity_report', 'page_number': 0, 'chunk_type': 'text'}, 'embedding': None}
{'content': "Charity number: 5013557\nACTION ON PRE-ECLAMPSIA LIMITED\n(A company limited by guarantee)\nUNAUDITED\nTRUSTEES' REPORT AND FINANCIAL STATEMENTS\nFOR THE YEAR ENDED 31 DECEMBER 2016", 'metadata': {'section': 'financial_statements', 'entities': {'charity_number': ['5013557']}, 'document_type': 'charity_report', 'page_number': 0, 'chunk_type': 'text'}, 'embedding': None}
{'content': "ACTION ON PRE-ECLAMPSIA LIMITED\n(A company limited by guarantee)\nCONTENTS\nReference and administrative details ofthe charity, its trust