# Document Heading Extraction Improvements

This notebook focuses on improving the extraction of headings from various document types, with special attention to first page headings that are often missed due to different formatting or structural characteristics. We'll enhance the existing document analyzer to better identify and extract headings.

## 1. Import Required Libraries
First, let's import all the necessary libraries for document processing and heading extraction.

In [None]:
import os
import re
import json
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Any, Optional, Union

# Document processing libraries
from unstructured.partition.html import partition_html
from unstructured.partition.pdf import partition_pdf
from bs4 import BeautifulSoup
import PyPDF2
import tabula

# Word document processing
import docx
from docx.document import Document as DocxDocument
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.text.paragraph import Paragraph

# Image processing (for visual content extraction)
import pytesseract
from PIL import Image

# Visualization
import matplotlib.pyplot as plt
import networkx as nx
from IPython.display import display, Markdown, HTML

print("All required libraries imported successfully.")

## 2. Load and Analyze Document Structure

We'll build on the existing DocumentAnalyzer class to improve its heading detection capabilities. First, let's enhance the document structure analysis to better identify potential headings.

In [None]:
class EnhancedDocumentAnalyzer:
    """Analyze document structure, with extra care for heading detection.

    The class itself only holds configuration, running statistics and the
    extension-based dispatcher; the per-format ``_process_*`` handlers are
    looked up on the instance when ``load_document`` is called.
    """

    def __init__(self, debug_mode=False):
        """Create an analyzer.

        Args:
            debug_mode: When true, ``_log_debug`` prints its messages.
        """
        self.debug_mode = debug_mode
        # Running counters describing the headings found so far.
        self.stats = dict.fromkeys(
            ('total_headings', 'first_page_headings', 'missed_headings_recovered'),
            0,
        )

    def load_document(self, file_path: str) -> Dict:
        """Load a document and return its structured content.

        The handler is chosen from the lower-cased file extension.

        Args:
            file_path: Path to the document file.

        Returns:
            Whatever the selected ``_process_*`` handler returns.

        Raises:
            ValueError: If the extension is not one of the supported ones.
        """
        extension = os.path.splitext(file_path)[1].lower()

        # Map each supported extension to the name of the handler method.
        handler_by_extension = {
            '.pdf': '_process_pdf',
            '.html': '_process_html',
            '.htm': '_process_html',
            '.txt': '_process_text',
            '.md': '_process_text',
            '.docx': '_process_word',
            '.doc': '_process_word',
        }
        handler_name = handler_by_extension.get(extension)
        if handler_name is None:
            raise ValueError(f"Unsupported file format: {extension}")

        # Resolve the handler only now, at call time.
        return getattr(self, handler_name)(file_path)

    def _log_debug(self, message):
        """Print ``message`` with a DEBUG prefix when debug mode is on."""
        if not self.debug_mode:
            return
        print(f"DEBUG: {message}")

### 2.1 Enhanced PDF Processing

PDF documents often have headings with different formatting, especially on the first page (title page). Let's enhance the PDF processor to better detect these headings.

In [None]:
def _process_pdf(self, file_path: str) -> Dict:
    """Process a PDF document with enhanced heading detection.

    The first page is read with PyPDF2 and scanned for title-like lines,
    then the whole file is partitioned with ``unstructured`` and every
    element is classified (heading, table, image, strikeout, highlight or
    paragraph). Tables returned by ``tabula`` are appended afterwards. If
    the ``unstructured`` stage raises, the PyPDF2 fallback extractor is
    used instead.

    Args:
        file_path: Path to the PDF file.

    Returns:
        Dictionary with ``metadata`` and ``elements`` keys. If the PDF
        cannot be processed at all, a single paragraph element describing
        the error is appended instead of raising.
    """
    document_structure = {
        'metadata': {'source': file_path, 'type': 'pdf'},
        'elements': []
    }

    try:
        pdf_reader = PyPDF2.PdfReader(file_path)
        total_pages = len(pdf_reader.pages)

        self._log_debug(f"PDF has {total_pages} pages")

        # The first page gets its own line-based pass because title pages
        # are frequently formatted differently from the rest of the file.
        if total_pages > 0:
            first_page_content = pdf_reader.pages[0].extract_text()
            self._extract_first_page_headings(first_page_content, document_structure)

        # Now use unstructured for better extraction
        try:
            # Check if poppler is available
            import shutil
            if not shutil.which('pdftotext'):
                # NOTE(review): installing system packages at run time needs
                # root and a Debian-style system; any failure here is caught
                # below and routes processing to the PyPDF2 fallback.
                print("Warning: poppler-utils not found in PATH. Installing...")
                import subprocess
                subprocess.run(["apt-get", "update"], check=True)
                subprocess.run(["apt-get", "install", "-y", "poppler-utils"], check=True)
                print("poppler-utils installed successfully")

            # Read PDF and extract text with unstructured library
            elements = partition_pdf(
                filename=file_path,
                extract_images=True,
                infer_table_structure=True
            )

            # Extract tables separately using tabula
            tables = tabula.read_pdf(file_path, pages='all')

            # Track page numbers for better heading hierarchy detection
            current_page = 1
            page_heading_count = {}

            for element in elements:
                elem_type = str(type(element)).lower()
                element_text = str(element)

                # Element metadata is an object, not a dict, so the page
                # number has to be read as an attribute; calling ``.get`` on
                # it raises and would push every document to the fallback.
                page_number = getattr(
                    getattr(element, 'metadata', None), 'page_number', None
                )
                element_data = {
                    'content': element_text,
                    'type': None,  # stays None for element classes not matched below
                    'page': page_number
                }

                # Elements without a page number inherit the last known page.
                if page_number is not None:
                    current_page = page_number

                if 'title' in elem_type or 'heading' in elem_type:
                    element_data['type'] = 'heading'
                    element_data['level'] = self._infer_heading_level(element_text, current_page)

                    # Track headings by page
                    page_heading_count[current_page] = page_heading_count.get(current_page, 0) + 1

                    self.stats['total_headings'] += 1
                    if current_page == 1:
                        self.stats['first_page_headings'] += 1

                elif 'table' in elem_type:
                    element_data['type'] = 'table'
                elif 'image' in elem_type:
                    element_data['type'] = 'image'
                    # Try OCR on images to find possible headings in images
                    self._process_image_for_headings(element, document_structure)
                elif 'text' in elem_type:
                    # Plain text may still be a heading that the partitioner
                    # did not label as one.
                    heading_result = self._detect_heading_in_text(element_text, current_page)

                    if heading_result['is_heading']:
                        element_data['type'] = 'heading'
                        element_data['level'] = heading_result['level']
                        self.stats['total_headings'] += 1
                        self.stats['missed_headings_recovered'] += 1
                        if current_page == 1:
                            self.stats['first_page_headings'] += 1
                    # Check if it's strikeout or highlighted
                    elif '~~' in element_text or '--' in element_text:
                        element_data['type'] = 'strikeout'
                    elif any(marker in element_text for marker in ['**', '__', '>>']):
                        element_data['type'] = 'highlight'
                    else:
                        element_data['type'] = 'paragraph'

                document_structure['elements'].append(element_data)

            # Add tables from tabula to our elements list
            for i, table in enumerate(tables):
                document_structure['elements'].append({
                    'content': table,
                    'type': 'table',
                    'pandas_table': True,
                    'table_id': i
                })

            self._log_debug(f"Heading distribution by page: {page_heading_count}")

            # Post-process to ensure first page headings are properly represented
            self._ensure_first_page_headings(document_structure)

        except Exception as e:
            self._log_debug(f"Error in unstructured PDF processing: {e}")
            # Fallback method using PyPDF2 if unstructured fails
            self._process_pdf_fallback(pdf_reader, document_structure)

    except Exception as e:
        print(f"Error processing PDF: {e}")
        document_structure['elements'].append({
            'content': f"Error processing PDF file: {str(e)}",
            'type': 'paragraph',
        })

    return document_structure
    
# Attach the function above to the class so that load_document's
# `self._process_pdf(...)` call resolves on analyzer instances.
EnhancedDocumentAnalyzer._process_pdf = _process_pdf

### 2.2 Additional Helper Methods for PDF Heading Extraction

Let's implement the helper methods for enhanced heading detection in PDFs.

In [None]:
def _extract_first_page_headings(self, first_page_text: str, document_structure: Dict):
    """Scan the top of the first page for title-like lines.

    Only the first ten lines are considered. A line is recorded as a
    heading when it is all caps (longer than five characters), or when it
    is shorter than 60 characters, has no '.', '?' or '!', and either sits
    in the first three lines or ends with a colon.

    Args:
        first_page_text: Raw text of the first page; nothing happens when
            it is empty or None.
        document_structure: Structure whose ``elements`` list receives the
            detected headings.
    """
    if not first_page_text:
        return

    self._log_debug("Analyzing first page text for headings")

    sentence_marks = ('.', '?', '!')

    for position, raw_line in enumerate(first_page_text.split('\n')[:10]):
        candidate = raw_line.strip()

        # Blank and very short lines are never treated as headings.
        if len(candidate) <= 3:
            continue

        near_top = position < 3

        if candidate.isupper() and len(candidate) > 5:
            # All-caps lines: top level near the top, second level below.
            level = 1 if near_top else 2
        elif len(candidate) >= 60 or any(mark in candidate for mark in sentence_marks):
            # Long lines or sentence-like lines are body text.
            continue
        elif near_top:
            level = 1 if position == 0 else 2
        elif candidate.endswith(':'):
            level = 2
        else:
            continue

        document_structure['elements'].append({
            'content': candidate,
            'type': 'heading',
            'level': level,
            'page': 1,
            'extracted_from': 'first_page_analysis'
        })
        for counter in ('first_page_headings', 'total_headings', 'missed_headings_recovered'):
            self.stats[counter] += 1
        self._log_debug(f"Detected first page heading: {candidate}")

def _infer_heading_level(self, heading_text: str, page_number: int = 1) -> int:
    """Infer a heading level from the heading's text.

    On the first page a short (< 30 characters) all-caps heading is treated
    as the main title (level 1) and anything else as a subtitle (level 2).
    On later pages a leading section number decides the level ("1." -> 1,
    "1.1" -> 2, "1.1.1" -> 3); otherwise all-caps text is level 1 and
    everything else level 2.

    Args:
        heading_text: Text of the heading.
        page_number: 1-based page the heading was found on.

    Returns:
        The inferred heading level (1, 2 or 3).
    """
    heading_text = heading_text.strip()

    if page_number == 1:
        if len(heading_text) < 30 and heading_text.isupper():
            return 1  # Main title
        return 2  # Subtitle on first page

    # Test the deepest numbering first: "1.1.1" also matches the patterns
    # for "1.1" and "1.", so the reverse order could never return 2 or 3.
    if re.match(r'^\d+\.\d+\.\d+', heading_text):
        return 3
    if re.match(r'^\d+\.\d+', heading_text):
        return 2
    if re.match(r'^\d+\.', heading_text):
        return 1

    # All caps often indicates a higher-level heading.
    if heading_text.isupper():
        return 1

    # Default to level 2 for most detected headings
    return 2

def _detect_heading_in_text(self, text: str, page_number: int = 1) -> Dict:
    """Decide whether a piece of text looks like a heading.

    Args:
        text: Candidate text; surrounding whitespace is ignored.
        page_number: 1-based page the text was found on. The first page
            uses an additional, more permissive rule.

    Returns:
        ``{'is_heading': bool, 'level': int}``; ``level`` is 0 when the text
        is not considered a heading.
    """
    text = text.strip()
    result = {'is_heading': False, 'level': 0}

    # Empty text carries no heading; without this guard the permissive
    # first-page rule below would accept it.
    if not text:
        return result

    # Skip very long text - headings tend to be shorter
    if len(text) > 100:
        return result

    # Skip text with multiple sentences if not on first page. The dots of a
    # leading section number ("1.1.") are not sentence ends, so they are
    # removed before counting; otherwise "1.1. Title" could never reach the
    # numbered-heading rule further down.
    without_numbering = re.sub(r'^\d+(?:\.\d+)*\.?', '', text)
    if page_number > 1 and len(re.findall(r'[.!?]', without_numbering)) > 1:
        return result

    # Strong heading indicators
    if re.match(r'^(chapter|section|title|appendix)\s+\w+', text.lower()):
        result['is_heading'] = True
        result['level'] = 1
        return result

    # Numbered heading patterns
    if re.match(r'^\d+\.\s+[A-Z]', text):  # Like "1. TITLE"
        result['is_heading'] = True
        result['level'] = 1
        return result

    if re.match(r'^\d+\.\d+\.\s+[A-Z]', text):  # Like "1.1. Title"
        result['is_heading'] = True
        result['level'] = 2
        return result

    # Formatting indicators
    if text.isupper() and 5 < len(text) < 70:  # ALL CAPS text of reasonable length
        result['is_heading'] = True
        result['level'] = 1
        return result

    # First page special case: short standalone text without punctuation
    if page_number == 1:
        if len(text) < 80 and not any(p in text for p in ['.', ',', ';', '?', '!']):
            result['is_heading'] = True
            result['level'] = 1 if len(text) < 40 else 2
            return result

    return result

def _process_image_for_headings(self, image_element, document_structure: Dict):
    """Placeholder hook for finding headings inside images.

    No OCR is performed yet: both arguments are accepted but unused and the
    method only emits a debug message. A complete implementation would
    extract the image, run pytesseract on it and analyze the recognized
    text for potential headings.

    Args:
        image_element: The image element encountered during processing.
        document_structure: Structure that would receive any headings found.
    """
    placeholder_note = "Image processing for headings would occur here"
    self._log_debug(placeholder_note)

def _process_pdf_fallback(self, pdf_reader, document_structure: Dict):
    """Extract content from a PDF using only the page text from the reader.

    Each page's text is split on blank lines; every non-empty chunk is
    classified as a heading or a paragraph and appended to
    ``document_structure['elements']``.

    Args:
        pdf_reader: Reader object exposing ``pages``, each page providing
            ``extract_text()``.
        document_structure: Structure whose ``elements`` list is extended
            in place.
    """
    self._log_debug("Using PDF fallback method")

    # The main PDF processor runs the first-page analysis before it falls
    # back to this method; repeating it would duplicate those headings and
    # their statistics.
    first_page_already_analyzed = any(
        entry.get('extracted_from') == 'first_page_analysis'
        for entry in document_structure['elements']
    )

    for page_number, page in enumerate(pdf_reader.pages, start=1):
        # Guard against readers that yield None for pages without text.
        page_text = page.extract_text() or ''

        if page_number == 1 and not first_page_already_analyzed:
            self._extract_first_page_headings(page_text, document_structure)

        # Simple paragraph splitting on blank lines
        for chunk in page_text.split('\n\n'):
            para = chunk.strip()
            if not para:
                continue

            heading_info = self._detect_heading_in_text(para, page_number)

            if heading_info['is_heading']:
                document_structure['elements'].append({
                    'content': para,
                    'type': 'heading',
                    'level': heading_info['level'],
                    'page': page_number,
                    'extracted_from': 'fallback_method'
                })
                self.stats['total_headings'] += 1
                self.stats['missed_headings_recovered'] += 1
                # Keep the first-page counter in step with the main PDF path.
                if page_number == 1:
                    self.stats['first_page_headings'] += 1
            else:
                document_structure['elements'].append({
                    'content': para,
                    'type': 'paragraph',
                    'page': page_number
                })
                
def _ensure_first_page_headings(self, document_structure: Dict):
    """Guarantee that the first page contributes at least one heading.

    Elements whose ``page`` is 1 or missing count as first-page elements.
    If none of them is a heading, the first paragraph among them that is
    shorter than 80 characters and free of '.', ',', ';', '?' and '!' is
    promoted to a level-1 heading in place.

    Args:
        document_structure: Structure whose ``elements`` list is inspected
            and possibly modified.
    """
    def on_first_page(entry):
        page = entry.get('page')
        return page is None or page == 1

    entries = document_structure['elements']

    existing_headings = [
        entry for entry in entries
        if entry['type'] == 'heading' and on_first_page(entry)
    ]
    if existing_headings:
        return

    self._log_debug("No first page headings found, looking for potential candidates")

    candidates = [
        entry for entry in entries
        if entry['type'] == 'paragraph' and on_first_page(entry)
    ]
    punctuation = ('.', ',', ';', '?', '!')

    for candidate in candidates:
        body = candidate['content']
        if len(body) >= 80 or any(mark in body for mark in punctuation):
            continue

        # Promote only the first suitable paragraph, then stop.
        candidate.update(type='heading', level=1, extracted_from='fallback_first_page')
        for counter in ('first_page_headings', 'total_headings', 'missed_headings_recovered'):
            self.stats[counter] += 1
        self._log_debug(f"Converted paragraph to heading: {body}")
        return

# Attach the helper functions defined above to EnhancedDocumentAnalyzer as
# methods, so the `self._...` calls made by the PDF processor and by each
# other resolve on analyzer instances.
EnhancedDocumentAnalyzer._extract_first_page_headings = _extract_first_page_headings
EnhancedDocumentAnalyzer._infer_heading_level = _infer_heading_level
EnhancedDocumentAnalyzer._detect_heading_in_text = _detect_heading_in_text
EnhancedDocumentAnalyzer._process_image_for_headings = _process_image_for_headings
EnhancedDocumentAnalyzer._process_pdf_fallback = _process_pdf_fallback
EnhancedDocumentAnalyzer._ensure_first_page_headings = _ensure_first_page_headings

## 3. Enhance Heading Detection Logic for Different Document Types

Now let's enhance the processors for other document types, starting with the Word document processor, which needs similar improvements for heading detection.

In [None]:
def _process_word(self, file_path: str) -> Dict:
    """Process a Word document with improved heading detection.

    A paragraph is treated as a heading when, in order of preference,
    (1) its style name contains "heading", (2) it is one of the first five
    paragraphs, shorter than 100 characters, and bold or noticeably larger
    than the document's usual font size, or (3) it starts with a section
    number or a "Chapter N" / "Section N" prefix. Other paragraphs are
    tagged as highlight (any bold run), strikeout (any struck-through run)
    or plain paragraph. Tables are appended after all paragraphs.

    Args:
        file_path: Path to the Word document.

    Returns:
        Dictionary with ``metadata`` and ``elements`` keys. When loading or
        parsing fails, the returned structure holds a single paragraph
        element describing the error instead of raising.
    """
    from statistics import StatisticsError, median, mode

    def run_size_pt(run):
        """Return the run's explicit font size in points, or None if unset."""
        size = run.font.size
        if not size:
            return None
        # Fall back to converting from EMU (12700 per point) when the value
        # does not expose ``pt``.
        return size.pt if hasattr(size, 'pt') else size / 12700

    def numbered_level(text):
        """Return the level implied by a leading number/keyword, or 0."""
        # Deepest numbering first: "1.1.1" also matches the shallower
        # patterns, so testing "1." first would hide levels 2 and 3.
        if re.match(r'^\d+\.\d+\.\d+', text):
            return 3
        if re.match(r'^\d+\.\d+', text):
            return 2
        lowered = text.lower()
        if (re.match(r'^\d+\.', text) or
                re.match(r'^chapter\s+\d+', lowered) or
                re.match(r'^section\s+\d+', lowered)):
            return 1
        return 0

    def table_as_text(cell_rows, prefix):
        """Render rows of cell text as ' | '-separated lines after prefix."""
        return prefix + "".join(" | ".join(row) + "\n" for row in cell_rows)

    try:
        document_structure = {
            'metadata': {'source': file_path, 'type': 'docx'},
            'elements': []
        }

        # Load document with python-docx
        document = docx.Document(file_path)

        # Metadata is best-effort: a failure here must not abort processing.
        try:
            core_properties = document.core_properties
            document_structure['metadata']['title'] = core_properties.title
            document_structure['metadata']['author'] = core_properties.author
            document_structure['metadata']['created'] = str(core_properties.created)
            document_structure['metadata']['modified'] = str(core_properties.modified)
        except Exception as e:
            self._log_debug(f"Error extracting document properties: {e}")

        # First pass: gather explicit font sizes to estimate the body size.
        font_sizes = [
            size
            for paragraph in document.paragraphs
            for size in map(run_size_pt, paragraph.runs)
            if size is not None
        ]

        normal_font_size = None
        if font_sizes:
            try:
                normal_font_size = mode(font_sizes)
            except StatisticsError:
                # Older Python versions raise when there is no unique mode.
                normal_font_size = median(font_sizes)
            self._log_debug(f"Detected normal font size: {normal_font_size}")

        first_page_headings_found = False

        for i, paragraph in enumerate(document.paragraphs):
            text = paragraph.text.strip()
            if not text:
                continue

            is_heading = False
            heading_level = 0

            # Paragraph style is the most reliable signal.
            style = paragraph.style
            style_name = (style.name or '').lower() if style else ''

            if 'heading' in style_name:
                is_heading = True
                # "Heading 1" -> 1; styles without a number get a level
                # based on how early the paragraph appears.
                heading_match = re.search(r'heading\s+(\d+)', style_name)
                if heading_match:
                    heading_level = int(heading_match.group(1))
                else:
                    heading_level = 1 if i < 5 else 2
            else:
                # Short paragraph near the beginning: check its formatting.
                if i < 5 and len(text) < 100:
                    is_bold = any(run.bold for run in paragraph.runs)
                    # "Larger" means more than 20% above the usual size.
                    has_larger_font = bool(normal_font_size) and any(
                        size is not None and size > normal_font_size * 1.2
                        for size in map(run_size_pt, paragraph.runs)
                    )
                    if has_larger_font or is_bold:
                        is_heading = True
                        heading_level = 1 if i < 2 else 2

                # Numbered patterns such as "1.", "1.1", "Chapter 1".
                if not is_heading:
                    heading_level = numbered_level(text)
                    is_heading = heading_level > 0

            if is_heading:
                document_structure['elements'].append({
                    'content': text,
                    'type': 'heading',
                    'level': heading_level
                })
                self.stats['total_headings'] += 1

                # Assume the first ~10 paragraphs belong to the first page.
                if i < 10:
                    first_page_headings_found = True
                    self.stats['first_page_headings'] += 1
            else:
                if any(run.bold for run in paragraph.runs):
                    element_type = 'highlight'
                elif any(run.font.strike for run in paragraph.runs):
                    element_type = 'strikeout'
                else:
                    element_type = 'paragraph'
                document_structure['elements'].append({
                    'content': text,
                    'type': element_type
                })

        # Process tables
        for table in document.tables:
            cell_rows = [
                [cell.text.strip() for cell in row.cells]
                for row in table.rows
            ]
            # The first row is taken as the header row.
            headers = cell_rows[0] if cell_rows else []
            data = cell_rows[1:]

            try:
                if headers and data and len(headers) == len(data[0]):
                    df = pd.DataFrame(data, columns=headers)
                    document_structure['elements'].append({
                        'content': df,
                        'type': 'table',
                        'pandas_table': True
                    })
                else:
                    # Header-only or mismatched tables are kept as text.
                    document_structure['elements'].append({
                        'content': table_as_text(cell_rows, "Table content:\n"),
                        'type': 'table',
                        'pandas_table': False
                    })
            except Exception as e:
                self._log_debug(f"Error converting table to DataFrame: {e}")
                document_structure['elements'].append({
                    'content': table_as_text(cell_rows, "Table content (error converting):\n"),
                    'type': 'table',
                    'pandas_table': False
                })

        # If no first page headings found, try to identify one
        if not first_page_headings_found:
            self._ensure_word_first_page_heading(document_structure)

        return document_structure

    except ImportError:
        print("python-docx package not found. Please install it with 'pip install python-docx'")
        return {
            'metadata': {'source': file_path, 'type': 'docx'},
            'elements': [{
                'content': f"python-docx package required for Word processing: {file_path}",
                'type': 'paragraph',
            }]
        }
    except Exception as e:
        print(f"Error processing Word document: {e}")
        return {
            'metadata': {'source': file_path, 'type': 'docx'},
            'elements': [{
                'content': f"Error processing Word document: {file_path}. Error: {str(e)}",
                'type': 'paragraph',
            }]
        }

def _ensure_word_first_page_heading(self, document_structure: Dict):
    """Promote a likely title when a Word document's opening has no heading.

    The first ten elements stand in for the first page. If none of them is
    a heading, the first paragraph among them that is shorter than 80
    characters and does not end in '.', ',', ';', '?' or '!' becomes a
    level-1 heading. When the document metadata carries a title, its
    similarity to the promoted text is computed and a match above 0.7 is
    logged.

    Args:
        document_structure: Structure whose ``elements`` list is inspected
            and possibly modified in place.
    """
    opening = document_structure['elements'][:10]

    for entry in opening:
        if entry['type'] == 'heading':
            return  # a heading already exists near the top

    closing_marks = ('.', ',', ';', '?', '!')

    for entry in opening:
        if entry['type'] != 'paragraph':
            continue

        body = entry['content']
        if len(body) >= 80 or body.endswith(closing_marks):
            continue

        entry['type'] = 'heading'
        entry['level'] = 1
        for counter in ('first_page_headings', 'total_headings', 'missed_headings_recovered'):
            self.stats[counter] += 1
        self._log_debug(f"Converted first paragraph to heading: {body}")

        # The metadata title, when present, can corroborate the choice.
        doc_title = document_structure['metadata'].get('title')
        if doc_title and self._text_similarity(body, doc_title) > 0.7:
            self._log_debug(f"Confirmed as title by metadata match: {body}")
        return

def _text_similarity(self, text1, text2):
    """Return a word-overlap similarity score for two strings.

    Both strings are lower-cased and split on whitespace into word sets.
    The score is the number of shared words divided by the size of the
    smaller set, or 0 when either string contains no words.
    """
    tokens_a = set(text1.lower().split())
    tokens_b = set(text2.lower().split())

    if not (tokens_a and tokens_b):
        return 0

    shared = tokens_a & tokens_b
    smaller = min(len(tokens_a), len(tokens_b))
    return len(shared) / smaller

# Attach the Word-processing functions defined above to
# EnhancedDocumentAnalyzer as methods, so load_document's
# `self._process_word(...)` call and the helpers it uses resolve on instances.
EnhancedDocumentAnalyzer._process_word = _process_word
EnhancedDocumentAnalyzer._ensure_word_first_page_heading = _ensure_word_first_page_heading
EnhancedDocumentAnalyzer._text_similarity = _text_similarity

### 3.1 Enhanced HTML Processing

Let's improve the HTML processor to better detect headings, especially those near the top of the document, which play the role of first-page headings in HTML.

In [None]:
def _process_html(self, file_path: str) -> Dict:
    """Parse an HTML file into a document structure with heading recovery.

    Elements are collected in passes, not in document order: explicit
    h1-h6 headings (grouped by level), an optional title recovered from a
    header-like container, paragraphs with their inline highlights and
    strikeouts, tables, and finally images.  If no first-page heading was
    recovered along the way, the <title> metadata is inserted as a level-1
    heading at the front of the element list.

    Args:
        file_path: Path to the HTML file; it is read as UTF-8.

    Returns:
        Dict with a 'metadata' dict ('source', 'type' and, when present,
        'title', 'description', 'author') and an 'elements' list.  Each
        element has 'content' and 'type'; headings also carry 'level' and,
        when inferred rather than tagged, 'extracted_from'.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'lxml')
    document_structure = {
        'metadata': {'source': file_path, 'type': 'html'},
        'elements': []
    }
    metadata = document_structure['metadata']
    elements = document_structure['elements']
    title_punctuation = ('.', ',', ';', '?', '!')

    def count_recovered_heading():
        """Update stats for a first-page heading that was inferred, not tagged."""
        self.stats['first_page_headings'] += 1
        self.stats['total_headings'] += 1
        self.stats['missed_headings_recovered'] += 1

    # Metadata extraction is best-effort: a failure is logged, not raised.
    try:
        if soup.title is not None:
            # get_text() always returns a str; .string is None when <title>
            # contains nested markup.
            metadata['title'] = soup.title.get_text().strip()

        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc is not None:
            metadata['description'] = meta_desc.get('content', '')

        meta_author = soup.find('meta', attrs={'name': 'author'})
        if meta_author is not None:
            metadata['author'] = meta_author.get('content', '')
    except Exception as e:
        self._log_debug(f"Error extracting HTML metadata: {e}")

    # First pass: explicit heading elements (h1-h6), grouped by level.
    explicit_headings = []
    for heading_level in range(1, 7):
        for heading in soup.find_all(f'h{heading_level}'):
            heading_text = heading.get_text().strip()
            if heading_text:  # Skip empty headings
                elements.append({
                    'content': heading_text,
                    'type': 'heading',
                    'level': heading_level
                })
                explicit_headings.append(heading_text)
                self.stats['total_headings'] += 1

    # NOTE(review): explicit h1-h6 headings do not set this flag, so the
    # paragraph and <title> fallbacks below still run for documents that
    # already have an <h1> - confirm this is intended.
    found_first_heading = False

    # Second pass: without an <h1>, look for a title inside header-like
    # containers.  At this point `elements` holds only headings.
    if not any(h['level'] == 1 for h in elements if h['type'] == 'heading'):
        header_candidates = [
            soup.find('header'),
            soup.find('div', class_='header'),
            soup.find('div', id='header'),
            soup.find('div', class_='title'),
            soup.find('div', class_='main-title')
        ]

        for candidate in header_candidates:
            if candidate is None:
                continue

            # Skip if we already captured this text as an explicit heading
            candidate_text = candidate.get_text().strip()
            if not candidate_text or candidate_text in explicit_headings:
                continue

            # Prefer the first emphasised/nested element inside the container
            potential_title = candidate.find(['strong', 'b', 'span', 'div'])
            if potential_title is not None:
                title_text = potential_title.get_text().strip()
                if title_text and len(title_text) < 100:
                    elements.insert(0, {
                        'content': title_text,
                        'type': 'heading',
                        'level': 1,
                        'extracted_from': 'header'
                    })
                    found_first_heading = True
                    count_recovered_heading()
                    break

            # Otherwise fall back to the container's own text, if short enough
            if len(candidate_text) < 100:
                elements.insert(0, {
                    'content': candidate_text,
                    'type': 'heading',
                    'level': 1,
                    'extracted_from': 'header_text'
                })
                found_first_heading = True
                count_recovered_heading()
                break

    # Extract paragraphs
    elements_processed = 0
    for para in soup.find_all('p'):
        elements_processed += 1
        is_first_page = elements_processed < 10  # Simple heuristic for first page

        # Inline highlights and strikeouts are emitted before their paragraph
        for marked in para.find_all(['strong', 'b', 'mark', 'em']):
            elements.append({
                'content': marked.get_text(),
                'type': 'highlight',
            })
        for struck in para.find_all('s'):
            elements.append({
                'content': struck.get_text(),
                'type': 'strikeout',
            })

        para_text = para.get_text().strip()
        if not para_text:
            continue

        # Check if this might be a missed heading (early in document, short, etc.)
        if not found_first_heading and is_first_page:
            # Short paragraphs near the top without punctuation are likely headings
            if len(para_text) < 80 and not any(p in para_text for p in title_punctuation):
                elements.append({
                    'content': para_text,
                    'type': 'heading',
                    'level': 1 if elements_processed < 3 else 2,
                    'extracted_from': 'first_paragraph'
                })
                found_first_heading = True
                count_recovered_heading()
                continue

            # Check for a CSS class that suggests a heading.  Only the first
            # matching class is used, so the heading is emitted and counted
            # once and the paragraph is not also added as plain text.
            heading_class = next(
                (css_class.lower() for css_class in para.get('class', [])
                 if 'title' in css_class.lower() or 'heading' in css_class.lower()),
                None,
            )
            if heading_class is not None:
                elements.append({
                    'content': para_text,
                    'type': 'heading',
                    'level': 1 if 'title' in heading_class else 2,
                    'extracted_from': 'css_class'
                })
                found_first_heading = True
                count_recovered_heading()
                continue

        # Normal paragraph
        elements.append({
            'content': para_text,
            'type': 'paragraph',
        })

    # Extract tables
    for table in soup.find_all('table'):
        table_data = [
            [cell.get_text().strip() for cell in row.find_all(['td', 'th'])]
            for row in table.find_all('tr')
        ]
        if not table_data:
            continue

        # The first row is used as the header; ragged tables fall back to text.
        try:
            df = pd.DataFrame(table_data[1:], columns=table_data[0])
        except Exception as e:
            self._log_debug(f"Could not build DataFrame from HTML table: {e}")
            elements.append({
                'content': str(table_data),
                'type': 'table',
                'pandas_table': False
            })
        else:
            elements.append({
                'content': df,
                'type': 'table',
                'pandas_table': True
            })

    # Extract images
    for img in soup.find_all('img'):
        alt_text = img.get('alt', 'Image')
        src = img.get('src', '')
        elements.append({
            'content': f"{alt_text} (src: {src})",
            'type': 'image',
        })

        # Check if this image has a caption that is actually a heading
        parent = img.parent
        figcaption = None
        if parent is not None and parent.name in ('figure', 'div'):
            figcaption = parent.find('figcaption')
        if figcaption is not None:
            caption = figcaption.get_text().strip()
            # NOTE(review): elements_processed is the total <p> count by now,
            # so this only fires for documents with fewer than 5 paragraphs.
            if len(caption) < 80 and not found_first_heading and elements_processed < 5:
                elements.append({
                    'content': caption,
                    'type': 'heading',
                    'level': 1,
                    'extracted_from': 'image_caption'
                })
                found_first_heading = True
                count_recovered_heading()

    # If still no first page heading found, try to use document title
    if not found_first_heading and 'title' in metadata:
        title = metadata['title']
        if title and len(title) < 100:
            elements.insert(0, {
                'content': title,
                'type': 'heading',
                'level': 1,
                'extracted_from': 'metadata_title'
            })
            count_recovered_heading()

    return document_structure

# Attach the function defined above to EnhancedDocumentAnalyzer as a class
# attribute, so it can be called as a method on analyzer instances.
EnhancedDocumentAnalyzer._process_html = _process_html

### 3.3 Enhanced Plain Text/Markdown Processing

Let's improve the processing of plain text and markdown files to better identify headings.

In [None]:
def _process_text(self, file_path: str) -> Dict:
    """Parse a plain-text or markdown file into a document structure.

    The content is split into blocks on blank lines.  Within each block,
    leading heading lines are peeled off first: ATX headings ("# Title"),
    setext headings (a line underlined with "===" or "---"), and, for the
    very first line of the file only, a short line without punctuation that
    looks like a title.  Whatever remains of the block is classified as a
    table, strikeout, highlight, numbered/named heading or plain paragraph.

    Args:
        file_path: Path to the text file; it is read as UTF-8.

    Returns:
        Dict with a 'metadata' dict ('source', 'type') and an 'elements'
        list.  Each element has 'content' and 'type'; headings also carry
        'level' and, when inferred, 'extracted_from'.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    document_structure = {
        'metadata': {'source': file_path, 'type': 'text'},
        'elements': []
    }
    elements = document_structure['elements']

    # Determine if this is likely markdown
    is_markdown = file_path.lower().endswith('.md') or '# ' in content or '## ' in content

    title_punctuation = ('.', ',', ';', '?', '!')
    # Matches "1. Title", "1.2. Title" and "1.2.3 Title", but not "2023 was ..."
    numbered_heading = re.compile(r'^(\d+\.)+\d*\s+')
    named_heading = re.compile(r'^(Chapter|Section|Title|Part)\s+\d+', re.IGNORECASE)

    # The title heuristic only applies when the file's very first line has text.
    file_starts_with_text = bool(content.split('\n', 1)[0].strip())

    found_first_heading = False
    paragraph_count = 0

    # Blank (or whitespace-only) lines separate blocks.
    for raw_block in re.split(r'\n\s*\n', content):
        block = raw_block.strip()
        if not block:
            continue

        paragraph_count += 1
        # The first ten non-empty blocks approximate the "first page".
        is_first_page = paragraph_count <= 10
        title_candidate = paragraph_count == 1 and file_starts_with_text

        # Peel heading lines off the top of the block, then classify the rest.
        while block:
            head, _, rest = block.partition('\n')
            head = head.strip()
            underline, _, after_underline = rest.partition('\n')
            underline = underline.strip()
            is_title_line, title_candidate = title_candidate, False

            # ATX heading: only the first line is the heading text
            if head.startswith('#'):
                level = len(head) - len(head.lstrip('#'))
                elements.append({
                    'content': head.lstrip('#').strip(),
                    'type': 'heading',
                    'level': level
                })
                self.stats['total_headings'] += 1
                if is_first_page:
                    self.stats['first_page_headings'] += 1
                    found_first_heading = True
                block = rest.strip()
                continue

            # Setext heading: the title and its ===/--- underline share a block
            if underline and set(underline) in ({'='}, {'-'}):
                elements.append({
                    'content': head,
                    'type': 'heading',
                    'level': 1 if '=' in underline else 2
                })
                self.stats['total_headings'] += 1
                if is_first_page:
                    self.stats['first_page_headings'] += 1
                    found_first_heading = True
                block = after_underline.strip()  # Skip the underline row
                continue

            # First line of the file looks like a title: short, no punctuation.
            # Table rows are excluded so a header row stays with its table.
            if (is_title_line and '|' not in head and len(head) < 80
                    and not any(p in head for p in title_punctuation)):
                elements.append({
                    'content': head,
                    'type': 'heading',
                    'level': 1,
                    'extracted_from': 'first_line'
                })
                found_first_heading = True
                self.stats['first_page_headings'] += 1
                self.stats['total_headings'] += 1
                self.stats['missed_headings_recovered'] += 1
                # Drop the promoted line so it is not repeated as body text
                block = rest.strip()
                continue

            # Simple detection for markdown tables
            if '|' in block and '-+-' in block.replace('|', '+'):
                elements.append({
                    'content': block,
                    'type': 'table',
                    'pandas_table': False
                })
            # Strikeout text (~~text~~ in markdown)
            elif '~~' in block:
                elements.append({
                    'content': block,
                    'type': 'strikeout',
                })
            # Highlighted text (** or __ in markdown)
            elif '**' in block or '__' in block:
                elements.append({
                    'content': block,
                    'type': 'highlight',
                })
            # Numbered ("1.2.3 Title") or named ("Chapter 1") heading patterns
            elif is_first_page and not found_first_heading and (
                numbered_heading.match(block) or named_heading.match(block)
            ):
                elements.append({
                    'content': block,
                    'type': 'heading',
                    'level': 1 if paragraph_count <= 2 else 2,
                    'extracted_from': 'numbered_pattern'
                })
                self.stats['first_page_headings'] += 1
                self.stats['total_headings'] += 1
                self.stats['missed_headings_recovered'] += 1
                found_first_heading = True
            # Regular paragraph
            else:
                elements.append({
                    'content': block,
                    'type': 'paragraph',
                })
            break

    # If we haven't found a first page heading and this isn't markdown, try harder
    if not found_first_heading and not is_markdown:
        self._ensure_text_first_page_heading(document_structure)

    return document_structure

def _ensure_text_first_page_heading(self, document_structure: Dict):
    """Promote one early paragraph of a plain-text document to a heading.

    Scans the first five elements and converts the first paragraph that
    matches one of these rules, in this order of precedence per element:

    1. It is the very first element and shorter than 80 characters.
    2. It is written in ALL CAPS and is 6-79 characters long.
    3. It is padded with at least four spaces on both sides (a rough
       approximation of centered text); the padding is stripped from the
       resulting heading.  Paragraphs whose content was already stripped
       by the caller can never match this rule.

    The matching element is modified in place ('type', 'level',
    'extracted_from') and the heading statistics are updated.  At most one
    element is converted.

    Args:
        document_structure: Structure dict whose 'elements' list is
            inspected and possibly modified.
    """
    for i, element in enumerate(document_structure['elements'][:5]):
        if element['type'] != 'paragraph':
            continue

        text = element['content']

        # First paragraph is often a title in plain text documents
        if i == 0 and len(text) < 80:
            level, source, label = 1, 'first_paragraph', 'first'
        # ALL CAPS paragraphs are often headings
        elif text.isupper() and 5 < len(text) < 80:
            level, source, label = (1 if i < 2 else 2), 'all_caps', 'all caps'
        # Centered text, approximated by padding on both sides.  The padding
        # must be tested on the raw text: after rstrip() it can never be seen.
        elif text.startswith('    ') and text.endswith('    ') and text.strip():
            level, source, label = 1, 'centered_text', 'centered'
            text = text.strip()
            element['content'] = text
        else:
            continue

        element['type'] = 'heading'
        element['level'] = level
        element['extracted_from'] = source
        self.stats['first_page_headings'] += 1
        self.stats['total_headings'] += 1
        self.stats['missed_headings_recovered'] += 1
        self._log_debug(f"Converted {label} paragraph to heading: {text}")
        return

# Attach the functions defined above to EnhancedDocumentAnalyzer as class
# attributes, so they can be called as methods on analyzer instances.
EnhancedDocumentAnalyzer._process_text = _process_text
EnhancedDocumentAnalyzer._ensure_text_first_page_heading = _ensure_text_first_page_heading

## 4. Process Different Document Types

Now let's create a demonstration function that processes different document types and reports on the improvements in heading detection.