In [4]:
pip install pymupdf


Collecting pymupdf
  Using cached pymupdf-1.25.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Using cached pymupdf-1.25.1-cp39-abi3-macosx_11_0_arm64.whl (18.6 MB)
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install dataclasses


Collecting dataclasses
  Using cached dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Installing collected packages: dataclasses
Successfully installed dataclasses-0.6
Note: you may need to restart the kernel to use updated packages.


In [11]:
# NOTE(review): the PyPI project named `cv` does not appear to be OpenCV (`import cv2` is
# provided by the opencv-python* packages), so this install is presumably unnecessary —
# confirm and remove if nothing imports `cv`.
%pip install cv

Collecting cv
  Using cached cv-1.0.0-py3-none-any.whl.metadata (3.6 kB)
Using cached cv-1.0.0-py3-none-any.whl (7.3 kB)
Installing collected packages: cv
Successfully installed cv-1.0.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install opencv-python-headless

Collecting opencv-python-headless
  Downloading opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl.metadata (20 kB)
Downloading opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl (37.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: opencv-python-headless
Successfully installed opencv-python-headless-4.11.0.86
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install pdf2image

Collecting pdf2image
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Using cached pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install pytesseract

Collecting pytesseract
  Using cached pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Using cached pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [2]:
import fitz
from typing import Dict, List, Tuple
from dataclasses import dataclass
from pathlib import Path
from enum import Enum

In [5]:
class HeadingType(Enum):
    """Category assigned to a run of text extracted from a PDF page.

    Each member's value is its lowercase label. Members are declared from the
    most prominent heading level down to ordinary body text.
    """

    MAJOR = "major"      # most prominent heading level
    MINOR = "minor"      # second heading level
    SUB = "sub"          # third heading level
    NORMAL = "normal"    # anything not classified as a heading

@dataclass
class TextMetadata:
    """Text of one extracted run together with the styling used to classify it."""

    # The (possibly merged) text content of the run.
    text: str
    # Font name exactly as reported for the span, e.g. a name containing "Bold".
    font_name: str
    # Font size of the span, in the units reported by the PDF library.
    font_size: float
    # True when the font name contains a bold-like keyword.
    is_bold: bool
    # True when the font name contains an italic-like keyword.
    is_italic: bool
    # Colour value taken from the span's "color" entry.
    # NOTE(review): PyMuPDF appears to report this as a packed sRGB integer rather
    # than a tuple — confirm whether the `tuple` annotation is accurate.
    color: tuple
    # Bounding box of the span; index 1 is used elsewhere as the top y coordinate.
    coordinates: tuple
    # Heading category assigned to this run.
    block_type: HeadingType

class PDFMetadataExtractor:
    """Extract styled text runs from a PDF and classify them as headings or body text.

    The document is opened with PyMuPDF (``fitz``) when the instance is created.
    A configurable band at the top and bottom of every page is treated as
    header/footer and ignored. Within the remaining content area, spans are
    classified per page by comparing their font size against thresholds derived
    from that page's own font-size distribution.

    The class is a context manager; leaving the ``with`` block closes the document.
    """

    def __init__(self, pdf_path: str, header_height_percent: float = 10, footer_height_percent: float = 10):
        """Open the PDF and remember the header/footer band sizes.

        Args:
            pdf_path: Path to the PDF file.
            header_height_percent: Share of the page height (0-100), measured from
                the top, that is treated as header and skipped.
            footer_height_percent: Share of the page height (0-100), measured from
                the bottom, that is treated as footer and skipped.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        self.pdf_path = Path(pdf_path)
        if not self.pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        self.doc = fitz.open(pdf_path)
        self.header_height_percent = header_height_percent
        self.footer_height_percent = footer_height_percent

    def _get_page_regions(self, page: fitz.Page) -> Tuple[float, float, float]:
        """Return ``(header_bottom, footer_top, page_height)`` for ``page``.

        ``header_bottom`` and ``footer_top`` are y coordinates delimiting the
        content area, computed from the configured percentages.
        """
        page_height = page.rect.height
        header_bottom = page_height * (self.header_height_percent / 100)
        footer_top = page_height * (1 - self.footer_height_percent / 100)
        return header_bottom, footer_top, page_height

    def _is_in_content_area(self, y_position: float, header_bottom: float, footer_top: float) -> bool:
        """Return True when ``y_position`` lies between the header and footer bands (inclusive)."""
        return header_bottom <= y_position <= footer_top

    def _analyze_font_sizes(self, page: fitz.Page, header_bottom: float, footer_top: float) -> Dict[str, float]:
        """Derive per-page font-size thresholds from the spans in the content area.

        Returns:
            A dict with the keys ``"avg"``, ``"major"``, ``"minor"``, ``"sub"`` and
            ``"normal"``. All values are 0 when the content area holds no sized
            spans. ``"normal"`` is informational only; the classifier does not
            read it.

        Note:
            ``"major"`` is 90% of the largest size on the page, so on a page where
            every span has the same size all text meets the major threshold.
        """
        font_sizes = [
            span["size"]
            for block in page.get_text("dict").get("blocks", [])
            for line in block.get("lines", [])
            for span in line.get("spans", [])
            # A span is placed by the top edge of its bounding box (bbox[1]).
            if span.get("size")
            and self._is_in_content_area(span.get("bbox", (0, 0, 0, 0))[1], header_bottom, footer_top)
        ]

        if not font_sizes:
            # Same key set as the populated result so callers can index uniformly.
            return {"avg": 0, "major": 0, "minor": 0, "sub": 0, "normal": 0}

        avg_size = sum(font_sizes) / len(font_sizes)
        max_size = max(font_sizes)

        return {
            "avg": avg_size,
            "major": max_size * 0.9,  # Dynamic threshold closer to the max
            "minor": avg_size * 1.2,  # Minor headings often slightly larger than average
            "sub": avg_size * 1.05,   # Sub headings are slightly above the average
            "normal": avg_size * 0.8
        }

    def _determine_heading_type(
        self,
        font_size: float,
        is_bold: bool,
        thresholds: Dict[str, float],
        is_italic: bool = False
    ) -> HeadingType:
        """Classify a span from its font size and style.

        Args:
            font_size: Font size of the span.
            is_bold: Whether the span's font is bold.
            thresholds: Mapping with at least ``"major"``, ``"minor"`` and ``"sub"``.
            is_italic: Whether the span's font is italic. Defaults to False so
                existing three-argument callers keep working.

        Returns:
            MAJOR when the size reaches the major threshold; MINOR when it reaches
            the minor threshold and is bold; SUB when it reaches the sub threshold
            and is bold or italic; NORMAL otherwise.
        """
        if font_size >= thresholds["major"]:
            return HeadingType.MAJOR
        if font_size >= thresholds["minor"] and is_bold:
            return HeadingType.MINOR
        # Use the span's actual italic flag. Testing the bound method
        # ``self._is_italic`` here would always be truthy and would promote every
        # span above the sub threshold regardless of its style.
        if font_size >= thresholds["sub"] and (is_bold or is_italic):
            return HeadingType.SUB
        return HeadingType.NORMAL

    def _is_bold(self, font_name: str) -> bool:
        """Return True when ``font_name`` contains a bold-like keyword (case-insensitive)."""
        bold_indicators = ['bold', 'heavy', 'black', 'extrabold', 'ultrabold', 'demibold']
        lowered = font_name.lower()
        return any(indicator in lowered for indicator in bold_indicators)

    def _is_italic(self, font_name: str) -> bool:
        """Return True when ``font_name`` contains an italic-like keyword (case-insensitive)."""
        italic_indicators = ['italic', 'oblique']
        lowered = font_name.lower()
        return any(indicator in lowered for indicator in italic_indicators)

    def extract_page_metadata(self, page_num: int) -> List[TextMetadata]:
        """Extract classified text runs from one page.

        Spans outside the content area and spans with only whitespace are
        skipped. Within a block, consecutive spans are merged into one run when
        they share a heading type and their top edges are closer than 1.5 times
        the current span's font size. A merged run keeps the font, colour and
        bounding box of its first span.

        Args:
            page_num: Zero-based page index into the document.

        Returns:
            The runs of the page in reading order of the underlying blocks.
        """
        page = self.doc[page_num]
        header_bottom, footer_top, _ = self._get_page_regions(page)
        font_thresholds = self._analyze_font_sizes(page, header_bottom, footer_top)

        metadata_list = []
        dict_data = page.get_text("dict")

        for block in dict_data.get("blocks", []):
            # Runs never span blocks, so merge state is reset per block.
            last_y_pos = None
            current_metadata = None

            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    bbox = span.get("bbox", (0, 0, 0, 0))
                    y_pos = bbox[1]

                    if not self._is_in_content_area(y_pos, header_bottom, footer_top):
                        continue

                    text = span.get("text", "").strip()
                    if not text:
                        continue

                    font_name = span.get("font", "")
                    is_bold = self._is_bold(font_name)
                    is_italic = self._is_italic(font_name)
                    font_size = span.get("size", 0)

                    heading_type = self._determine_heading_type(
                        font_size,
                        is_bold,
                        font_thresholds,
                        is_italic
                    )

                    # Merge with the run in progress when vertically close and of the same type.
                    if (current_metadata is not None and
                        abs(y_pos - last_y_pos) < (font_size * 1.5) and
                        heading_type == current_metadata.block_type):

                        current_metadata.text += " " + text
                    else:
                        # Flush the finished run before starting a new one.
                        if current_metadata is not None:
                            metadata_list.append(current_metadata)

                        current_metadata = TextMetadata(
                            text=text,
                            font_name=font_name,
                            font_size=font_size,
                            is_bold=is_bold,
                            is_italic=is_italic,
                            color=span.get("color", (0, 0, 0)),
                            coordinates=bbox,
                            block_type=heading_type
                        )

                    last_y_pos = y_pos

            # Flush the last run of the block, if any.
            if current_metadata is not None:
                metadata_list.append(current_metadata)

        return metadata_list

    def extract_headings_explicit(self) -> Dict[int, Dict[str, List[str]]]:
        """Group the text of every page by heading type.

        Returns:
            A dict keyed by 1-based page number. Each value maps
            ``"major_headings"``, ``"minor_headings"``, ``"sub_headings"`` and
            ``"normal"`` to the list of run texts of that type, in page order.
        """
        bucket_for = {
            HeadingType.MAJOR: "major_headings",
            HeadingType.MINOR: "minor_headings",
            HeadingType.SUB: "sub_headings",
            HeadingType.NORMAL: "normal",
        }
        all_headings = {}

        for page_num in range(self.doc.page_count):
            page_buckets = {key: [] for key in bucket_for.values()}

            for meta in self.extract_page_metadata(page_num):
                key = bucket_for.get(meta.block_type)
                if key is not None:
                    page_buckets[key].append(meta.text)

            all_headings[page_num + 1] = page_buckets  # Page numbers are 1-indexed

        return all_headings

    def __enter__(self):
        """Return the extractor itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the underlying document when the ``with`` block ends."""
        self.doc.close()

# Usage Example
if __name__ == "__main__":
    pdf_path = "../Pre_Match Report New.pdf"  # Replace with your actual PDF file path

    # (title line, result key) pairs, in the order they are printed for each page.
    report_sections = (
        ("Major Headings:", "major_headings"),
        ("Minor Headings:", "minor_headings"),
        ("Sub Headings:", "sub_headings"),
        ("Normal Headings:", "normal"),
    )

    # The context manager closes the PDF once the report has been printed.
    with PDFMetadataExtractor(pdf_path, header_height_percent=5, footer_height_percent=5) as extractor:
        headings = extractor.extract_headings_explicit()
        for page_num, page_headings in headings.items():
            print(f"Page {page_num}:")
            for section_title, section_key in report_sections:
                print(section_title)
                for heading in page_headings[section_key]:
                    print(f"  - {heading}")
            

Page 1:
Major Headings:
  - Head To Head
  - Form Guide - SOUTH AFRICA
  - Form Guide - INDIA
  - Venue Insights
  - Last 5 Encounters
Minor Headings:
Sub Headings:
Normal Headings:
  - Team Matches Wins Draws Tied Win %
  - South Africa 42 17 10 0 40.48
  - India 42 15 10 0 35.71
  - Opponent Venue Results Match Date Margin
  - WI Johannesburg Won 08 Mar 2023 284 Runs
  - WI Centurion Won 28 Feb 2023 87 Runs
  - AUS Sydney Draw 04 Jan 2023 -
  - AUS Melbourne Lost 26 Dec 2022 By Inns & 182 Runs
  - AUS Brisbane Lost 17 Dec 2022 6 Wkts
  - Opponent Venue Results Match Date Margin
  - WI Port Of Spain, Trinidad Draw 20 Jul 2023 -
  - WI Roseau, Dominica Won 12 Jul 2023
  - By Inns And 141
  - Runs
  - AUS London Lost 07 Jun 2023 209 Runs
  - AUS Ahmedabad Draw 09 Mar 2023 -
  - AUS Indore Lost 01 Mar 2023 9 Wkts
  - *Average Score Is Calculated For Last 10 Matches
  - SuperSport Park, Centurion
  - Highest Team Total: 621/10 By South Africa Against Sri Lanka In 2020.
  - Lowest Team Tot