# Extracting Headings from a PDF Page

In [271]:
import fitz
from typing import Dict, List, Tuple
from dataclasses import dataclass
from pathlib import Path
from enum import Enum
from typing import Tuple, List

In [272]:
class HeadingType(Enum):
    """Classification assigned to a run of text extracted from a PDF page."""
    # Font size at or above the page's "major" threshold.
    MAJOR = "major"
    # Bold text at or above the page's "minor" threshold.
    MINOR = "minor"
    # Emphasised text at or above the page's "sub" threshold.
    SUB = "sub"
    # Everything else (body text).
    NORMAL = "normal"

@dataclass
class TextMetadata:
    """Text of one merged run of spans plus the styling of its first span."""
    # Text of the run; spans merged into the run are joined with a single space.
    text: str
    # Font name as reported for the first span of the run.
    font_name: str
    # Font size of the first span of the run.
    font_size: float
    # True when the font name contains a bold-weight keyword.
    is_bold: bool
    # True when the font name contains "italic" or "oblique".
    is_italic: bool
    # Value of the span's "color" entry, or (0, 0, 0) when the entry is missing.
    # NOTE(review): PyMuPDF appears to report span colour as a single sRGB
    # integer, so the tuple annotation may be inaccurate - confirm.
    color: tuple
    # Bounding box ("bbox") of the first span of the run.
    coordinates: tuple
    # Heading classification of the run.
    block_type: HeadingType

class PDFMetadataExtractor:
    """Extract styled text runs from a PDF and classify them as headings.

    Spans whose top edge lies in the header or footer band of a page are
    ignored. Heading thresholds are derived per page from the font sizes
    found in the remaining content area.

    The instance can be used as a context manager; leaving the ``with``
    block closes the underlying document.
    """

    def __init__(self, pdf_path: str, header_height_percent: float = 10, footer_height_percent: float = 10):
        """Open the PDF and remember the header/footer band sizes.

        Args:
            pdf_path: Path to the PDF file.
            header_height_percent: Share of the page height (0-100), measured
                from the top, that is treated as header.
            footer_height_percent: Share of the page height (0-100), measured
                from the bottom, that is treated as footer.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        self.pdf_path = Path(pdf_path)
        if not self.pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        self.doc = fitz.open(pdf_path)
        self.header_height_percent = header_height_percent
        self.footer_height_percent = footer_height_percent

    def _get_page_regions(self, page: fitz.Page) -> Tuple[float, float, float]:
        """Return ``(header_bottom, footer_top, page_height)`` for ``page``."""
        page_height = page.rect.height
        header_bottom = page_height * (self.header_height_percent / 100)
        footer_top = page_height * (1 - self.footer_height_percent / 100)
        return header_bottom, footer_top, page_height

    def _is_in_content_area(self, y_position: float, header_bottom: float, footer_top: float) -> bool:
        """Return True when ``y_position`` lies between header and footer (inclusive)."""
        return header_bottom <= y_position <= footer_top

    def _analyze_font_sizes(self, page: fitz.Page, header_bottom: float, footer_top: float) -> Dict[str, float]:
        """Derive per-page heading thresholds from the font sizes in the content area.

        Returns:
            A dict with the keys ``"avg"``, ``"major"``, ``"minor"`` and
            ``"sub"``. All values are 0 when the content area has no span
            with a non-zero size.
        """
        font_sizes = []
        dict_data = page.get_text("dict")

        for block in dict_data.get("blocks", []):
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    # bbox[1] is the top edge of the span.
                    y_pos = span.get("bbox", (0, 0, 0, 0))[1]
                    if self._is_in_content_area(y_pos, header_bottom, footer_top):
                        if span.get("size"):
                            font_sizes.append(span["size"])

        if not font_sizes:
            return {"avg": 0, "major": 0, "minor": 0, "sub": 0}

        avg_size = sum(font_sizes) / len(font_sizes)
        max_size = max(font_sizes)

        return {
            "avg": avg_size,
            "major": max_size * 0.9,  # Dynamic threshold closer to the max
            "minor": avg_size * 1.2,  # Minor headings often slightly larger than average
            "sub": avg_size * 1.05   # Sub headings are slightly above the average
        }

    def _determine_heading_type(
        self,
        font_size: float,
        is_bold: bool,
        thresholds: Dict[str, float],
        is_italic: bool = False
    ) -> HeadingType:
        """Classify a span from its font size and emphasis.

        Args:
            font_size: Font size of the span.
            is_bold: Whether the span's font is bold.
            thresholds: Result of ``_analyze_font_sizes`` for the page.
            is_italic: Whether the span's font is italic. Defaults to False
                so existing three-argument calls keep working.

        Returns:
            MAJOR for sizes at or above the major threshold, MINOR for bold
            text at or above the minor threshold, SUB for bold or italic text
            at or above the sub threshold, NORMAL otherwise.
        """
        if font_size >= thresholds["major"]:
            return HeadingType.MAJOR
        elif font_size >= thresholds["minor"] and is_bold:
            return HeadingType.MINOR
        # The emphasis flag must be the span's own italic state; testing the
        # bound method ``self._is_italic`` would always be truthy.
        elif font_size >= thresholds["sub"] and (is_bold or is_italic):
            return HeadingType.SUB
        return HeadingType.NORMAL

    def _is_bold(self, font_name: str) -> bool:
        """Return True when the font name contains a bold-weight keyword."""
        bold_indicators = ['bold', 'heavy', 'black', 'extrabold', 'ultrabold', 'demibold']
        return any(indicator in font_name.lower() for indicator in bold_indicators)

    def _is_italic(self, font_name: str) -> bool:
        """Return True when the font name contains "italic" or "oblique"."""
        italic_indicators = ['italic', 'oblique']
        return any(indicator in font_name.lower() for indicator in italic_indicators)

    def extract_page_metadata(self, page_num: int) -> List[TextMetadata]:
        """Extract classified text runs from one page.

        Consecutive spans of the same block are merged into one run when
        their top edges are less than 1.5 font sizes apart and they share the
        same heading type. The styling stored for a run is that of its first
        span.

        Args:
            page_num: 0-indexed page number.

        Returns:
            The runs found in the content area, in document order.
        """
        page = self.doc[page_num]
        header_bottom, footer_top, _ = self._get_page_regions(page)
        font_thresholds = self._analyze_font_sizes(page, header_bottom, footer_top)

        metadata_list = []
        dict_data = page.get_text("dict")

        for block in dict_data.get("blocks", []):
            # Runs never span two blocks, so the merge state restarts here.
            last_y_pos = None
            current_text = ""
            current_metadata = None

            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    bbox = span.get("bbox", (0, 0, 0, 0))
                    y_pos = bbox[1]

                    if not self._is_in_content_area(y_pos, header_bottom, footer_top):
                        continue

                    text = span.get("text", "").strip()
                    if not text:
                        continue

                    font_name = span.get("font", "")
                    is_bold = self._is_bold(font_name)
                    is_italic = self._is_italic(font_name)
                    font_size = span.get("size", 0)

                    heading_type = self._determine_heading_type(
                        font_size,
                        is_bold,
                        font_thresholds,
                        is_italic
                    )

                    # Check if this span should be merged with the previous one
                    if (current_metadata and
                        abs(y_pos - last_y_pos) < (font_size * 1.5) and
                        heading_type == current_metadata.block_type):

                        current_text += " " + text
                    else:
                        # Save the previous metadata if exists
                        if current_metadata:
                            current_metadata.text = current_text
                            metadata_list.append(current_metadata)

                        # Start a new text metadata
                        current_text = text
                        current_metadata = TextMetadata(
                            text=current_text,
                            font_name=font_name,
                            font_size=font_size,
                            is_bold=is_bold,
                            is_italic=is_italic,
                            color=span.get("color", (0, 0, 0)),
                            coordinates=bbox,
                            block_type=heading_type
                        )

                    last_y_pos = y_pos

            # Save the last metadata in the block if it exists
            if current_metadata:
                current_metadata.text = current_text
                metadata_list.append(current_metadata)

        return metadata_list

    def extract_headings_explicit(self) -> Dict[int, Dict[str, List[str]]]:
        """List the headings of every page, grouped by heading type.

        Returns:
            A dict keyed by 1-indexed page number. Each value has the keys
            ``"major_headings"``, ``"minor_headings"`` and ``"sub_headings"``
            holding the heading texts in document order.
        """
        key_for_type = {
            HeadingType.MAJOR: "major_headings",
            HeadingType.MINOR: "minor_headings",
            HeadingType.SUB: "sub_headings",
        }
        all_headings = {}

        for page_num in range(self.doc.page_count):
            # Page numbers are 1-indexed in the result.
            page_headings = {key: [] for key in key_for_type.values()}
            for meta in self.extract_page_metadata(page_num):
                key = key_for_type.get(meta.block_type)
                if key is not None:
                    page_headings[key].append(meta.text)
            all_headings[page_num + 1] = page_headings

        return all_headings

    def close(self) -> None:
        """Close the underlying PDF document."""
        self.doc.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

def extract_headings_from_page(
    pdf_path: str,
    page_number: int,
    header_height_percent: float = 5,
    footer_height_percent: float = 5
) -> Tuple[List[str], List[str], List[str]]:
    """
    Extracts major, minor, and sub-headings from a specific page of the PDF.

    Args:
        pdf_path (str): Path to the PDF file.
        page_number (int): 1-indexed page number to extract headings from.
        header_height_percent (float): Percentage of the page height to treat as header.
        footer_height_percent (float): Percentage of the page height to treat as footer.

    Returns:
        Tuple[List[str], List[str], List[str]]: Arrays of major, minor, and sub-headings.

    Raises:
        ValueError: If ``page_number`` is smaller than 1.
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    # A value below 1 would become a negative index after the 0-index
    # conversion and silently select a page counted from the end.
    if page_number < 1:
        raise ValueError(f"page_number must be >= 1 (1-indexed), got {page_number}")

    major_headings = []
    minor_headings = []
    sub_headings = []

    with PDFMetadataExtractor(pdf_path, header_height_percent, footer_height_percent) as extractor:
        # Extract metadata for the specific page (convert to 0-indexed)
        page_metadata = extractor.extract_page_metadata(page_number - 1)

        # Categorize the extracted metadata; NORMAL text is dropped.
        for meta in page_metadata:
            if meta.block_type == HeadingType.MAJOR:
                major_headings.append(meta.text)
            elif meta.block_type == HeadingType.MINOR:
                minor_headings.append(meta.text)
            elif meta.block_type == HeadingType.SUB:
                sub_headings.append(meta.text)

    return major_headings, minor_headings, sub_headings

# Extracting the Different Kinds of Headings from the PDF Page

In [273]:
# Demo cell: print the headings found on a single page of the report.
if __name__ == "__main__":
    pdf_path = "../Pre_Match Report New.pdf"  # Replace with your actual PDF file path
    page_number = 14  # Replace with the specific page number (1-indexed)

    # The helper returns three lists of heading texts: major, minor and sub.
    major, minor, sub = extract_headings_from_page(pdf_path, page_number)
    print(f"Major Headings: {major}")
    print(f"Minor Headings: {minor}")
    print(f"Sub Headings: {sub}")

Major Headings: ['Tests In SuperSport Park, Centurion']
Minor Headings: []
Sub Headings: ['India Bowling']


# Extracting Table From PDF


In [274]:
import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image
import os
import pytesseract
import re

In [351]:
class PDFTableExtractor:
    def __init__(self, pdf_path, output_dir="extracted_tables", heading_margin=200):
        self.pdf_path = pdf_path
        self.pages = None
        self.heading_margin = heading_margin
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def convert_pdf_to_images(self):
        """Convert PDF pages to images."""
        try:
            self.pages = convert_from_path(self.pdf_path, dpi=300)
            return self.pages
        except Exception as e:
            print(f"Error converting PDF to images: {str(e)}")
            return None

    def detect_text_regions(self, gray_image):
        """Detect confidently recognised words that might indicate table headers.

        Args:
            gray_image: Image passed to ``pytesseract.image_to_data``.

        Returns:
            A list of ``(x, y, w, h, text)`` tuples, one per word with an OCR
            confidence above 60 and more than three characters. ``text`` is
            stripped and lower-cased. An empty list is returned (and the
            error printed) when OCR fails.
        """
        try:
            data = pytesseract.image_to_data(gray_image, output_type=pytesseract.Output.DICT)
            potential_headers = []

            columns = zip(
                data['text'], data['conf'],
                data['left'], data['top'], data['width'], data['height'],
            )
            for raw_text, conf, x, y, w, h in columns:
                # The confidence may arrive as an int, a float or a string
                # such as "95.000000"; going through float() accepts all of
                # them, where int() alone would raise and discard every word
                # on the page.
                if int(float(conf)) > 60:  # Filter for confident text detection
                    text = raw_text.strip().lower()
                    if len(text) > 3:  # Filter out very short text
                        potential_headers.append((x, y, w, h, text))
            return potential_headers
        except Exception as e:
            print(f"Error detecting text regions: {str(e)}")
            return []

    def is_likely_table(self, contour, image_shape, text_regions):
        """Decide whether a contour looks like a table.

        Args:
            contour: Contour as returned by ``cv2.findContours``.
            image_shape: Shape of the page image; index 0 is the height and
                index 1 the width.
            text_regions: ``(x, y, w, h, text)`` tuples as produced by
                ``detect_text_regions``.

        Returns:
            True when the contour is not a full-width header/footer band,
            contains the top-left corner of at least two text regions and
            passes the shape and size limits below.
        """
        x, y, w, h = cv2.boundingRect(contour)
        page_height, page_width = image_shape[0], image_shape[1]

        # Headers and footers typically span the full width of the page and
        # sit in the top or bottom 10% of it.
        is_full_width = w > page_width * 0.9
        is_at_edge = y < page_height * 0.1 or (y + h) > page_height * 0.9
        if is_full_width and is_at_edge:
            return False

        # Require multiple text regions for a valid table. A region counts
        # when its top-left corner lies inside the contour's bounding box.
        text_count = sum(
            1
            for tx, ty, _tw, _th, _text in text_regions
            if x <= tx <= x + w and y <= ty <= y + h
        )
        if text_count < 2:
            return False

        # Basic shape metrics
        area = cv2.contourArea(contour)
        rect_area = w * h
        perimeter = cv2.arcLength(contour, True)
        circularity = 4 * np.pi * area / (perimeter * perimeter) if perimeter > 0 else 0
        rectangularity = area / rect_area if rect_area > 0 else 0
        aspect_ratio = w / h if h > 0 else 0
        relative_size = area / (page_height * page_width)

        # Every region that reaches this point contains text, so only the
        # lenient "with headers" limits apply; a stricter text-free variant
        # could never be reached after the check above.
        return (
            circularity < 0.9 and
            rectangularity > 0.3 and
            0.2 < aspect_ratio < 25.0 and
            0.01 < relative_size < 0.8 and  # Adjusted size constraints
            w > 50 and h > 50 and  # Minimum dimensions
            w < page_width * 0.95  # Should not span entire page width
        )

    def detect_tables(self, page_image):
        """Detect table regions on a rendered page.

        Args:
            page_image: Page image; it is converted with ``np.array`` and
                treated as RGB.

        Returns:
            A list of unique ``(x, y, w, h)`` regions sorted by ``y``. Regions
            that start below the top 10% of the page are extended upwards by
            ``self.heading_margin`` pixels. An empty list is returned (and
            the error printed) when detection fails.
        """
        try:
            img_array = np.array(page_image)
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

            # Detect text regions first
            text_regions = self.detect_text_regions(gray)

            # Combine an adaptive and an Otsu threshold
            thresh_adaptive = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 3
            )

            _, thresh_binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            thresh = cv2.bitwise_or(thresh_adaptive, thresh_binary)

            # Morphological operations
            kernel_small = np.ones((3, 3), np.uint8)
            kernel_large = np.ones((5, 5), np.uint8)

            thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel_small)
            thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel_small)
            thresh = cv2.dilate(thresh, kernel_large, iterations=1)

            # Find contours with both retrieval modes; duplicates collapse in
            # the set of bounding boxes below.
            contours_ext, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            contours_tree, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

            all_contours = contours_ext + contours_tree
            table_regions = set()
            min_area = 5000  # Increased minimum area

            for contour in all_contours:
                if cv2.contourArea(contour) > min_area and self.is_likely_table(contour, gray.shape, text_regions):
                    x, y, w, h = cv2.boundingRect(contour)

                    # Check for table structure
                    roi = thresh[y:y+h, x:x+w]
                    h_kernel_size = max(w//8, 20)
                    v_kernel_size = max(h//8, 20)

                    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (h_kernel_size, 1))
                    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, v_kernel_size))

                    horizontal_lines = cv2.morphologyEx(roi, cv2.MORPH_OPEN, horizontal_kernel)
                    vertical_lines = cv2.morphologyEx(roi, cv2.MORPH_OPEN, vertical_kernel)

                    # Require either lines or multiple text regions
                    has_lines = (np.sum(horizontal_lines) > roi.size * 0.001 or
                               np.sum(vertical_lines) > roi.size * 0.001)

                    text_in_region = [t for t in text_regions if
                                    x <= t[0] <= x + w and y <= t[1] <= y + h]

                    if has_lines or len(text_in_region) >= 3:  # Require at least 3 text regions
                        # Don't extend margin for headers if near page edge
                        if y > gray.shape[0] * 0.1:  # Only if not at top of page
                            # Grow the height by the distance the top edge
                            # actually moved, so clamping at 0 does not push
                            # the bottom edge below the table.
                            extended_y = max(0, y - self.heading_margin)
                            h = h + (y - extended_y)
                            y = extended_y
                        table_regions.add((x, y, w, h))

            return sorted(table_regions, key=lambda region: region[1])

        except Exception as e:
            print(f"Error detecting tables: {str(e)}")
            return []
    
    
    def extract_top_text_from_image(self, image, top_fraction=0.4):
        """
        Extract text from the top portion of an image.

        Args:
            image: PIL Image object containing the table
            top_fraction: Share of the image height, measured from the top,
                that is passed to OCR. Defaults to 0.4 (the top 40%).

        Returns:
            str: Extracted text from the top portion of the image, with
            whitespace collapsed and every character other than word
            characters, whitespace, dots and hyphens removed. An empty
            string is returned (and the error printed) on failure.
        """
        try:
            # The height is read from the image itself; converting the whole
            # image to an array just for its shape is unnecessary.
            top_portion_height = int(image.height * top_fraction)

            # Crop the top portion
            top_portion = image.crop((0, 0, image.width, top_portion_height))

            # Enhance the image for better text recognition
            # Convert to grayscale
            gray_image = top_portion.convert('L')

            # Apply a fixed threshold: pixels darker than 200 become black,
            # everything else white.
            threshold = 200
            binary_image = gray_image.point(lambda x: 0 if x < threshold else 255, '1')

            # Increase image size for better OCR results
            scale_factor = 2
            enlarged_image = binary_image.resize(
                (binary_image.width * scale_factor, binary_image.height * scale_factor),
                Image.Resampling.LANCZOS
            )

            # Perform OCR using pytesseract
            custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
            text = pytesseract.image_to_string(enlarged_image, config=custom_config)

            # Clean up the extracted text
            cleaned_text = ' '.join(text.split())  # Remove extra whitespace
            cleaned_text = re.sub(r'[^\w\s.-]', '', cleaned_text)  # Remove special characters except dots and hyphens

            return cleaned_text.strip()

        except Exception as e:
            print(f"Error in text extraction: {str(e)}")
            return ""
        
    
    def find_match_index(self, text, search_strings):
        """
        Locate the first search string that occurs inside ``text``.

        The comparison ignores case and all space characters on both sides.

        Parameters:
        text (str): The text to search in
        search_strings (list): Candidate strings, checked in order

        Returns:
        int: Position of the first matching candidate, or -1 if none matches
        """
        haystack = text.lower().replace(" ", "")
        matches = (
            position
            for position, candidate in enumerate(search_strings)
            if candidate.lower().replace(" ", "") in haystack
        )
        return next(matches, -1)
    
        
    def extract_team_name(self, text):
        """
        Extracts the team name from a text that starts with 'Form Guide' followed by a hyphen.

        Everything after the first hyphen is returned, so team names that
        themselves contain a hyphen are kept intact.

        Parameters:
        text (str): Input text that should contain 'Form Guide - TEAM'

        Returns:
        str: Team name after the first hyphen if the format matches, empty string otherwise
        """
        # Empty or missing text cannot match the format.
        if not text:
            return ""

        # Case-insensitive check of the "Form Guide" prefix
        if not text.lower().startswith("form guide"):
            return ""

        _prefix, hyphen, team = text.partition('-')
        return team.strip() if hyphen else ""
    
    
    def get_first_word(self, text):
        """
        Return the first whitespace-delimited word of ``text``.

        Parameters:
        text (str): The input text string (may be empty or None)

        Returns:
        str: The first word, or an empty string when there is none
        """
        if not text:
            return ""
        # split() without arguments already ignores surrounding whitespace.
        return next(iter(text.split()), "")

    
    def find_common_word_index(self, word, text_list):
        """
        Locate the first string in ``text_list`` that contains ``word`` as a
        whole whitespace-delimited word, ignoring case.

        Parameters:
        word (str): The word to search for
        text_list (list): Strings to search in, checked in order

        Returns:
        int: Position of the first string containing the word, or -1 if none does
        """
        needle = word.lower()
        tokenised = (entry.lower().split() for entry in text_list)
        hits = (position for position, tokens in enumerate(tokenised) if needle in tokens)
        return next(hits, -1)
    

    def is_region_overlapping(self, region, processed_regions):
        """Return True when ``region`` intersects any region in ``processed_regions``.

        Regions are ``(x, y, w, h)`` tuples; regions that merely touch along
        an edge or corner count as overlapping.
        """
        left, top, width, height = region
        right, bottom = left + width, top + height
        return any(
            right >= px and px + pw >= left and bottom >= py and py + ph >= top
            for px, py, pw, ph in processed_regions
        )
    
    
    def extract_text_from_image(self, image):
        """Run OCR over ``image`` and return the recognised text, stripped.

        Returns an empty string (and prints the error) when OCR fails.
        """
        try:
            return pytesseract.image_to_string(image).strip()
        except Exception as e:
            print(f"Error extracting text from image: {str(e)}")
            return ""
    
    

    def extract_table_images(self):
        """Extract table images and save them in one folder per page.

        Pages are rendered on demand. For every page the headings are read
        from ``self.pdf_path``, a page folder is created under
        ``self.output_dir`` and each detected table region is cropped and
        saved as a PNG named after the best matching heading ("None" when no
        heading matches). A region whose top text contains
        "Last 5 Encounters" is merged with the roughly aligned regions below
        it and saved as a single image.

        Errors on a page are printed and the page is skipped; nothing is
        returned.
        """
        if not self.pages:
            self.convert_pdf_to_images()
        if not self.pages:
            return

        def safe_name(name):
            # Replace path separators and characters that are invalid in file
            # names on common platforms, so heading text cannot break a path.
            return re.sub(r'[\\/:*?"<>|]', "_", name)

        for page_num, page in enumerate(self.pages):
            try:
                print("page_number : ", page_num)

                # Read the headings from this extractor's own PDF rather than
                # from a module-level variable.
                major, _minor, sub = extract_headings_from_page(self.pdf_path, page_num + 1)

                if page_num < 2:
                    # Keep only headings that yield a team name, so a later
                    # non-"Form Guide" heading cannot blank the second name.
                    team_names = [
                        name for name in map(self.extract_team_name, major) if name
                    ]
                    team_one = team_names[0] if team_names else ""
                    team_two = team_names[1] if len(team_names) > 1 else ""
                    if page_num == 0:
                        folder_name = f"page_{page_num + 1}_summary_of_{team_one}_vs_{team_two}"
                    else:
                        folder_name = f"page_{page_num + 1}_performance_of_venue_and_top_players_{team_one}_vs_{team_two}"
                elif major:
                    folder_name = f"page_{page_num+1}_{major[0]}"
                else:
                    # No major heading on this page: still process it.
                    folder_name = f"page_{page_num + 1}"

                page_folder = os.path.join(self.output_dir, safe_name(folder_name))
                os.makedirs(page_folder, exist_ok=True)

                table_regions = self.detect_tables(page)
                processed_regions = []
                table_counter = 1

                for i, (x, y, w, h) in enumerate(table_regions):
                    if self.is_region_overlapping((x, y, w, h), processed_regions):
                        continue

                    current_table = page.crop((x, y, x + w, y + h))
                    if not isinstance(current_table, Image.Image):
                        raise ValueError(f"Expected a PIL.Image.Image object, got {type(current_table)} instead.")

                    text = self.extract_top_text_from_image(current_table)

                    if "Last 5 Encounters" in text:
                        x_min, y_min = x, y
                        x_max, y_max = x + w, y + h

                        # Absorb later regions that start lower on the page
                        # and are roughly left-aligned with this one.
                        for next_x, next_y, next_w, next_h in table_regions[i + 1:]:
                            if next_y > y and abs(next_x - x) < w * 0.5:
                                x_min = min(x_min, next_x)
                                x_max = max(x_max, next_x + next_w)
                                y_max = max(y_max, next_y + next_h)

                        combined_image = page.crop((x_min, y_min, x_max, y_max))
                        # File name kept as before: it uses table_counter + 1
                        # and does not advance the counter.
                        combined_image.save(os.path.join(page_folder, f"{table_counter+1}_last_5_encounters.png"))
                        processed_regions.append((x_min, y_min, x_max - x_min, y_max - y_min))
                        continue

                    table_name = "None"
                    if page_num < 2:
                        # Space-insensitive match first, then an exact
                        # substring match against the major headings.
                        index = self.find_match_index(text, major)
                        if index == -1:
                            index = next((k for k, item in enumerate(major) if item in text), -1)
                        if index != -1:
                            table_name = major[index]
                    else:
                        # Match against sub headings, falling back to the
                        # first OCR word appearing in a sub heading.
                        index = self.find_match_index(text, sub)
                        if index == -1:
                            index = self.find_common_word_index(self.get_first_word(text), sub)
                        if index != -1:
                            table_name = sub[index]

                    current_table.save(os.path.join(page_folder, f"table_{table_counter}_{safe_name(table_name)}.png"))
                    processed_regions.append((x, y, w, h))
                    table_counter += 1

            except Exception as e:
                print(f"Error processing page {page_num + 1}: {str(e)}")
                continue

In [352]:
def process_pdf(pdf_path, output_dir="extracted_tables", heading_margin=93):
    """Extract all table images from ``pdf_path`` into ``output_dir``.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Directory that receives the per-page folders of images.
        heading_margin: Pixels above each table to include in the crop.

    Returns nothing; a confirmation line is printed when extraction ends.
    """
    PDFTableExtractor(pdf_path, output_dir, heading_margin).extract_table_images()
    print(f"Table images saved in: {output_dir}")

In [356]:
# Driver cell: run the table extraction for one report.
# from pdf_table_extractor_1_new import process_pdf

pdf_path = "../Pre_Match Report New.pdf"  # Path to your PDF file
output_dir = "SA vs IND"  # Directory to store extracted table images
heading_margin = 93  # Pixels above tables to include headings

# Process the PDF and extract tables
# NOTE: process_pdf has no return statement, so extracted_images is None;
# the images are only written to output_dir.
extracted_images = process_pdf(pdf_path, output_dir, heading_margin)
print(f"Extracted images saved in: {output_dir}")

page_number :  0
Head To Head
Head To Head

Form Guide - SOUTH AFRICA
Form Guide - SOUTH AFRICA
SOUTH AFRICA
Form Guide - INDIA
Form Guide - INDIA
INDIA
Venue Insights
Venue Insights

Last 5 Encounters
Last 5 Encounters

page_number :  1
Highest Successful Run Chase
Highest Successful Run Chase

Top Performance For Both Teams
Top Performance For Both Teams

page_number :  2
page_number :  3
page_number :  4
page_number :  5
page_number :  6
page_number :  7
page_number :  8
page_number :  9
page_number :  10
page_number :  11
page_number :  12
page_number :  13
page_number :  14
page_number :  15
page_number :  16
page_number :  17
page_number :  18
page_number :  19
Table images saved in: SA vs IND
Extracted images saved in: SA vs IND
