# Importing the libraries

In [57]:
import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image
import os
import pytesseract

# Defining the PDF table extractor class

In [58]:
class PDFTableExtractor:
    """Find table-like regions on the pages of a PDF and save them as PNG crops.

    Pipeline: the PDF is rasterised to page images, OCR word boxes and
    thresholded contours are combined to pick candidate regions, and each
    accepted region is cropped from the page and written under
    ``<output_dir>/page_<n>/``.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Root directory for the saved crops; created if missing.
        heading_margin: Pixels added above a detected region so that a heading
            sitting just above the table is included in the crop.
    """

    def __init__(self, pdf_path, output_dir="extracted_tables", heading_margin=200):
        self.pdf_path = pdf_path
        self.pages = None  # filled lazily by convert_pdf_to_images()
        self.heading_margin = heading_margin
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def convert_pdf_to_images(self):
        """Rasterise the PDF at 300 dpi and cache the pages on ``self.pages``.

        Returns:
            The list of page images, or None if the conversion raised
            (the error is printed, not propagated).
        """
        try:
            self.pages = convert_from_path(self.pdf_path, dpi=300)
            return self.pages
        except Exception as e:
            print(f"Error converting PDF to images: {str(e)}")
            return None

    def detect_text_regions(self, gray_image):
        """Run OCR and return confidently recognised words with their boxes.

        Args:
            gray_image: Image passed straight to ``pytesseract.image_to_data``.

        Returns:
            A list of ``(x, y, w, h, text)`` tuples for every OCR entry whose
            confidence is above 60 and whose stripped, lower-cased text is
            longer than 3 characters. Returns ``[]`` if OCR raised.
        """
        try:
            data = pytesseract.image_to_data(gray_image, output_type=pytesseract.Output.DICT)
            potential_headers = []

            for i, raw_text in enumerate(data.get('text', [])):
                # Parse with float(), not int(): a fractional confidence such as
                # '95.5' would make int() raise and silently discard every region.
                if float(data['conf'][i]) <= 60:
                    continue
                text = raw_text.strip().lower()
                if len(text) > 3:  # Filter out very short text
                    potential_headers.append((
                        data['left'][i],
                        data['top'][i],
                        data['width'][i],
                        data['height'][i],
                        text,
                    ))

            return potential_headers
        except Exception as e:
            print(f"Error detecting text regions: {str(e)}")
            return []

    def is_likely_table(self, contour, image_shape, text_regions):
        """Decide whether a contour looks like a table rather than page furniture.

        Args:
            contour: A contour as returned by ``cv2.findContours``.
            image_shape: ``(height, width)`` of the page image.
            text_regions: ``(x, y, w, h, text)`` tuples from ``detect_text_regions``.

        Returns:
            True when the contour is not a full-width band at the top/bottom
            edge of the page, has at least two OCR word origins inside its
            bounding box, and passes the shape/size limits below.
        """
        x, y, w, h = cv2.boundingRect(contour)
        area = cv2.contourArea(contour)
        rect_area = w * h
        perimeter = cv2.arcLength(contour, True)

        # Basic shape metrics
        circularity = 4 * np.pi * area / (perimeter * perimeter) if perimeter > 0 else 0
        rectangularity = area / rect_area if rect_area > 0 else 0
        aspect_ratio = w / h if h > 0 else 0
        relative_size = area / (image_shape[0] * image_shape[1])

        page_height, page_width = image_shape[0], image_shape[1]

        # A full-width band touching the top or bottom 10% of the page is
        # treated as a header/footer, not a table.
        is_full_width = w > page_width * 0.9
        is_at_edge = y < page_height * 0.1 or (y + h) > page_height * 0.9
        if is_full_width and is_at_edge:
            return False

        # Count OCR words whose top-left corner falls inside the bounding box.
        text_hits = sum(
            1
            for tx, ty, _tw, _th, _text in text_regions
            if x <= tx <= x + w and y <= ty <= y + h
        )

        # Require multiple text regions for a valid table
        if text_hits < 2:
            return False

        # Every region reaching this point contains text, so only the lenient
        # "region with headers" limits apply (the former stricter branch for
        # text-free regions could never run after the check above).
        return (
            circularity < 0.9 and
            rectangularity > 0.3 and
            0.2 < aspect_ratio < 25.0 and
            0.01 < relative_size < 0.8 and
            w > 50 and h > 50 and  # Minimum dimensions
            w < page_width * 0.95  # Should not span entire page width
        )

    def detect_tables(self, page_image):
        """Return candidate table boxes for one page, sorted top to bottom.

        Args:
            page_image: An RGB page image convertible with ``np.array``.

        Returns:
            A list of unique ``(x, y, w, h)`` tuples ordered by ``y``. Boxes
            that start below the top 10% of the page are extended upwards by
            up to ``heading_margin`` pixels. Returns ``[]`` if anything raised.
        """
        try:
            img_array = np.array(page_image)
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

            # Detect text regions first
            text_regions = self.detect_text_regions(gray)

            # Combine an adaptive threshold with a global Otsu threshold
            thresh_adaptive = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 3
            )
            _, thresh_binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            thresh = cv2.bitwise_or(thresh_adaptive, thresh_binary)

            # Morphological clean-up: close gaps, drop specks, then thicken
            kernel_small = np.ones((3, 3), np.uint8)
            kernel_large = np.ones((5, 5), np.uint8)

            thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel_small)
            thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel_small)
            thresh = cv2.dilate(thresh, kernel_large, iterations=1)

            # Find contours
            contours_ext, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            contours_tree, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

            # list(...) keeps the concatenation independent of whether the
            # binding hands back a list or a tuple; duplicate boxes are
            # collapsed by the set below.
            all_contours = list(contours_ext) + list(contours_tree)
            table_regions = set()
            min_area = 5000

            for contour in all_contours:
                if cv2.contourArea(contour) <= min_area:
                    continue
                if not self.is_likely_table(contour, gray.shape, text_regions):
                    continue

                x, y, w, h = cv2.boundingRect(contour)

                # Look for ruling lines inside the region: opening with a long
                # thin kernel keeps only strokes at least that long.
                roi = thresh[y:y+h, x:x+w]
                h_kernel_size = max(w//8, 20)
                v_kernel_size = max(h//8, 20)

                horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (h_kernel_size, 1))
                vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, v_kernel_size))

                horizontal_lines = cv2.morphologyEx(roi, cv2.MORPH_OPEN, horizontal_kernel)
                vertical_lines = cv2.morphologyEx(roi, cv2.MORPH_OPEN, vertical_kernel)

                # NOTE(review): np.sum adds pixel intensities rather than counting
                # pixels, so this threshold is very permissive - confirm intent.
                has_lines = (np.sum(horizontal_lines) > roi.size * 0.001 or
                             np.sum(vertical_lines) > roi.size * 0.001)

                text_in_region = [t for t in text_regions if
                                  x <= t[0] <= x + w and y <= t[1] <= y + h]

                # Require either lines or at least 3 text regions
                if has_lines or len(text_in_region) >= 3:
                    # Only reach upwards for a heading when not at the top of the page
                    if y > gray.shape[0] * 0.1:
                        new_top = max(0, y - self.heading_margin)
                        # Grow the height by the distance actually moved so the
                        # bottom edge stays put even when the top is clamped at 0.
                        h = h + (y - new_top)
                        y = new_top
                    table_regions.add((x, y, w, h))

            return sorted(list(table_regions), key=lambda x: x[1])

        except Exception as e:
            print(f"Error detecting tables: {str(e)}")
            return []

    def extract_text_from_image(self, image):
        """OCR an image and return the stripped text, or "" if OCR raised."""
        try:
            text = pytesseract.image_to_string(image)
            return text.strip()
        except Exception as e:
            print(f"Error extracting text from image: {str(e)}")
            return ""

    def is_region_overlapping(self, region, processed_regions):
        """Return True if ``region`` intersects any box in ``processed_regions``.

        Both are ``(x, y, w, h)`` boxes; boxes that merely touch along an edge
        count as overlapping.
        """
        x1, y1, w1, h1 = region
        for px, py, pw, ph in processed_regions:
            if not (x1 + w1 < px or px + pw < x1 or y1 + h1 < py or py + ph < y1):
                return True
        return False

    def extract_table_images(self):
        """Crop every detected table and save it under ``page_<n>`` folders.

        Pages are converted on demand. Regions overlapping an already saved
        region are skipped. A region whose OCR text contains
        "Last 5 Encounters" is merged with the regions below it that start
        within half its width horizontally, and saved as
        ``last_5_encounters_combined.png``; all other regions are saved as
        ``table_<k>.png``. Errors on one page are printed and the next page is
        processed.
        """
        if not self.pages:
            self.convert_pdf_to_images()
        if not self.pages:
            return

        for page_num, page in enumerate(self.pages):
            try:
                page_folder = os.path.join(self.output_dir, f"page_{page_num + 1}")
                os.makedirs(page_folder, exist_ok=True)

                table_regions = self.detect_tables(page)
                processed_regions = []
                table_counter = 1

                for i, (x, y, w, h) in enumerate(table_regions):
                    if self.is_region_overlapping((x, y, w, h), processed_regions):
                        continue

                    current_table = page.crop((x, y, x + w, y + h))
                    text = self.extract_text_from_image(current_table)

                    if "Last 5 Encounters" in text:
                        x_min, y_min = x, y
                        x_max, y_max = x + w, y + h

                        # Absorb later regions that sit below this one and are
                        # roughly left-aligned with it.
                        for next_x, next_y, next_w, next_h in table_regions[i + 1:]:
                            if next_y > y and abs(next_x - x) < w * 0.5:
                                x_min = min(x_min, next_x)
                                x_max = max(x_max, next_x + next_w)
                                y_max = max(y_max, next_y + next_h)

                        combined_image = page.crop((x_min, y_min, x_max, y_max))
                        combined_image.save(os.path.join(page_folder, "last_5_encounters_combined.png"))
                        # Recording the merged box makes the absorbed regions
                        # fail the overlap check when the loop reaches them.
                        processed_regions.append((x_min, y_min, x_max - x_min, y_max - y_min))
                    else:
                        current_table.save(os.path.join(page_folder, f"table_{table_counter}.png"))
                        processed_regions.append((x, y, w, h))
                        table_counter += 1

            except Exception as e:
                print(f"Error processing page {page_num + 1}: {str(e)}")
                continue

# Wrapping the class in a helper function

In [59]:
def process_pdf(pdf_path, output_dir="extracted_tables", heading_margin=90):
    """Extract table images from a PDF and report where they were written.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Root directory that receives the per-page folders of crops.
        heading_margin: Pixels above each table to include in the crop.

    Returns:
        A sorted list of the ``.png`` paths found under ``output_dir`` after
        the run (files left there by earlier runs are included), or ``[]``
        when no pages could be loaded from the PDF.
    """
    extractor = PDFTableExtractor(pdf_path, output_dir, heading_margin)
    extractor.extract_table_images()

    # `pages` stays empty when the PDF could not be converted; the extractor
    # has already printed the error, so do not claim that images were saved.
    if not extractor.pages:
        return []

    print(f"Table images saved in: {output_dir}")
    return sorted(
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(output_dir)
        for name in names
        if name.lower().endswith(".png")
    )

# Running the extraction and saving the table images

In [60]:
# from pdf_table_extractor_1_new import process_pdf

# Inputs for this run
pdf_path = "../Pre_Match Report New_removed (1).pdf"  # PDF to scan for tables
output_dir = "my_tables"  # folder that receives the extracted table images
heading_margin = 90  # pixels kept above each table so its heading is included

# Run the extraction; the name below keeps whatever process_pdf returns
extracted_images = process_pdf(
    pdf_path,
    output_dir=output_dir,
    heading_margin=heading_margin,
)
print(f"Extracted images saved in: {output_dir}")

Table images saved in: my_tables
Extracted images saved in: my_tables
