In [1]:
%load_ext autoreload
%autoreload 2

import os
import pymupdf
from text import create_text_lines, TextWord, extract_words
from detect_language import detect_language_of_document
from bounding_box import expand_bbox,bbox_overlap,merge_bounding_boxes


In [32]:
# Input document: a NAB report expected under "<current working directory>/data/NAB".
filename = "Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF"
base_dir = os.getcwd()
pdf_path = os.path.join(base_dir, "data/NAB", filename)

In [24]:
def is_digitally_born(page: pymupdf.Page) -> bool:
    """Tell whether the page's bbox log contains at least one real text box.

    Args:
        page: The page whose ``get_bboxlog()`` entries are inspected.

    Returns:
        True if any entry is of type "fill-text" or "stroke-text" and has a
        non-empty rectangle, False otherwise.
    """
    text_box_types = ("fill-text", "stroke-text")
    return any(
        # Empty rectangles occur sometimes and must be ignored, e.g. SwissGeol 44191 page 37.
        box_type in text_box_types and not pymupdf.Rect(box_rect).is_empty
        for box_type, box_rect in page.get_bboxlog()
    )

In [40]:
def classify_by_text_density(words, page_size, density_threshold=0.00005, height_threshold=20):
    """
    Classify a page as text-heavy or image-heavy based on word count and word height.

    Args:
        words: List of TextWord objects; only ``word.rect.height`` is read.
        page_size: Tuple (width, height) of the page.
        density_threshold: Number of words per unit of page area that must be
            exceeded for a text page; pages at or below it are classified as "image".
        height_threshold: Average word height that must not be reached for a text
            page; if words are very large, the page may contain figures.

    Returns:
        "No Words on Page" if ``words`` is empty,
        "text" if the word density is above ``density_threshold`` and the average
        word height is below ``height_threshold``,
        otherwise "image".
    """
    if not words:
        return "No Words on Page"

    page_area = page_size[0] * page_size[1]
    text_density = len(words) / page_area  # Simple density: words per unit area
    avg_word_height = sum(word.rect.height for word in words) / len(words)

    if text_density > density_threshold and avg_word_height < height_threshold:
        return "text"
    return "image"

In [54]:
import numpy as np

def classify_textpos(words: list[TextWord]):
    """Print layout statistics for the words of a page and return them.

    Args:
        words: Words of one page; each word must expose ``rect`` with the
            coordinates ``x0``, ``y0``, ``x1`` and ``y1``.

    Returns:
        None if ``words`` is empty (after printing "Unknown"). Otherwise a dict
        with the printed metrics (``median_y_spacing``, ``median_x_spacing``,
        ``median_width``, ``width_std``, ``height_std``) and the boolean
        criteria derived from them (``is_structured``, ``is_narrow``,
        ``is_uniform_width``, ``is_uniform_height``).
    """
    if not words:
        print( "Unknown")
        return None

    # Extract positions and sizes of all word rectangles
    rects = [word.rect for word in words]
    y_positions = np.array([rect.y0 for rect in rects])
    x_positions = np.array([rect.x0 for rect in rects])
    widths = np.array([rect.x1 - rect.x0 for rect in rects])
    heights = np.array([rect.y1 - rect.y0 for rect in rects])

    # Compute spacing between words as gaps between consecutive sorted coordinates.
    # NOTE(review): words sharing the same y0 contribute zero gaps, which presumably
    # pulls the median towards 0 — confirm whether the spacing between distinct rows
    # was intended here.
    y_spacing = np.diff(np.sort(y_positions))  # Vertical distances
    x_spacing = np.diff(np.sort(x_positions))  # Horizontal distances

    # A single word yields no gaps; fall back to 0 in that case.
    median_y_spacing = np.median(y_spacing) if len(y_spacing) > 0 else 0
    median_x_spacing = np.median(x_spacing) if len(x_spacing) > 0 else 0
    median_width = np.median(widths)
    width_std = np.std(widths)
    height_std = np.std(heights)

    print(f"median_y_spacing: {median_y_spacing }")
    print(f"median width :{median_width }")
    print(f"width_std: {width_std }")
    print(f"median_x_spacing :{median_x_spacing }")
    print(f"height_std:{height_std}")
    print()

    # Criteria for classification, returned so callers can actually use them.
    return {
        "median_y_spacing": float(median_y_spacing),
        "median_x_spacing": float(median_x_spacing),
        "median_width": float(median_width),
        "width_std": float(width_std),
        "height_std": float(height_std),
        "is_structured": bool(median_y_spacing < 15),  # Small row spacing → structured layout
        "is_narrow": bool(median_width < 100),  # Small width suggests tabular data
        "is_uniform_width": bool(width_std < 10),  # Consistent column width → table-like
        "is_uniform_height": bool(height_std < 10),
    }

In [35]:
def cluster_drawings(drawings):
    """
    Cluster overlapping drawings into groups.

    Every drawing's "rect" is expanded by 5 units and then grouped with all
    boxes it overlaps. A box that overlaps several existing clusters joins them
    into a single cluster, so the result does not depend on the order of the
    drawings and no two returned clusters are linked by a common box.

    Args:
        drawings: Iterable of dicts that each provide a "rect" entry.

    Returns:
        A list with one ``merge_bounding_boxes`` result per cluster; empty if
        there are no drawings.
    """
    clusters = []
    for drawing in drawings:
        bbox = expand_bbox(pymupdf.Rect(drawing["rect"]), 5)

        kept = []  # clusters after this box has been processed
        merged = []  # the cluster that receives this box
        slot = None  # position of `merged` in `kept`, once it has been inserted
        for cluster in clusters:
            if any(bbox_overlap(bbox, pymupdf.Rect(member)) for member in cluster):
                # Fold every touched cluster into one, keeping the position of the first hit.
                merged.extend(cluster)
                if slot is None:
                    slot = len(kept)
                    kept.append(merged)
            else:
                kept.append(cluster)

        merged.append(bbox)
        if slot is None:
            kept.append(merged)  # no overlap at all: new cluster
        clusters = kept

    # Merge bounding boxes inside each cluster
    return [merge_bounding_boxes(cluster) for cluster in clusters]

In [55]:
with pymupdf.open(pdf_path) as doc:
    # Print the word-layout statistics of every page for manual inspection.
    for page_number, page in enumerate(doc, start=1):
        print(f"page number: {page_number}")
        words = extract_words(page, page_number)
        classify_textpos(words)
      
        
        

page number: 1
median_y_spacing: 0.0
median width :40.39642333984375
width_std: 61.94029531954457
median_x_spacing :4.016693115234375
height_std:12.810119532907168

page number: 2
Unknown
page number: 3
median_y_spacing: 0.0
median width :42.86698913574219
width_std: 53.28196824648505
median_x_spacing :3.397308349609375
height_std:11.10991055147811

page number: 4
median_y_spacing: 0.0
median width :27.64801025390625
width_std: 19.88149430955991
median_x_spacing :2.860107421875
height_std:6.4672553941189925e-06

page number: 5
median_y_spacing: 0.0
median width :23.231475830078125
width_std: 127.35898487953577
median_x_spacing :0.0113983154296875
height_std:0.7397917521457982

page number: 6
median_y_spacing: 0.0
median width :25.04754638671875
width_std: 63.67378070228419
median_x_spacing :1.24456787109375
height_std:0.4579388337095658

page number: 7
median_y_spacing: 0.0
median width :25.045379638671875
width_std: 20.105428833467094
median_x_spacing :0.8567047119140625
height_std:0.

In [37]:
def classify_digital(page, min_area=100000):
    """Classify a digitally born page as "image", "drawing" or "text".

    Args:
        page: Page providing ``get_images()``, ``get_drawings()`` and the
            0-based ``number`` attribute.
        min_area: Area a clustered drawing box must exceed for the page to
            count as a drawing page; smaller clusters are ignored.

    Returns:
        "image" if ``page.get_images()`` returns anything,
        "drawing" if any clustered drawing box has an area above ``min_area``,
        otherwise "text".
    """
    if page.get_images():
        return "image"

    drawings = page.get_drawings()
    if drawings:
        clustered_boxes = cluster_drawings(drawings)
        # Take the 1-based page number from the page itself instead of relying
        # on a global variable that happens to be set by the calling cell.
        page_number = page.number + 1
        print(f"Page number:{page_number}, drawings with length {len(drawings)}, clustered {len(clustered_boxes)}")

        # Filter out small bboxes: only a sufficiently large cluster makes this a drawing page.
        if any(cluster.get_area() > min_area for cluster in clustered_boxes):
            return "drawing"
    return "text"

In [None]:
with pymupdf.open(pdf_path) as doc:
    # Document-level results; neither value is used again inside this cell.
    text_lines = create_text_lines(doc)
    language = detect_language_of_document(doc)

    # Classify each page and report the result together with its origin.
    for page_index, page in enumerate(doc):
        page_number = page_index + 1
        page_size = (page.rect.width, page.rect.height)
        digitally_born = is_digitally_born(doc[page_index])

        if not digitally_born:
            # No text boxes in the bbox log: fall back to the word-density heuristic.
            words = extract_words(page, page_number)
            page_type = classify_by_text_density(words, page_size)
        else:
            page_type = classify_digital(page)

        print(f"Page {page_number}: {page_type}\nDigitally born: {digitally_born}\n")


Page number:1, drawings with length 7, clustered 2
Page 1: drawing
Digitally born: True

Page 2: blank page
Digitally born: False

Page number:3, drawings with length 6, clustered 1
Page 3: text
Digitally born: True

Page 4: text
Digitally born: True

Page 5: text
Digitally born: True

Page 6: text
Digitally born: True

Page 7: text
Digitally born: True

Page 8: image
Digitally born: True

Page 9: text
Digitally born: True

Page 10: image
Digitally born: True

Page 11: image
Digitally born: True

Page 12: image
Digitally born: True

Page 13: text
Digitally born: True

Page number:14, drawings with length 44, clustered 7
Page 14: drawing
Digitally born: True

Page 15: text
Digitally born: True

Page 16: text
Digitally born: True

Page number:17, drawings with length 1, clustered 1
Page 17: text
Digitally born: True

Page 18: text
Digitally born: True

Page 19: text
Digitally born: True

Page 20: text
Digitally born: True

Page 21: text
Digitally born: True

Page 22: image
Digitally born