In [None]:
%load_ext autoreload
%autoreload 2

import os
import pymupdf
import numpy as np

from text import  TextWord, extract_words
from detect_language import detect_language_of_document
from bounding_box import cluster_drawings


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Input report and the locations derived from it; everything is resolved
# relative to the current working directory.
base_dir = os.getcwd()
filename = "Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF"
# Alternative input: filename = "ntb88-007.pdf"

# Name of the report without its extension; used as the output folder name.
report_stem, _report_ext = os.path.splitext(filename)

pdf_path = os.path.join(base_dir, "data/NAB", filename)
out_dir = os.path.join(base_dir, "data/test", report_stem)


In [8]:
def is_digitally_born(page: pymupdf.Page) -> bool:
    """Report whether the page contains at least one non-empty text box.

    The page's bbox log (``page.get_bboxlog()``) is scanned for entries whose
    type is ``"fill-text"`` or ``"stroke-text"`` and whose rectangle is not
    empty.

    Args:
        page: The PyMuPDF page to inspect.

    Returns:
        True as soon as one such entry is found, False otherwise.
    """
    text_box_types = ("fill-text", "stroke-text")
    # Empty rectangles occur occasionally (e.g. SwissGeol 44191 page 37) and
    # must not count as evidence of text.
    return any(
        box_type in text_box_types and not pymupdf.Rect(bbox).is_empty
        for box_type, bbox in page.get_bboxlog()
    )

In [14]:
def classify_by_text_density(words, page_size, density_threshold=0.0001, height_threshold=20):
    """Classify a page as text-heavy or image-heavy based on its word distribution.

    Args:
        words: List of TextWord objects; only ``word.rect.height`` is read.
        page_size: Tuple ``(width, height)`` of the page.
        density_threshold: The word density (number of words divided by the
            page area) must be strictly above this value for a "text" page.
            Raising it makes the "image" verdict more likely.
        height_threshold: The average word height must be strictly below this
            value for a "text" page; very large words suggest the page
            contains figures.

    Returns:
        "No Words on Page" if ``words`` is empty, "text" if both the density
        and the height criteria are met, otherwise "image".
    """
    if not words:
        return "No Words on Page"

    word_count = len(words)
    page_area = page_size[0] * page_size[1]

    # Simple density: words per unit of page area.
    text_density = word_count / page_area
    avg_word_height = sum(word.rect.height for word in words) / word_count

    is_text_page = text_density > density_threshold and avg_word_height < height_threshold
    return "text" if is_text_page else "image"

In [10]:

def classify_textpos(words: list[TextWord]):
    """Print statistics on how the words are positioned and sized on a page.

    Nothing is returned; all figures are written to stdout. For an empty word
    list only ``Unknown`` is printed.

    Args:
        words: The words of one page; each word's ``rect`` (x0, y0, x1, y1)
            is read.
    """
    if not words:
        print("Unknown")
        return

    def median_gap(coordinates):
        # Median distance between consecutive sorted coordinates (0 if there is no gap).
        gaps = np.diff(np.sort(coordinates))
        return np.median(gaps) if len(gaps) > 0 else 0

    rects = [word.rect for word in words]

    # Top-left corner coordinates and box dimensions of every word.
    tops = np.array([rect.y0 for rect in rects])
    lefts = np.array([rect.x0 for rect in rects])
    widths = np.array([rect.x1 - rect.x0 for rect in rects])
    heights = np.array([rect.y1 - rect.y0 for rect in rects])

    # Spacing between words: vertical and horizontal distances.
    median_y_spacing = median_gap(tops)
    median_x_spacing = median_gap(lefts)

    width_std = np.std(widths)
    height_std = np.std(heights)

    print(f"median_y_spacing: {median_y_spacing }")
    print(f"median width :{np.median(widths) }")
    print(f"width_std: {width_std }")
    print(f"median_x_spacing :{median_x_spacing }")
    print(f"height_std:{height_std}")
    print()

In [20]:
# Classify every page using only the word-density heuristic.
with pymupdf.open(pdf_path) as doc:
    for page_number, page in enumerate(doc, start=1):  # page numbers are 1-based
        page_size = (page.rect.width, page.rect.height)
        words = extract_words(page, page_number)

        print(classify_by_text_density(words, page_size))
        # Optional: classify_textpos(words) additionally prints layout statistics.

image
No Words on Page
image
text
text
text
text
image
text
image
image
text
text
text
text
text
text
text
text
text
text
text
text
text
text
text
image
No Words on Page
image
No Words on Page
text
image
text
text


In [25]:
def classify_digital(page):
    """Label a page as "image", "drawing" or "text" from its embedded content.

    Args:
        page: Page object exposing ``get_images()`` and ``get_drawings()``.

    Returns:
        "image" if ``page.get_images()`` is non-empty; otherwise "drawing" if
        at least one cluster returned by ``cluster_drawings`` has an area
        above 100000; otherwise "text".
    """
    if page.get_images():
        return "image"

    drawings = page.get_drawings()
    # Small clusters are filtered out: only one exceeding the area threshold counts.
    has_large_cluster = bool(drawings) and any(
        cluster.get_area() > 100000 for cluster in cluster_drawings(drawings)
    )
    return "drawing" if has_large_cluster else "text"

In [26]:
# Classify every page of the PDF as image, drawing or text.
with pymupdf.open(pdf_path) as doc:

    # Not used further in this cell.
    language = detect_language_of_document(doc)

    for page_number, page in enumerate(doc, start=1):  # page numbers are 1-based
        # Reuse the page object supplied by the iterator instead of loading
        # the same page a second time through doc[...].
        digitally_born = is_digitally_born(page)

        if digitally_born:
            page_type = classify_digital(page)
        else:
            # No text boxes found on the page: fall back to the word-density
            # heuristic on whatever words can be extracted.
            page_size = (page.rect.width, page.rect.height)
            words = extract_words(page, page_number)
            page_type = classify_by_text_density(words, page_size)

        print(f"Page {page_number}: {page_type}")


Page 1: drawing
Page 2: No Words on Page
Page 3: text
Page 4: text
Page 5: text
Page 6: text
Page 7: text
Page 8: image
Page 9: text
Page 10: image
Page 11: image
Page 12: image
Page 13: text
Page 14: drawing
Page 15: text
Page 16: text
Page 17: text
Page 18: text
Page 19: text
Page 20: text
Page 21: text
Page 22: image
Page 23: drawing
Page 24: text
Page 25: text
Page 26: text
Page 27: image
Page 28: No Words on Page
Page 29: image
Page 30: No Words on Page
Page 31: text
Page 32: image
Page 33: text
Page 34: text
