In [17]:
%load_ext autoreload
%autoreload 2

import os
import pymupdf
import json
from utils import create_text_lines, TextWord, extract_words
from detect_language import detect_language_of_document
base_dir = os.getcwd()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Input PDF to analyse; the path is relative (no leading slash), so it resolves against the working directory.
pdf_path = "data/NAB/Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF"
# Alternative input; uncomment to run on it instead of the file above.
#pdf_path = "data/test/43283.pdf"
# Path of a JSON predictions file. NOTE(review): not referenced in the visible cells — confirm it is used further down.
output_path = "data/predictions.json"

In [11]:
def is_digitally_born(page: pymupdf.Page) -> bool:
    """Tell whether the page's bbox log contains at least one real text box.

    A page counts as digitally born as soon as its bbox log holds a
    "fill-text" or "stroke-text" entry whose rectangle is not empty.

    Args:
        page: The PyMuPDF page to inspect.

    Returns:
        True if such an entry exists, False otherwise.
    """
    text_box_types = ("fill-text", "stroke-text")
    # Empty rectangles turn up occasionally (e.g. SwissGeol 44191 page 37)
    # and must not be counted as text.
    return any(
        kind in text_box_types and not pymupdf.Rect(bbox).is_empty
        for kind, bbox in page.get_bboxlog()
    )

In [None]:
def classify_page(words, page_size, density_threshold: float = 0.00005, height_threshold: float = 20):
    """Classify a page as text-heavy or image-heavy based on its word distribution.

    The page is labelled "text" only when both conditions hold:
    the word density (word count divided by page area) exceeds
    ``density_threshold`` and the mean word height is below ``height_threshold``.

    Args:
        words: List of TextWord objects; only ``word.rect.height`` is read.
        page_size: Tuple ``(width, height)`` of the page.
        density_threshold: Minimum words per unit of page area for a text page.
            Lowering it makes "text" more likely.
        height_threshold: Maximum mean word height for a text page. Very large
            words suggest the page consists mainly of figures.

    Returns:
        "text" if the page is likely a text page, otherwise "image".
        Pages without words, or with a non-positive area, are always "image".
    """
    if not words:
        return "image"

    page_area = page_size[0] * page_size[1]
    if page_area <= 0:
        # Density is undefined for a degenerate page; fall back to the default
        # class instead of raising ZeroDivisionError.
        return "image"

    word_count = len(words)
    text_density = word_count / page_area  # Simple density metric
    avg_word_height = sum(word.rect.height for word in words) / word_count

    if text_density > density_threshold and avg_word_height < height_threshold:
        return "text"
    return "image"

In [None]:
# Classify every page of the PDF as "text" or "image" and print the verdict.
with pymupdf.open(pdf_path) as doc:

    # Document-level results. NOTE(review): neither value is read in this cell —
    # presumably used by later cells; confirm before removing.
    text_lines = create_text_lines(doc)
    language = detect_language_of_document(doc)

    for page_index, page in enumerate(doc):
        page_number = page_index + 1  # 1-based page number for display and for extract_words
        page_size = (page.rect.width, page.rect.height)

        # Pass the page from the iteration directly instead of re-loading it via doc[page_index].
        digitally_born = is_digitally_born(page)
        if digitally_born:
            # With a text layer present, any embedded image marks the page as "image".
            page_type = "image" if page.get_images() else "text"
        else:
            # No text boxes in the bbox log: decide from the words returned by extract_words.
            words = extract_words(page, page_number)
            page_type = classify_page(words, page_size)

        print(f"Page {page_number}: {page_type}")
        print(f"Digitally born: {digitally_born}")
        print()


[]
Page 1: text
Digitally born: True

Page 2: image
Digitally born: False

[]
Page 3: text
Digitally born: True

[]
Page 4: text
Digitally born: True

[]
Page 5: text
Digitally born: True

[]
Page 6: text
Digitally born: True

[]
Page 7: text
Digitally born: True

[(17, 0, 1772, 317, 8, 'DeviceRGB', '', 'X1', 'DCTDecode'), (18, 0, 1772, 317, 8, 'DeviceRGB', '', 'X2', 'DCTDecode'), (19, 0, 1772, 317, 8, 'DeviceRGB', '', 'X3', 'DCTDecode'), (20, 0, 1772, 317, 8, 'DeviceRGB', '', 'X4', 'DCTDecode'), (21, 0, 1772, 317, 8, 'DeviceRGB', '', 'X5', 'DCTDecode'), (22, 0, 1772, 317, 8, 'DeviceRGB', '', 'X6', 'DCTDecode'), (23, 0, 1772, 317, 8, 'DeviceRGB', '', 'X7', 'DCTDecode'), (24, 0, 1772, 316, 8, 'DeviceRGB', '', 'X8', 'DCTDecode')]
Page 8: image
Digitally born: True

[]
Page 9: text
Digitally born: True

[(30, 0, 2336, 1380, 8, 'DeviceRGB', '', 'X1', 'DCTDecode')]
Page 10: image
Digitally born: True

[(33, 0, 3000, 890, 8, 'DeviceRGB', '', 'Im0', 'DCTDecode')]
Page 11: image
Digitally born