In [1]:
%load_ext autoreload
%autoreload 2

import os
import pymupdf

from text import extract_words
import json
from bounding_box import cluster_drawings
from utils import is_digitally_born, classify_wordpos, classify_text_density, process_documents


In [2]:
# Which report to process. Every location below is resolved against the
# directory the notebook kernel is currently running in.
filename = "Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF"
base_dir = os.getcwd()

# Output folder is named after the PDF with its extension stripped.
out_dir = os.path.join(base_dir, "data/test", os.path.splitext(filename)[0])
# Source PDF is expected under data/NAB.
pdf_path = os.path.join(base_dir, "data/NAB", filename)


In [3]:
def classify_digital(page, min_drawing_area=100000):
    """Label a digitally born page as image, drawing, or text.

    Checks run in priority order: any embedded image wins, then any
    sufficiently large cluster of vector drawings; otherwise the page is
    treated as text.

    Args:
        page: Page object exposing ``get_images()`` and ``get_drawings()``
            (presumably a ``pymupdf.Page`` -- confirm against callers).
        min_drawing_area: A drawing cluster must have an area strictly
            greater than this value to count. Defaults to 100000, the
            threshold that used to be hard-coded. Units are whatever
            ``cluster.get_area()`` returns.

    Returns:
        str: ``" image"``, ``" drawing"`` or ``" text"``.
    """
    # NOTE(review): every label carries a leading space. It is preserved
    # because previously saved predictions contain these exact strings --
    # confirm whether the space is intentional before normalising.
    if page.get_images():
        return " image"

    drawings = page.get_drawings()
    if not drawings:
        return " text"

    # Small clusters are ignored; one large cluster is enough to call the
    # page a drawing.
    has_large_cluster = any(
        cluster.get_area() > min_drawing_area
        for cluster in cluster_drawings(drawings)
    )
    return " drawing" if has_large_cluster else " text"

In [4]:
# Classify each page of a PDF as image, drawing or text.
def classify_pages(doc):
    """Classify every page of a document.

    Pages that ``is_digitally_born`` reports as digital are labelled by
    ``classify_digital``. All other pages are labelled by
    ``classify_text_density`` from the words returned by ``extract_words``
    and the page's (width, height).

    Args:
        doc: Iterable of page objects in reading order, e.g. an opened
            PyMuPDF document or a generator of its pages (assumed -- the
            pages must expose ``rect.width`` / ``rect.height``). The
            iterable is only iterated, never indexed.

    Returns:
        dict: Maps each 1-based page number to
        ``{"page_type": <label>, "digitally_born": <flag>}``.
    """
    results = {}
    for page_number, page in enumerate(doc, start=1):
        # Use the page yielded by the loop instead of re-fetching it by index.
        digitally_born = is_digitally_born(page)

        if digitally_born:
            page_type = classify_digital(page)
        else:
            # Page size is only needed for the text-density classification.
            page_size = (page.rect.width, page.rect.height)
            words = extract_words(page, page_number)
            page_type = classify_text_density(words, page_size)

        results[page_number] = {
            "page_type": page_type,
            "digitally_born": digitally_born,
        }

    return results


In [None]:
# Hand the PDF path and the page classifier to process_documents, then
# persist whatever it returns as UTF-8 JSON (4-space indent, non-ASCII
# characters written verbatim rather than escaped).
predictions = process_documents(pdf_path, classify_pages)
with open(
    os.path.join(base_dir, "data/predictions.json"), mode="w", encoding="utf-8"
) as json_file:
    json.dump(predictions, json_file, indent=4, ensure_ascii=False)

{'Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF': {1: {'page_type': ' drawing', 'digitally_born': True}, 2: {'page_type': 'No text', 'digitally_born': False}, 3: {'page_type': ' text', 'digitally_born': True}, 4: {'page_type': ' text', 'digitally_born': True}, 5: {'page_type': ' text', 'digitally_born': True}, 6: {'page_type': ' text', 'digitally_born': True}, 7: {'page_type': ' text', 'digitally_born': True}, 8: {'page_type': ' image', 'digitally_born': True}, 9: {'page_type': ' text', 'digitally_born': True}, 10: {'page_type': ' image', 'digitally_born': True}, 11: {'page_type': ' image', 'digitally_born': True}, 12: {'page_type': ' image', 'digitally_born': True}, 13: {'page_type': ' text', 'digitally_born': True}, 14: {'page_type': ' drawing', 'digitally_born': True}, 15: {'page_type': ' text', 'digitally_born': True}, 16: {'page_type': ' text', 'digitally_born': True}, 17: {'page_type': ' text', 'digitally_born': True}, 18: {'page_type': ' text', 'digit