In [None]:
%load_ext autoreload
%autoreload 2

import os
import pymupdf
import json
import sys

# The repository root is taken to be the parent of the current working
# directory, so the kernel's working directory must sit one level below the
# root (e.g. a notebooks/ folder -- TODO confirm the intended layout).
repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Put <repo_root>/src on the import path; presumably this is what lets the
# project-local imports below (text, bounding_box, utils) resolve.
sys.path.append(os.path.join(repo_root, "src"))

from text import extract_words
from bounding_box import cluster_drawings
from utils import is_digitally_born, classify_text_density, process_documents


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:

# Name of the PDF examined in this notebook.
filename = "Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF"

# Location of that PDF inside the repository's data folder.
pdf_path = os.path.join(repo_root, "data/NAB", filename)

# Per-document folder under data/test, named after the file minus its extension.
out_dir = os.path.join(
    repo_root,
    "data/test",
    os.path.splitext(filename)[0],
)


In [4]:
def classify_digital(page, min_drawing_area=100000):
    """Label a page as image, drawing, or text.

    The checks run in priority order: any embedded image wins, then any
    sufficiently large cluster of vector drawings; text is the fallback.

    Args:
        page: Object exposing ``get_images()`` and ``get_drawings()``
            (presumably a ``pymupdf.Page`` -- confirm against callers).
        min_drawing_area: A drawing cluster must have an area strictly
            greater than this to count as a drawing. Defaults to 100000,
            the value that used to be hard-coded. The unit is whatever
            ``cluster.get_area()`` reports -- TODO confirm.

    Returns:
        One of the strings ``" image"``, ``" drawing"`` or ``" text"``.
    """
    # NOTE(review): every label starts with a space. They are kept exactly
    # as they were because consumers of the predictions may rely on the
    # literal values -- confirm whether the space is intentional.
    if page.get_images():
        return " image"

    drawings = page.get_drawings()
    # Clusters at or below the threshold are ignored; one cluster above it
    # is enough to call the page a drawing.
    if drawings and any(
        cluster.get_area() > min_drawing_area
        for cluster in cluster_drawings(drawings)
    ):
        return " drawing"
    return " text"

In [5]:
def classify_pages(doc):
    """Classify every page of a document and record whether it is digitally born.

    Digitally-born pages are labelled by ``classify_digital``; all other
    pages are labelled by ``classify_text_density`` from the words that
    ``extract_words`` returns for the page.

    Args:
        doc: Iterable of page objects (presumably a ``pymupdf.Document`` --
            confirm against ``process_documents``). It only needs to be
            iterable; pages are not looked up by index.

    Returns:
        Dict keyed by 1-based page number. Each value is a dict with
        ``"page_type"`` (the label from whichever classifier ran) and
        ``"digitally_born"`` (the result of ``is_digitally_born``).
    """
    results = {}
    for page_number, page in enumerate(doc, start=1):
        # Use the page the loop already yields instead of fetching it a
        # second time with doc[index].
        digitally_born = is_digitally_born(page)

        if digitally_born:
            page_type = classify_digital(page)
        else:
            # The page size is only needed by the text-density classifier.
            page_size = (page.rect.width, page.rect.height)
            words = extract_words(page, page_number)
            page_type = classify_text_density(words, page_size)

        results[page_number] = {
            "page_type": page_type,
            "digitally_born": digitally_born,
        }

    return results


In [6]:
# Hand the PDF path and the page classifier to process_documents, then
# store whatever it returns as JSON under the repository's data folder.
predictions = process_documents(pdf_path, classify_pages)

with open(
    os.path.join(repo_root, "data/predictions.json"),
    mode="w",
    encoding="utf-8",
) as json_file:
    # ensure_ascii=False writes non-ASCII characters as-is rather than as \u escapes.
    json.dump(predictions, json_file, ensure_ascii=False, indent=4)