In [60]:
%load_ext autoreload
%autoreload 2

import os
import fitz
import json
import yaml
import regex
import re

from utils import  TextWord,create_text_lines, TextLine
from detect_language import detect_language_of_document
from keyword_finding import get_keywords_by_language, find_keywords_in_lines, TOC, extract_by_bookmarks

# Directory the kernel was started in; used below to locate matching_params.yml.
base_dir = os.getcwd()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
# Input PDF and prediction output, both relative to the working directory.
pdf_path = "data/NAB/Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF"
output_path = "data/predictions.json"

# Load the keyword-matching parameters. The encoding is given explicitly so the
# file is decoded the same way on every platform instead of depending on the
# locale default (the JSON writer further down also uses UTF-8).
with open(os.path.join(base_dir, "matching_params.yml"), "r", encoding="utf-8") as params_file:
    matching_params = yaml.safe_load(params_file)

In [62]:
def extract_by_keywords(text_lines: list[TextLine], found_keywords: list[dict]) -> "TOC | None":
    """Parse table-of-contents entries from the lines following a TOC keyword.

    Parsing starts on the line after the first keyword hit and stops at the
    first line that matches none of the known TOC line layouts.

    TODO: make more robust (only the first keyword hit is considered, and a
    single unmatched line ends the table of contents).

    Args:
        text_lines: All text lines of the document, in reading order.
        found_keywords: Keyword hits; each dict must carry the matching line
            object under the key ``"line"``. Only the first hit is used.

    Returns:
        A ``TOC`` built from the parsed entries, or ``None`` when there is no
        keyword hit or no line after the keyword matches a TOC layout.

    Raises:
        ValueError: If the first keyword's line is not contained in
            ``text_lines`` (raised by ``list.index``).
    """
    # Without a keyword hit there is no anchor line to start from. Previously
    # this case fell through to an unbound ``line_index`` and crashed.
    if not found_keywords:
        return None

    # Candidate layouts for one TOC line, tried in order; the first match wins.
    # A page number is either a roman numeral or a decimal number.
    toc_patterns = [
        {
            # "<numbering> <header> <dot leader or wide gap> <page>"
            "pattern": regex.compile(
                r"^(\d+(\.\d+)*)\s+([\p{L}\p{M}\p{N}\p{P}\-\s]+)\s*(\.+|\s{2,})\s*([ivxlcdm]+|\d+)$",
                flags=regex.VERBOSE | regex.IGNORECASE,
            ),
            "header_group": 3,
            "page_group": 5,
        },
        {
            # "<header> <dot leader or wide gap> <page>"
            "pattern": regex.compile(
                r"^([\p{L}\p{M}\p{N}\p{P}\-\s]+)\s*(\.+|\s{2,})\s*([ivxlcdm]+|\d+)$",
                flags=regex.VERBOSE | regex.IGNORECASE,
            ),
            "header_group": 1,
            "page_group": 3,
        },
        {
            # "<numbering> <header> <page>"
            "pattern": regex.compile(
                r"^(\d+(\.\d+)*)\s+([\p{L}\p{M}\p{N}\p{P}\-\s]+)\s+([ivxlcdm]+|\d+)$",
                flags=regex.VERBOSE | regex.IGNORECASE,
            ),
            "header_group": 3,
            "page_group": 4,
        },
        {
            # "<header> <page>"
            "pattern": regex.compile(
                r"^([\p{L}\p{M}\p{N}\p{P}\-\s]+)\s+([ivxlcdm]+|\d+)$",
                flags=regex.VERBOSE | regex.IGNORECASE,
            ),
            "header_group": 1,
            "page_group": 2,
        },
    ]

    # Only the first keyword hit anchors the table of contents.
    line_index = text_lines.index(found_keywords[0]["line"])

    # NOTE(review): entries use the keys "header"/"page" with the page kept as
    # the matched string, while the bookmark-based output shown in this
    # notebook uses "heading", an int "page" and "level" - confirm which schema
    # TOC expects before relying on both paths interchangeably.
    toc_entries = []

    # Process subsequent lines starting right after the keyword's line.
    for line in text_lines[line_index + 1:]:
        line_text = line.line_text()

        for entry in toc_patterns:
            match = entry["pattern"].match(line_text)
            if match:
                header = match.group(entry["header_group"]).strip()
                page = match.group(entry["page_group"]).strip()
                toc_entries.append({"header": clean_header(header), "page": page})
                break
        else:
            # No layout matched: treat this line as the end of the TOC.
            print(f"Unmatched line: {line_text}")
            break

    return TOC(entries=toc_entries) if toc_entries else None

def clean_header(header: str) -> str:
    """Remove dot leaders (runs of two or more '.') and trim surrounding whitespace."""
    without_leaders = regex.sub(r"\.{2,}", "", header)
    return without_leaders.strip()

In [66]:
# Extract the table of contents of the PDF: bookmarks are tried first, and the
# keyword-based line parser is only used when that yields nothing.
with fitz.open(pdf_path) as doc:

    text_lines = create_text_lines(doc) 
    language = detect_language_of_document(doc)
    table_of_content = extract_by_bookmarks(doc)

    # Fallback path: runs only when the bookmark result is falsy.
    if not table_of_content:
        keywords = get_keywords_by_language(language, matching_params)
    
        found_keywords =find_keywords_in_lines(text_lines, keywords)
        # extract_by_keywords needs at least one keyword hit as its starting line.
        if found_keywords:
            table_of_content =extract_by_keywords(text_lines, found_keywords) 
    
    # May print None when neither approach produced a table of contents.
    print(table_of_content)

TOC(entries=[{'heading': 'NAB 10-25 Cover', 'page': 1, 'level': 1}, {'heading': 'Titelseite innen', 'page': 3, 'level': 1}, {'heading': 'Copyright', 'page': 4, 'level': 1}, {'heading': 'Inhaltsverzeichnis', 'page': 5, 'level': 1}, {'heading': 'Tabellenverzeichnis', 'page': 5, 'level': 1}, {'heading': 'Figurenverzeichnis', 'page': 6, 'level': 1}, {'heading': '1 Einleitung und Zielsetzung', 'page': 7, 'level': 1}, {'heading': '2 Geologie', 'page': 9, 'level': 1}, {'heading': '2.1 Beschreibung des Untersuchungsgebietes', 'page': 9, 'level': 2}, {'heading': '2.2 Prognose Bohrprofil', 'page': 12, 'level': 2}, {'heading': '3 Bohrarbeiten', 'page': 13, 'level': 1}, {'heading': '3.1 Bohrspülung', 'page': 15, 'level': 2}, {'heading': '4 Vorgesehene Untersuchungen', 'page': 17, 'level': 1}, {'heading': '4.1 Bohrstellen-Geologie', 'page': 17, 'level': 2}, {'heading': '4.2 Bohrloch-Geophysik', 'page': 19, 'level': 2}, {'heading': '4.3 Hydrogeologie und Hydrochemie', 'page': 22, 'level': 2}, {'head

In [5]:
# Re-extract the bookmark-based table of contents and convert it to a plain
# dict so the next cell can serialise it as JSON.
with fitz.Document(pdf_path) as doc:
    language = detect_language_of_document(doc)

    keywords = get_keywords_by_language(language, matching_params)
    table_of_content = extract_by_bookmarks(doc)
    found_keywords = []

    # NOTE(review): `keywords`, `found_keywords`, `words` and `words_by_line`
    # are assigned but never read in this cell, and `words` is reset on every
    # page. This looks like unfinished scratch code - confirm nothing else
    # depends on these names, then remove the loop.
    for page_index, page in enumerate(doc):
        page_number = page_index + 1
        words = []
        words_by_line = {}

        for x0, y0, x1, y1, word, block_no, line_no, _word_no in page.get_text("words"):
            # Apply the page rotation matrix to the word's bounding box.
            rect = fitz.Rect(x0, y0, x1, y1) * page.rotation_matrix
            text_word = TextWord(rect=rect, text=word, page=page_number)
            words.append(text_word)

    # The extraction cell above guards with `if not table_of_content`, which
    # suggests the bookmark extraction can come back empty or as None; calling
    # .to_dict() on None would raise AttributeError, so only convert a real result.
    if table_of_content is not None:
        table_of_content = table_of_content.to_dict()

In [41]:
# Write the table of contents to output_path as JSON.
# NOTE(review): json.dump needs a JSON-serialisable value here, i.e. the dict
# form produced by the .to_dict() cell - presumably not the TOC object itself.
with open(output_path, "w", encoding = "utf-8") as file:
    json.dump(table_of_content, file)

In [16]:
def show_image(item, title=""):
    """Render ``item`` to a pixmap and display it with matplotlib.

    Args:
        item: Any object offering ``get_pixmap(dpi=...)`` - presumably a
            PyMuPDF page, as in the commented-out call further down.
        title: Text shown as the figure title.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    render_dpi = 150  # resolution used for both rendering and the figure

    pixmap = item.get_pixmap(dpi=render_dpi)
    height, width = pixmap.h, pixmap.w

    # View the pixmap's sample buffer as a (height, width, 3) uint8 array;
    # this assumes three samples per pixel (no alpha channel).
    pixels = np.ndarray([height, width, 3], dtype=np.uint8, buffer=pixmap.samples_mv)

    plt.figure(dpi=render_dpi)
    plt.title(title)
    # Scale the axes by 72 / dpi so they are not expressed in rendered pixels.
    plt.imshow(pixels, extent=(0, width * 72 / render_dpi, height * 72 / render_dpi, 0))

# Fail early if the installed PyMuPDF has no Page.find_tables, which the
# table-detection cell below relies on.
if not hasattr(fitz.Page, "find_tables"):
    raise RuntimeError("This PyMuPDF version does not support the table feature")

In [93]:
# Detect tables on one page, outline the header cells (red) and the table
# bounding boxes (green), and print each table's column names.
with fitz.Document(pdf_path) as doc:
    # NOTE(review): hard-coded zero-based page index (the 12th page); this
    # raises IndexError for shorter documents - confirm the intended page.
    page = doc[11]
    tabs = page.find_tables()  # detect the tables
    for i,tab in enumerate(tabs):  # iterate over all tables
        for cell in tab.header.cells:
            page.draw_rect(cell,color=fitz.pdfcolor["red"],width=0.3)
        page.draw_rect(tab.bbox,color=fitz.pdfcolor["green"])
        print(f"Table {i} column names: {tab.header.names}, external: {tab.header.external}")
        
    # The drawn rectangles are only visible once the page is rendered; the
    # rendering call is currently disabled.
    #show_image(page, f"Table & Header BBoxes")