In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
import fitz
import sys
import yaml
import os

# The repository root is taken to be the parent of the current working
# directory; register it on sys.path (once) so the `src` imports resolve.
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(repo_root) == 0:
    sys.path.append(repo_root)


from src.text import create_text_lines
from src.detect_language import detect_language_of_page
from src.TOCExtractor import TOCExtractor


In [2]:
# PDF report to analyse, resolved relative to the repository root.
pdf_path = os.path.join(repo_root,"data/NAB/Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF")

# Load the matching parameters from the YAML file at the repository root.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default (which is not UTF-8 on every system).
with open(os.path.join(repo_root, "matching_params.yml"), "r", encoding="utf-8") as params_file:
    matching_params = yaml.safe_load(params_file)

In [3]:
# Extract the table of contents of the PDF at `pdf_path` and print it.
# The document stays open for the whole cell because `extract_toc` receives
# the open `doc` object.
with fitz.open(pdf_path) as doc:

    text_lines = []
    page_languages = []

    # Collect the text lines and the detected language of every page
    # (page numbers are 1-based).
    for page_number, page in enumerate(doc, start=1):
        page_languages.append(detect_language_of_page(page))
        text_lines.extend(create_text_lines(page, page_number))

    # Without pages there is no language to pass on; fail with a clear message
    # instead of a NameError on an unbound `language`.
    if not page_languages:
        raise ValueError(f"Document has no pages: {pdf_path}")

    # Use the language detected on the most pages rather than whatever the
    # last page happened to yield. Ties go to the language seen first.
    # NOTE(review): assumes detect_language_of_page returns a hashable value
    # (e.g. a language code string) - confirm against src.detect_language.
    language = max(dict.fromkeys(page_languages), key=page_languages.count)

    extractor = TOCExtractor(matching_params)

    # Extract the TOC
    table_of_content = extractor.extract_toc(doc, text_lines, language)

    # Output results
    if table_of_content:
        for entry in table_of_content.entries:
            print(f"Header: {entry['header']}, Page: {entry['page']}")
    else:
        print("No TOC found.")

    # Also show the raw result object (prints "None" when nothing was found).
    print(table_of_content)

Header: NAB 10-25 Cover, Page: 1
Header: Titelseite innen, Page: 3
Header: Copyright, Page: 4
Header: Inhaltsverzeichnis, Page: 5
Header: Tabellenverzeichnis, Page: 5
Header: Figurenverzeichnis, Page: 6
Header: 1 Einleitung und Zielsetzung, Page: 7
Header: 2 Geologie, Page: 9
Header: 2.1 Beschreibung des Untersuchungsgebietes, Page: 9
Header: 2.2 Prognose Bohrprofil, Page: 12
Header: 3 Bohrarbeiten, Page: 13
Header: 3.1 Bohrspülung, Page: 15
Header: 4 Vorgesehene Untersuchungen, Page: 17
Header: 4.1 Bohrstellen-Geologie, Page: 17
Header: 4.2 Bohrloch-Geophysik, Page: 19
Header: 4.3 Hydrogeologie und Hydrochemie, Page: 22
Header: 4.4 Geotechnik und Felsmechanik, Page: 25
Header: 5 Bohrplatz, Page: 29
Header: 6 Projektorganisation, Page: 31
Header: 7 Referenzen, Page: 33
TOC(entries=[{'header': 'NAB 10-25 Cover', 'page': 1}, {'header': 'Titelseite innen', 'page': 3}, {'header': 'Copyright', 'page': 4}, {'header': 'Inhaltsverzeichnis', 'page': 5}, {'header': 'Tabellenverzeichnis', 'page':