In [1]:
import fitz
import os
# Current working directory of the running process; none of the cells
# shown here reference base_dir afterwards.
base_dir = os.getcwd()

In [3]:
# Input handed to process_documents(): either a single PDF file or a folder.
# NOTE(review): only the last assignment takes effect -- the first one is
# overwritten immediately and is presumably kept as an alternative input.
pdf_path = "data/NAB/Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF"
pdf_path = "data/NAB/Berichte_NAB 20-001_QBO Trüllikon-Rudolfingen Appendices/"

def text_from_document(file_path) -> dict:
    """Extract the text of every page of a single PDF file.

    Args:
        file_path: Path of the document; passed unchanged to ``fitz.open``.

    Returns:
        Dictionary mapping the 1-based page number (int) to the string
        returned by ``page.get_text()`` for that page.
    """
    # Open the document in a context manager so it is closed (and its file
    # handle released) even when extraction fails part-way through.
    with fitz.open(file_path) as doc:
        return {
            page_number: page.get_text()
            for page_number, page in enumerate(doc, start=1)
        }

def process_documents(input_path):
    """Extract per-page text from one PDF file or from every PDF in a folder.

    Args:
        input_path: Path to a single file, or to a directory whose entries
            ending in ``.pdf`` (case-insensitive) are processed. The
            directory is not searched recursively.

    Returns:
        Dictionary mapping each file name (without directory) to the
        dictionary returned by ``text_from_document`` for that file.
        Folder entries are inserted in sorted file-name order. If
        ``input_path`` is neither a file nor a directory, a message is
        printed and an empty dictionary is returned.
    """
    if os.path.isfile(input_path):
        return {os.path.basename(input_path): text_from_document(input_path)}

    if not os.path.isdir(input_path):
        print(f"Input path is invalid: {input_path}")
        return {}

    results = {}
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # result (and anything serialised from it) reproducible between runs.
    for filename in sorted(os.listdir(input_path)):
        file_path = os.path.join(input_path, filename)
        # Skip sub-directories that merely happen to be named "*.pdf".
        if filename.lower().endswith('.pdf') and os.path.isfile(file_path):
            results[filename] = text_from_document(file_path)

    return results


In [4]:
# Run the extraction for the configured input; the result maps each file name
# to a {page number: page text} dictionary.
document_text = process_documents(input_path=pdf_path)

In [5]:
# document_text is keyed by file name and each value is that file's
# {page number: page text} dictionary, so name the loop variables accordingly.
for filename, pages in document_text.items():
    print(f"Document: {filename}:\n {pages}")

Document: App-B2_Geological Profile.pdf:
 {1: 'Lower \nFreshwater\nMolasse\n(USM)\n0.9\n1.6\n2.7\n6.9\n411.9\n411.2\n410.1\n405.9\n404.6\n404.1\n399.8\n394.6\n389.4\n382.0\n378.8\n374.6\n373.8\n324.8\n325.8\n327.8\n332.8\n337.0\n339.9\n339.4\n341.7\n342.2\n342.8\n346.8\n351.8\n358.0\n358.8\n361.2\n363.8\n366.9\n367.2\n370.6\n371.3\n332.3\n323.4\n322.8\n321.3\n319.8\n318.3\n318.0\n316.3\n316.2\n314.4\n313.8\nCover sediments, \nw. soil formation\nFluvial/deltaic \ngravels\nLacustrine\nsands and fines \n(coarsening \nupward trend)\nGlacilacustrine to \nlacustrine fines\nGlacilacustrine \nfines\n(with dropstones)\nGlacilacustrine \nsediments\n(base possibly \nglacideltaic,\nfining upward \ntrend)\nGlacigenic \ndiamicts / tills\n8.2\n8.7\n13.0\n18.2\n19.6\n19.8\n20.3\n20.4\n22.5\n22.3\n18.4\n23.4\n30.8\n30.7\n29.5\n29.8\n27.4\n27.6\n26.5\n26.8\n25.0\n25.3\n34.0\n38.2\n39.0\n42.2\n41.5\n87.0\n80.5\n75.8\n73.4\n80.0\n72.9\n71.1\n70.6\n70.0\n66.0\n61.0\n51.6\n49.0\n45.9\n45.6\n54.0\n54.8\n85.0

In [6]:
## save to json file
import json
output_path = "data/text_files/text_files.json"

# open(..., "w") does not create missing parent folders, so make sure the
# target directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as file:
    # ensure_ascii=False stores non-ASCII characters as real UTF-8 text
    # instead of \uXXXX escapes; the file is opened as UTF-8 for that purpose.
    # Note: the integer page-number keys are written as JSON strings.
    json.dump(document_text, file, ensure_ascii=False)