In [1]:
import torch
print(torch.cuda.is_available())

True


In [2]:
import deepdoctection as dd
import pandas as pd
from typing import List, Dict, Any
import json

[32m[0721 13:27.24 @file_utils.py:31][0m  [32mINF[0m  [97mPyTorch version 1.13.1+cu117 available.[0m
[32m[0721 13:27.24 @file_utils.py:69][0m  [32mINF[0m  [97mDisabling Tensorflow because USE_TORCH is set[0m
[32m[0721 13:27.25 @font_manager.py:1639][0m  [32mINF[0m  [97mgenerated new fontManager[0m


In [52]:
def inspect_pages_data(pages_data, max_pages=2, max_chunks=3, max_text_len=1000, max_rows=50, max_cols=10):
    print(f"Anzahl Seiten: {len(pages_data)}")
    if len(pages_data) == 0:
        print("Keine Seiten im Datenobjekt.")
        return

    for i, page in enumerate(pages_data[:max_pages]):
        print(f"\n=== Seite {i+1} ===")
        print(f"Typ: {type(page)}")
        print(f"Keys: {list(page.keys())}")
        print(f"Seiten-Nummer: {page.get('page_number')}")
        print(f"Dateiname: {page.get('file_name')}")
        print(f"Text (erste {max_text_len} Zeichen): {page.get('text', '')[:max_text_len]!r}")

        chunks = page.get("chunks", [])
        print(f"Anzahl Chunks: {len(chunks)}")
        for j, chunk in enumerate(chunks[:max_chunks]):
            if isinstance(chunk, tuple) and len(chunk) == 2:
                bbox, text = chunk
                print(f"  Chunk {j+1}: bbox={bbox}, text={text[:max_text_len]!r}")
            else:
                print(f"  Chunk {j+1}: {str(chunk)[:max_text_len]!r}")

        tables = page.get("tables", [])
        print(f"Anzahl Tabellen: {len(tables)}")
        for k, table in enumerate(tables):
            print(f"\n  Tabelle {k+1}: Typ={type(table)} Größe={table.shape if hasattr(table, 'shape') else 'unbekannt'}")
            if hasattr(table, "head"):
                # Ausgabe der ersten max_rows Zeilen, max_cols Spalten, als Text
                print(table.iloc[:max_rows, :max_cols].to_string(index=False))
            else:
                print(str(table)[:max_text_len])

In [2]:
def table_to_dataframe(table) -> pd.DataFrame:
    """Konvertiert eine DeepDoctection-Tabelle in ein Pandas-DataFrame."""
    n_rows = table.number_of_rows
    n_cols = table.number_of_columns
    cells = table.cells

    cell_map = {}
    for cell in cells:
        r = getattr(cell, "row_number", None)
        c = getattr(cell, "column_number", None)
        text = getattr(cell, "text", "")
        if r is not None and c is not None:
            cell_map[(r, c)] = text

    rows = []
    for r in range(n_rows):
        row = []
        for c in range(n_cols):
            row.append(cell_map.get((r, c), ""))
        rows.append(row)

    return pd.DataFrame(rows)

In [3]:
def extract_page_data(page) -> Dict[str, Any]:
    """Extrahiert alle wichtigen Inhalte aus einer einzelnen Seite."""
    page_info = {
        "page_number": page.page_number,
        "file_name": page.file_name,
        "document_id": page.document_id,
        "image_id": page.image_id,
        "width": page.width,
        "height": page.height,
        "text": page.text,
        "chunks": [],
        "tables": []
    }

    # Text-Chunks (aus Layoutanalyse)
    if page.chunks:
        for chunk in page.chunks:
            if isinstance(chunk, tuple) and len(chunk) == 2:
                bbox, text = chunk
                page_info["chunks"].append({
                    "bbox": bbox,
                    "text": text
                })

    # Tabellen als DataFrames
    if page.tables:
        for table in page.tables:
            df_table = table_to_dataframe(table)
            page_info["tables"].append(df_table)

    return page_info

In [4]:
def analyze_pdf(path: str) -> List[Dict[str, Any]]:
    """Analysiert das PDF und liefert eine Liste von Seiteninformationen."""
    analyzer = dd.get_dd_analyzer()
    df = analyzer.analyze(path=path)

    pages = []
    for page in df:
        page_data = extract_page_data(page)
        pages.append(page_data)

    df.reset_state()
    return pages

In [6]:
def export_pages_data_as_json(pages_data, output_file):
    output = []
    for page in pages_data:
        page_entry = {
            "page_number": page.get("page_number"),
            "file_name": page.get("file_name"),
            "document_id": page.get("document_id"),
            "image_id": page.get("image_id"),
            "width": page.get("width"),
            "height": page.get("height"),
            "text": page.get("text"),
            "tables": []
        }

        for df in page.get("tables", []):
            csv_str = df.to_csv(index=False)
            page_entry["tables"].append(csv_str)

        output.append(page_entry)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

In [7]:
PDF_PATH = "/repo/notebooks/data/2024-nachhaltigkeitsbericht_tab.pdf"
pages_data = list(analyze_pdf(PDF_PATH))
print(f"{len(pages_data)} Seiten analysiert.")

[32m[0721 12:24.48 @dd.py:129][0m  [32mINF[0m  [97mConfig: 
 {'DEVICE': device(type='cpu'),
 'LANGUAGE': None,
 'LAYOUT_LINK': {'CHILD_CATEGORIES': [<LayoutType.CAPTION>],
                 'PARENTAL_CATEGORIES': [<LayoutType.FIGURE>, <LayoutType.TABLE>]},
 'LAYOUT_NMS_PAIRS': {'COMBINATIONS': [[<LayoutType.TABLE>, <LayoutType.TITLE>],
                                       [<LayoutType.TABLE>, <LayoutType.TEXT>],
                                       [<LayoutType.TABLE>, <LayoutType.KEY_VALUE_AREA>],
                                       [<LayoutType.TABLE>, <LayoutType.LIST_ITEM>],
                                       [<LayoutType.TABLE>, <LayoutType.LIST>],
                                       [<LayoutType.TABLE>, <LayoutType.FIGURE>],
                                       [<LayoutType.TITLE>, <LayoutType.TEXT>],
                                       [<LayoutType.TEXT>, <LayoutType.KEY_VALUE_AREA>],
                                       [<LayoutType.TEXT>, <LayoutType.L

d2_model_0829999_layout_inf_only.ts:   0%|          | 0.00/275M [00:00<?, ?B/s]

[32m[0721 12:25.11 @model.py:452][0m  [4m[5m[31mERR[0m  [97mFile downloaded from deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet_inference_only does not match the expected size! You may have downloaded a broken file, or the upstream may have modified the file.[0m


CASCADE_RCNN_R_50_FPN_GN_TS.yaml:   0%|          | 0.00/143 [00:00<?, ?B/s]

d2_model_1639999_item_inf_only.ts:   0%|          | 0.00/275M [00:00<?, ?B/s]

[32m[0721 12:25.35 @model.py:452][0m  [4m[5m[31mERR[0m  [97mFile downloaded from deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only does not match the expected size! You may have downloaded a broken file, or the upstream may have modified the file.[0m


CASCADE_RCNN_R_50_FPN_GN_TS.yaml:   0%|          | 0.00/143 [00:00<?, ?B/s]

d2_model_1849999_cell_inf_only.ts:   0%|          | 0.00/275M [00:00<?, ?B/s]

[32m[0721 12:25.59 @model.py:452][0m  [4m[5m[31mERR[0m  [97mFile downloaded from deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only does not match the expected size! You may have downloaded a broken file, or the upstream may have modified the file.[0m


CASCADE_RCNN_R_50_FPN_GN_TS.yaml:   0%|          | 0.00/141 [00:00<?, ?B/s]

[32m[0721 12:26.00 @fs.py:142][0m  [32mINF[0m  [97mFile db_resnet50-ac60cadc.pt will be downloaded.[0m
db_resnet50-ac60cadc.pt: |          |102M/?[00:10<00:00,10.1MB/s]
[32m[0721 12:26.10 @fs.py:171][0m  [32mINF[0m  [97mSuccessfully downloaded db_resnet50-ac60cadc.pt. 97.2MiB.[0m
[32m[0721 12:26.11 @fs.py:142][0m  [32mINF[0m  [97mFile crnn_vgg16_bn-9762b0b0.pt will be downloaded.[0m
crnn_vgg16_bn-9762b0b0.pt: |          |63.3M/?[00:06<00:00,10.2MB/s]
[32m[0721 12:26.17 @fs.py:171][0m  [32mINF[0m  [97mSuccessfully downloaded crnn_vgg16_bn-9762b0b0.pt. 60.4MiB.[0m


Downloading https://doctr-static.mindee.com/models?id=v0.3.1/crnn_vgg16_bn-9762b0b0.pt&src=0 to /root/.cache/doctr/models/crnn_vgg16_bn-9762b0b0.pt


  0%|          | 0/63286381 [00:00<?, ?it/s]

[32m[0721 12:26.25 @data.py:92][0m  [32mINF[0m  [97mUsing downloaded & verified file: /root/.cache/doctr/models/crnn_vgg16_bn-9762b0b0.pt[0m
[32m[0721 12:26.25 @order.py:805][0m  [5m[35mWRN[0m  [97mIn most cases floating_text_block_categories must be a subset of text_block_categories. Adding categories to floating_text_block_categories, that do not belong to text_block_categories makes only sense for categories set have CHILD relationships with annotations that belong to text_block_categories.[0m
[32m[0721 12:26.25 @doctectionpipe.py:118][0m  [32mINF[0m  [97mProcessing 2024-nachhaltigkeitsbericht_tab_0.pdf[0m
  return forward_call(*input, **kwargs)
[32m[0721 12:26.27 @context.py:154][0m  [32mINF[0m  [97mImageLayoutService total: 1.763 sec.[0m
[32m[0721 12:26.27 @context.py:154][0m  [32mINF[0m  [97mAnnotationNmsService total: 0.0007 sec.[0m
[32m[0721 12:26.29 @context.py:154][0m  [32mINF[0m  [97mSubImageLayoutService total: 2.1039 sec.[0m
[32m[0721 

2 Seiten analysiert.


In [8]:
inspect_pages_data (pages_data)

Anzahl Seiten: 2

=== Seite 1 ===
Typ: <class 'dict'>
Keys: ['page_number', 'file_name', 'document_id', 'image_id', 'width', 'height', 'text', 'chunks', 'tables']
Seiten-Nummer: 0
Dateiname: 2024-nachhaltigkeitsbericht_tab_0.pdf
Text (erste 100 Zeichen): 'HENKEL NACHHALTIGKEITSBERICHT 2024\n( QB\n161\nVORWORT\nREFERENZ- UND\nBERICHTSRAHMEN\nALLGEMEINE ANGABEN\n'
Anzahl Chunks: 0
Anzahl Tabellen: 1
  Tabelle 1: Typ=<class 'pandas.core.frame.DataFrame'>

=== Seite 2 ===
Typ: <class 'dict'>
Keys: ['page_number', 'file_name', 'document_id', 'image_id', 'width', 'height', 'text', 'chunks', 'tables']
Seiten-Nummer: 1
Dateiname: 2024-nachhaltigkeitsbericht_tab_1.pdf
Text (erste 100 Zeichen): 'HENKEL NACHHALTIGKEITSBERICHT 2024\n( QB\n168\nVORWORT\nREFERENZ- UND\nBERICHTSRAHMEN\nALLGEMEINE ANGABEN\n'
Anzahl Chunks: 0
Anzahl Tabellen: 1
  Tabelle 1: Typ=<class 'pandas.core.frame.DataFrame'>


In [9]:
print(f"Anzahl Seiten in pages_data: {len(pages_data)}")
for i, page in enumerate(pages_data):
    print(f"Seite {i} keys: {list(page.keys())}")
    print(f"Text länge: {len(page['text'])}")
    print(f"Tabellenanzahl: {len(page['tables'])}")
    if len(page['tables']) > 0:
        print(f"Erste Tabelle (head):\n{page['tables'][0].head()}")

Anzahl Seiten in pages_data: 2
Seite 0 keys: ['page_number', 'file_name', 'document_id', 'image_id', 'width', 'height', 'text', 'chunks', 'tables']
Text länge: 766
Tabellenanzahl: 1
Erste Tabelle (head):
  0                                                  1        2         3  \
0                                                                           
1                                           Emissionen           N-1=2023   
2                                Scope-i-THG-tmisionen                      
3    Scope-1-THG-Bruttoemissionen (tCO,e) exklusive...  618.089      n.a.   
4    Scope 1(tCO2e) aus Dampf, Wârme und Elektrizit...  360.792      n.a.   

         4                                     5  \
0                                                  
1   N=2024  Jahrlich% des Ziels/ Vorheriges Jahr   
2                                                  
3  405.621                                  n.a.   
4                                           n.a.   

                    

In [11]:
OUTPUT_JSON = "/repo/notebooks/json/extracted_pages_data.json"  # Beispiel Pfad im Shared Volume
export_pages_data_as_json (pages_data, OUTPUT_JSON)