#### Imports:

In [1]:
import json
from typing import Any
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv
import os

#### Load credentials:

In [2]:
# Load environment variables from the .env file
load_dotenv("credentials.env")

# Get credentials from environment variables
endpoint = os.getenv("FORM_RECOGNIZER_ENDPOINT")
key = os.getenv("FORM_RECOGNIZER_KEY")

# Fail fast with an actionable message: os.getenv() returns None for unset
# variables, which would otherwise only surface later when the client is built.
_missing_settings = [
    name
    for name, value in (
        ("FORM_RECOGNIZER_ENDPOINT", endpoint),
        ("FORM_RECOGNIZER_KEY", key),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing Form Recognizer setting(s): "
        + ", ".join(_missing_settings)
        + ". Define them in credentials.env or in the process environment."
    )

#### Functions for extracting the text, tables, and structure of a PDF and saving them as .json:

In [3]:
def extract_text(result: Any) -> list:
    """
    Collects the content of every line on every page of the analysis result.

    The lines are returned in page order and, within a page, in the order
    in which the page lists them.

    @param result: The analysis result object from the Azure Form Recognizer.
    @type result: Any

    @return: A flat list with the content of each line.
    @rtype: list
    """
    return [
        entry.content
        for sheet in result.pages
        for entry in sheet.lines
    ]


def extract_structure(result: Any) -> list:
    """
    Builds a per-page description of the layout found in the analysis result.

    Each page becomes a dictionary with its page number, width, height and
    unit, plus a "lines" list. Every line entry holds the line content and
    the offset/length of each of its spans.

    @param result: The analysis result object from the Azure Form Recognizer.
    @type result: Any

    @return: A list with one dictionary per page.
    @rtype: list
    """
    def describe_line(entry: Any) -> dict:
        # One line: its content plus the offset/length pairs of its spans.
        span_info = [{"offset": s.offset, "length": s.length} for s in entry.spans]
        return {"content": entry.content, "spans": span_info}

    return [
        {
            "page_number": sheet.page_number,
            "width": sheet.width,
            "height": sheet.height,
            "unit": sheet.unit,
            "lines": [describe_line(entry) for entry in sheet.lines],
        }
        for sheet in result.pages
    ]

    
def _cell_to_dict(cell: Any) -> dict:
    """
    Converts a single table cell into a plain dictionary.

    @param cell: A table cell exposing row_index, column_index, content and
        bounding_regions.
    @type cell: Any

    @return: The cell position, content and bounding regions.
    @rtype: dict
    """
    regions = [
        {
            "page_number": region.page_number,
            # Copy the points into a fresh list; a missing polygon becomes [].
            "polygon": list(region.polygon or []),
        }
        for region in (cell.bounding_regions or [])
    ]
    return {
        "row_index": cell.row_index,
        "column_index": cell.column_index,
        "content": cell.content,
        "bounding_regions": regions,
    }


def extract_tables(result: Any) -> list:
    """
    Extracts table contents from the analysis result.

    Each table becomes a dictionary with its index, row and column counts,
    and a "cells" list. Collections that are None (tables, cells, bounding
    regions, polygon) are treated as empty instead of raising, so a result
    without tables yields an empty list.

    @param result: The analysis result object from the Azure Form Recognizer.
    @type result: Any

    @return: A list containing the table contents.
    @rtype: list
    """
    tables = []
    for table_idx, table in enumerate(result.tables or []):
        tables.append(
            {
                "table_index": table_idx,
                "row_count": table.row_count,
                "column_count": table.column_count,
                "cells": [_cell_to_dict(cell) for cell in (table.cells or [])],
            }
        )
    return tables


def save_json(data: Any, filename: str) -> None:
    """
    Saves the provided data as a JSON file.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are kept as-is rather than escaped. Missing parent
    directories of the target file are created first, so writing into a
    not-yet-existing output folder does not fail.

    @param data: The data to save.
    @type data: Any

    @param filename: The name of the file to save the data to.
    @type filename: str
    """
    parent_dir = os.path.dirname(filename)
    # An empty dirname means "current directory": nothing to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

#### Initialize document analysis:

In [4]:
# Build the Form Recognizer client from the endpoint and key loaded earlier
credential = AzureKeyCredential(key)
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint,
    credential=credential,
)

# PDF to analyze (replace the placeholder with a real URL) and the folder
# that receives the JSON output files
form_url = "LINK TO PDF"
base_path = "results/"

# Submit the document to the "prebuilt-layout" model and fetch the result
poller = document_analysis_client.begin_analyze_document_from_url(
    "prebuilt-layout",
    form_url,
)
result = poller.result()

#### Call methods:

In [5]:
# Line contents -> text.json
text_data = extract_text(result)
text_path = os.path.join(base_path, "text.json")
save_json(text_data, text_path)

# Per-page layout -> structures.json
structure_data = extract_structure(result)
structures_path = os.path.join(base_path, "structures.json")
save_json(structure_data, structures_path)

# Tables -> tables.json (also echoed to the cell output)
table_data = extract_tables(result)
print(table_data)
tables_path = os.path.join(base_path, "tables.json")
save_json(table_data, tables_path)

# Bundle the three extractions into a single document.json
document_data = {
    "text": text_data,
    "structures": structure_data,
    "tables": table_data,
}
document_path = os.path.join(base_path, "document.json")
save_json(document_data, document_path)

[{'table_index': 0, 'row_count': 13, 'column_count': 5, 'cells': [{'row_index': 0, 'column_index': 0, 'content': 'Komponente', 'bounding_regions': [{'page_number': 1, 'polygon': [Point(x=0.5043, y=4.5279), Point(x=1.9508, y=4.5279), Point(x=1.9583, y=4.7516), Point(x=0.5043, y=4.7516)]}]}, {'row_index': 0, 'column_index': 1, 'content': 'Zur Befestigung von', 'bounding_regions': [{'page_number': 1, 'polygon': [Point(x=1.9508, y=4.5279), Point(x=3.3974, y=4.5279), Point(x=3.3974, y=4.7516), Point(x=1.9583, y=4.7516)]}]}, {'row_index': 0, 'column_index': 2, 'content': 'Schraubentyp', 'bounding_regions': [{'page_number': 1, 'polygon': [Point(x=3.3974, y=4.5279), Point(x=4.8588, y=4.5279), Point(x=4.8588, y=4.7516), Point(x=3.3974, y=4.7516)]}]}, {'row_index': 0, 'column_index': 3, 'content': 'Menge', 'bounding_regions': [{'page_number': 1, 'polygon': [Point(x=4.8588, y=4.5279), Point(x=6.2829, y=4.5279), Point(x=6.2829, y=4.7516), Point(x=4.8588, y=4.7516)]}]}, {'row_index': 0, 'column_ind