## Azure AI Document Intelligence documentation

Azure AI Document Intelligence (formerly Form Recognizer) is a cloud-based Azure AI service that uses machine-learning models to automate your data processing in applications and workflows. Document Intelligence is essential for enhancing data-driven strategies and enriching document search capabilities.

https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/?view=doc-intel-3.1.0

In [None]:
%pip install azure-ai-formrecognizer==3.3.0
%pip install python-dotenv

In [8]:
from dotenv import load_dotenv
import os
import pandas as pd
import datetime 

# load the .env file from the same directory
load_dotenv()


"""
This code sample shows Prebuilt Layout operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Form Recognizer Python client library SDKs
https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recxognizer/quickstarts/try-v3-python-sdk
"""

from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""
endpoint = os.environ.get("YOUR_FORM_RECOGNIZER_ENDPOINT")
key = os.environ.get("YOUR_FORM_RECOGNIZER_KEY")

# sample document
formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

#formUrl = "https://www.bundesnetzagentur.de/DE/Beschlusskammern/1_GZ/BK8-GZ/2019/2019_5-Steller/BK8-19-00002_A_bis_BK8-19-00006A/Downloads/BK8-19-00002_A_bis_BK8-19-00006_A_Anlagen_Download_BF.pdf?__blob=publicationFile&v=2"

document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)
    
poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", formUrl)
result = poller.result()

# for idx, style in enumerate(result.styles):
#     print(
#         "Document contains {} content".format(
#          "handwritten" if style.is_handwritten else "no handwritten"
#         )
#     )

# for page in result.pages:
#     for line_idx, line in enumerate(page.lines):
#         print(
#          "...Line # {} has text content '{}'".format(
#         line_idx,
#         line.content.encode("utf-8")
#         )
#     )

#     for selection_mark in page.selection_marks:
#         print(
#          "...Selection mark is '{}' and has a confidence of {}".format(
#          selection_mark.state,
#          selection_mark.confidence
#          )
#     )


markdown_text = ''

# Iterate over each table in the result
for table_idx, table in enumerate(result.tables):

    # Add the table title information
    markdown_text += "## Table # {}\n*This table has {} rows and {} columns.*\n".format(
        table_idx, table.row_count, table.column_count
    )

    # Create the table header line in Markdown, add to markdown_text
    header_line = "| " + " | ".join([f"Column {i}" for i in range(1, table.column_count+1)]) + " |"
    separator_line = "| " + " | ".join(['---' for _ in range(1, table.column_count+1)]) + " |"

    markdown_text += header_line + '\n'
    markdown_text += separator_line + '\n'

    # Create a nested dictionary to hold cell contents properly
    table_in_md = {}

    for cell in table.cells:
        if cell.row_index not in table_in_md:
            table_in_md[cell.row_index] = {}
        table_in_md[cell.row_index][cell.column_index] = cell.content

    for row_index, row in sorted(table_in_md.items()):
        row_content = ["`{}`".format(cell_content) if cell_content else ' ' for cell_index, cell_content in sorted(row.items())]
        row_line = "| " + " | ".join(row_content) + " |"
        markdown_text += row_line + '\n'


# Finally printing out the markdown string
print(markdown_text)

for table_idx, table in enumerate(result.tables):

    table_in_csv = {}

    for cell in table.cells:
        if cell.row_index not in table_in_csv:
            table_in_csv[cell.row_index] = {}
        table_in_csv[cell.row_index][cell.column_index] = cell.content

    df = pd.DataFrame(table_in_csv).transpose()  # converting dictionary to dataframe
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # get current time in year-month-day-hoursminutesseconds format
    df.to_csv(f'table_{table_idx}_{current_time}.csv',sep=';', index=False)  # save each table as CSV


## Table # 0
*This table has 5 rows and 3 columns.*
| Column 1 | Column 2 | Column 3 |
| --- | --- | --- |
| `Title of each class` | `Trading Symbol` | `Name of exchange on which registered` |
| `Common stock, $0.00000625 par value per share` | `MSFT` | `NASDAQ` |
| `2.125% Notes due 2021` | `MSFT` | `NASDAQ` |
| `3.125% Notes due 2028` | `MSFT` | `NASDAQ` |
| `2.625% Notes due 2033` | `MSFT` | `NASDAQ` |
## Table # 1
*This table has 2 rows and 2 columns.*
| Column 1 | Column 2 |
| --- | --- |
| `Class` | `Outstanding as of April 24, 2020` |
| `Common Stock, $0.00000625 par value per share` | `7,583,440,247 shares` |



In [9]:
from dotenv import load_dotenv
import os
import pandas as pd
import datetime 

# load the .env file from the same directory
load_dotenv()


"""
This code sample shows Prebuilt Layout operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Form Recognizer Python client library SDKs
https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recxognizer/quickstarts/try-v3-python-sdk
"""

from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""
endpoint = os.environ.get("YOUR_FORM_RECOGNIZER_ENDPOINT")
key = os.environ.get("YOUR_FORM_RECOGNIZER_KEY")

# sample document
formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

#formUrl = "https://www.bundesnetzagentur.de/DE/Beschlusskammern/1_GZ/BK8-GZ/2019/2019_5-Steller/BK8-19-00002_A_bis_BK8-19-00006A/Downloads/BK8-19-00002_A_bis_BK8-19-00006_A_Anlagen_Download_BF.pdf?__blob=publicationFile&v=2"

document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)
    
poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", formUrl)
result = poller.result()

# for idx, style in enumerate(result.styles):
#     print(
#         "Document contains {} content".format(
#          "handwritten" if style.is_handwritten else "no handwritten"
#         )
#     )

# for page in result.pages:
#     for line_idx, line in enumerate(page.lines):
#         print(
#          "...Line # {} has text content '{}'".format(
#         line_idx,
#         line.content.encode("utf-8")
#         )
#     )

#     for selection_mark in page.selection_marks:
#         print(
#          "...Selection mark is '{}' and has a confidence of {}".format(
#          selection_mark.state,
#          selection_mark.confidence
#          )
#     )


markdown_text = ''

# Iterate over each table in the result
for table_idx, table in enumerate(result.tables):

    # Create a nested dictionary to hold cell contents properly
    data = {}
  
    for cell in table.cells:
        if cell.row_index not in data:
            data[cell.row_index] = {}
        data[cell.row_index][cell.column_index] = cell.content

    # Convert dictionary to DataFrame
    df = pd.DataFrame.from_dict(data, orient='index')
   
    # Sort DataFrame
    df.sort_index(axis=1, inplace=True)

    # Generate file name based on current time
    filename = datetime.now().strftime("%Y%m%d%H%M%S") + '.html'
    
    # Save DataFrame to HTML
    df.to_html(filename)

    print(f"Saved html table to {filename}")
‚‚
# Finally printing out the markdown string
print(markdown_text)

for table_idx, table in enumerate(result.tables):

    table_in_csv = {}

    for cell in table.cells:
        if cell.row_index not in table_in_csv:
            table_in_csv[cell.row_index] = {}
        table_in_csv[cell.row_index][cell.column_index] = cell.content

    df = pd.DataFrame(table_in_csv).transpose()  # converting dictionary to dataframe
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # get current time in year-month-day-hoursminutesseconds format
    df.to_csv(f'table_{table_idx}_{current_time}.csv',sep=';', index=False)  # save each table as CSV


AttributeError: module 'datetime' has no attribute 'now'

{{markdown_text}}