### Please install the required Python modules/SDKs

In [None]:
# NOTE(review): `!` hands the command to a shell, so this `activate` presumably
# does not change the environment of the running kernel -- confirm it is needed.
! activate ai-azure-c1
import sys
# Make the packages installed in the ai-azure-c1 conda environment importable here.
sys.path.append("/opt/conda/envs/ai-azure-c1/lib/python3.8/site-packages")

# Form Recognizer: Text & Layout Extraction Demo

## Importing Azure Form Recognizer Python modules

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import FormRecognizerClient

In [None]:
# Prefer credentials supplied through environment variables so the key never
# has to be saved inside the notebook. The placeholder strings remain the
# fallback and can still be replaced by hand when no variable is set.
import os

AZURE_FORM_RECOGNIZER_ENDPOINT = os.environ.get(
    "AZURE_FORM_RECOGNIZER_ENDPOINT", "ENTER FORM RECOGNIZER ENDPOINT"
)
AZURE_FORM_RECOGNIZER_KEY = os.environ.get(
    "AZURE_FORM_RECOGNIZER_KEY", "ENTER FORM RECOGNIZER KEY"
)

In [None]:
# Short aliases for the configured values; these are what the client
# constructor call below receives.
endpoint = AZURE_FORM_RECOGNIZER_ENDPOINT
key = AZURE_FORM_RECOGNIZER_KEY

## Instantiating the Form Recognizer Client

In [None]:
# Build the client from the endpoint and a key-based credential.
form_recognizer_client = FormRecognizerClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

## Source Document

In [None]:
# URL of the PDF (f1040.pdf) that is submitted for recognition below.
content_url = "https://github.com/udacity/cd0461-building-computer-vision-solutions-with-azure-exercises/raw/main/resources/f1040.pdf"

In [None]:
# Submit the document at content_url for content recognition. The returned
# object is not the data itself; the next step calls .result() on it.
result_from_url = form_recognizer_client.begin_recognize_content_from_url(content_url)

In [None]:
# Retrieve the recognition output. It is indexed per page further down
# (result_data[0] is page 1, result_data[1] is page 2).
result_data = result_from_url.result()

## Raw Data

In [None]:
# Show the raw result object as the cell output.
result_data

In [None]:
# Number of entries in the result (one per page, given the indexing below).
len(result_data)

In [None]:
# Page 1: first entry of the recognition result.
content = result_data[0]

In [None]:
# Page 2: second entry of the recognition result.
# Raises IndexError if the result holds fewer than two entries.
content_page_2 = result_data[1]

In [None]:
# Show the to_dict() output for page 1 as the cell output.
content.to_dict()

In [None]:
# Show the to_dict() output for page 2 as the cell output.
content_page_2.to_dict()

## Accessing all the Lines

In [None]:
# Number of text lines recognized on page 1.
len(content.lines)

## Exploring the `content` Object (Try It Yourself)

In [None]:
# Same to_dict() call as earlier in the notebook, repeated as a starting point
# for exploring page 1.
content.to_dict()

In [None]:
# Report the dimensions of page 1 together with their unit.
print(
    f"Page has width: {content.width} and height: {content.height}, "
    f"measured with unit: {content.unit}"
)

In [None]:
# Same to_dict() call as earlier in the notebook, repeated as a starting point
# for exploring page 2.
content_page_2.to_dict()

In [None]:
# Report the dimensions of page 2 together with their unit.
print(
    f"Page has width: {content_page_2.width} and height: {content_page_2.height}, "
    f"measured with unit: {content_page_2.unit}"
)


### Utility Functions

In [None]:
# --------------------------
# Code Snippet Source 
# https://github.com/Azure-Samples/cognitive-services-quickstart-code/blob/master/python/ComputerVision/REST/python-disk.md
# --------------------------
def format_bounding_box(bounding_box):
    """Render a bounding box as a comma-separated list of "[x, y]" points.

    Each element of ``bounding_box`` must expose ``x`` and ``y`` attributes.
    Returns "N/A" when ``bounding_box`` is None or empty.
    """
    if bounding_box:
        return ", ".join(f"[{point.x}, {point.y}]" for point in bounding_box)
    return "N/A"

## All lines and words extracted from page 1 of the Form 1040

In [None]:
# --------------------------
# Code Snippet Source 
# https://github.com/Azure-Samples/cognitive-services-quickstart-code/blob/master/python/ComputerVision/REST/python-disk.md
# --------------------------

# For every text line on page 1: print its word count, text and bounding box,
# flag lines whose appearance is confidently "handwriting", then list each word
# with its confidence.
for line_idx, line in enumerate(content.lines):
    print(
        f"Line # {line_idx} has word count '{len(line.words)}' "
        f"and text '{line.text}' "
        f"within bounding box '{format_bounding_box(line.bounding_box)}'"
    )
    # Appearance may be absent, so it is checked before its fields are read.
    if (
        line.appearance
        and line.appearance.style_name == "handwriting"
        and line.appearance.style_confidence > 0.8
    ):
        print(f"Text line '{line.text}' is handwritten and might be a signature.")
    for word in line.words:
        print(f"...Word '{word.text}' has a confidence of {word.confidence}")

## Tables with Rows and Columns in the Form 1040

In [None]:
# For every table on page 1: print its dimensions and location, then each
# cell's row/column index, text and bounding box.
for table_idx, table in enumerate(content.tables):
    print(f"Table # {table_idx} has {table.row_count} rows and {table.column_count} columns")
    print(f"Table # {table_idx} location on page: {format_bounding_box(table.bounding_box)}")
    for cell in table.cells:
        print(
            f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.text}' "
            f"within bounding box '{format_bounding_box(cell.bounding_box)}'"
        )

## All Selection Marks with their current state in the Form 1040

In [None]:
# --------------------------
# Code Snippet Source 
# https://github.com/Azure-Samples/cognitive-services-quickstart-code/blob/master/python/ComputerVision/REST/python-disk.md
# --------------------------
# For every selection mark on page 1: print its state, bounding box and
# confidence.
for selection_mark in content.selection_marks:
    print(
        f"Selection mark is '{selection_mark.state}' "
        f"within bounding box '{format_bounding_box(selection_mark.bounding_box)}' "
        f"and has a confidence of {selection_mark.confidence}"
    )