# Extract data from a Sample Form which has been filled up with hand written text

## Prerequisites
1. To run the code, install the following packages. Please use the latest pre-release version `pip install azure-ai-formrecognizer==3.3.0`.


- > ! pip install azure-ai-formrecognizer==3.3.0

## Load all the API keys, parameters and login credentials

In [3]:
import fr

MY_FORM_RECOGNIZER_ENDPOINT = 'https://tr-docai-form-recognizer.cognitiveservices.azure.com/'

formRecognizerCredential = fr.getFormRecognizerCredential()

from azure.ai.formrecognizer import DocumentAnalysisClient
document_analysis_client = DocumentAnalysisClient(
                                endpoint=MY_FORM_RECOGNIZER_ENDPOINT, 
                                credential=formRecognizerCredential
                            )


Got Azure Form Recognizer API Key from environment variable


## Document Extraction Examples

### Auto Insurance Claims form by hand

- Custom Trained model
- Display label, data and confidence (document level and indivudual field level)
- Text, Checkbox, radio button

In [10]:
formUrl = "https://trxdocaixblob.blob.core.windows.net/docai/test-claims-docs/IC-handwritten-WilliamWordsworth.pdf"

MY_CLAIMS_MODEL_ID = 'claims-v2'

# Make sure your document's type is included in the list of document types the custom model can analyze
poller = document_analysis_client.begin_analyze_document_from_url(MY_CLAIMS_MODEL_ID, formUrl)
result = poller.result()

for idx, document in enumerate(result.documents):
    #print("Document has {}", document)
    print("--------Analyzing document #{}--------".format(idx + 1))
    print("Document has type {}".format(document.doc_type))
    print("Document has confidence {}".format(document.confidence))
    print("Document was analyzed by model with ID {}".format(result.model_id))
    for name, field in document.fields.items():
        field_value = field.value if field.value else field.content
        print("{}[type:{};conf:{}] = '{}'".format(name, field.value_type, field.confidence, field_value))


# iterate over tables, lines, and selection marks on each page
#for page in result.pages:
#    print("\nLines found on page {}".format(page.page_number))
#    for line in page.lines:
#        print("...Line '{}'".format(line.content.encode('utf-8')))
#    for word in page.words:
#        print(
#            "...Word '{}' has a confidence of {}".format(
#                word.content.encode('utf-8'), word.confidence
#            )
#        )
#    for selection_mark in page.selection_marks:
#        print(
#            "...Selection mark is '{}' and has a confidence of {}".format(
#                selection_mark.state, selection_mark.confidence
#            )
#        )

#for i, table in enumerate(result.tables):
#    print("\nTable {} can be found on page:".format(i + 1))
#    for region in table.bounding_regions:
#        print("...{}".format(i + 1, region.page_number))
#    for cell in table.cells:
#        print(
#            "...Cell[{}][{}] has content '{}'".format(
#                cell.row_index, cell.column_index, cell.content.encode('utf-8')
#            )
#        )
#print("-----------------------------------")


--------Analyzing document #1--------
Document has type claims-v2
Document has confidence 0.987
Document was analyzed by model with ID claims-v2
FormType[type:string;conf:0.912] = 'Auto Insurance Claim Document'
Name[type:string;conf:0.963] = 'William Wordsworth'
Address[type:string;conf:0.876] = '39 Washington Street, New York City, NY 10003'
Phone[type:string;conf:0.945] = '+1 123 465 1637'
Email[type:string;conf:0.98] = 'dummy3@3.com'
PolicyNumber[type:string;conf:0.964] = 'TRI 813654329'
IncidentDate[type:string;conf:0.98] = '5/31/2023'
IncidentTime[type:string;conf:0.959] = '11 pm BST'
IncidentLocation[type:string;conf:0.905] = '2 Daffodil Street, New York City, NY 1002'
IncidentDescription[type:string;conf:0.92] = 'Another Car changed lane and hit my car on the driver side.'
VehicleOwner[type:string;conf:0.975] = 'NA'
VehicleMakeAndModel[type:string;conf:0.948] = '2011 Kia Sorento'
VIN[type:string;conf:0.983] = '5XYKU4A12BG001739'
LicensePlateNumber[type:string;conf:0.97] = 'FWK-

### SEC Quarterly report 
- PDF
- Title
- Checkboxes
- Table

In [4]:
# sample document
formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", formUrl)
result = poller.result()

for idx, style in enumerate(result.styles):
    print(
        "Document contains {} content".format(
         "handwritten" if style.is_handwritten else "no handwritten"
        )
    )

for page in result.pages:
    for line_idx, line in enumerate(page.lines):
        print(
         "...Line # {} has text content '{}'".format(
        line_idx,
        line.content.encode("utf-8")
        )
    )

    for selection_mark in page.selection_marks:
        print(
         "...Selection mark is '{}' and has a confidence of {}".format(
         selection_mark.state,
         selection_mark.confidence
         )
    )

for table_idx, table in enumerate(result.tables):
    print(
        "Table # {} has {} rows and {} columns".format(
        table_idx, table.row_count, table.column_count
        )
    )
        
    for cell in table.cells:
        print(
            "...Cell[{}][{}] has content '{}'".format(
            cell.row_index,
            cell.column_index,
            cell.content.encode("utf-8"),
            )
        )

print("----------------------------------------")

Document contains handwritten content
...Line # 0 has text content 'b'UNITED STATES''
...Line # 1 has text content 'b'SECURITIES AND EXCHANGE COMMISSION''
...Line # 2 has text content 'b'Washington, D.C. 20549''
...Line # 3 has text content 'b'FORM 10-Q''
...Line # 4 has text content 'b'X''
...Line # 5 has text content 'b'QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF''
...Line # 6 has text content 'b'1934''
...Line # 7 has text content 'b'For the Quarterly Period Ended March 31, 2020''
...Line # 8 has text content 'b'OR''
...Line # 9 has text content 'b'TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF''
...Line # 10 has text content 'b'1934''
...Line # 11 has text content 'b'For the Transition Period From''
...Line # 12 has text content 'b'to''
...Line # 13 has text content 'b'Commission File Number 001-37845''
...Line # 14 has text content 'b'MICROSOFT CORPORATION''
...Line # 15 has text content 'b'WASHINGTON''
..

### Auto Insurance Claims form by hand

- Custom Trained model
- Line by line dump of extracted data
- Table rows and columns display

In [5]:
formUrl = "https://trxdocaixblob.blob.core.windows.net/docai/test-claims-docs/IC-handwritten-WilliamWordsworth.pdf"

poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", formUrl)
result = poller.result()

for idx, style in enumerate(result.styles):
    print(
        "Document contains {} content".format(
         "handwritten" if style.is_handwritten else "no handwritten"
        )
    )

for page in result.pages:
    for line_idx, line in enumerate(page.lines):
        print(
         "...Line # {} has text content '{}'".format(
        line_idx,
        line.content.encode("utf-8")
        )
    )

    for selection_mark in page.selection_marks:
        print(
         "...Selection mark is '{}' and has a confidence of {}".format(
         selection_mark.state,
         selection_mark.confidence
         )
    )

for table_idx, table in enumerate(result.tables):
    print(
        "Table # {} has {} rows and {} columns".format(
        table_idx, table.row_count, table.column_count
        )
    )
        
    for cell in table.cells:
        print(
            "...Cell[{}][{}] has content '{}'".format(
            cell.row_index,
            cell.column_index,
            cell.content.encode("utf-8"),
            )
        )

print("----------------------------------------")

Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
...Line # 0 has text content 'b'TR INSURED''
...Line # 1 has text content 'b'A Test P&C INSURANCE Company''
...Line # 2 has text content 'b'Auto Insurance Claim Document''
...Line # 3 has text content 'b'Customer Information''
...Line # 4 has text content 'b'Name William Wordsworth''
...Line # 5 has text content 'b'Address 39 Washington Street, New York City, NY 10003''
...Line # 6 has text content 'b'Phone Number +1 123 465 1637''
...Line # 7 has text content 'b'Email dummy3@3.com''
...Line # 8 has text content 'b'Policy Number TRI 813654329''
...Line # 9 has text content 'b'Incident Information''
...Line # 10 has text content 'b'Date of Incident 5/31/2023''
...Line # 11 has text conten