# Sample Form filled with handwritten text

- Extract using Azure Document Intelligence Service
- Post-processing error fixes with Azure OpenAI (AOAI) GPT-4

## Prerequisites
1. To run the code, install the following package, pinned to version 1.0.0: `pip install azure-ai-documentintelligence==1.0.0`.


- > ! pip install azure-ai-documentintelligence==1.0.0

## Load all the API keys, parameters and login credentials

In [6]:
import os
import docintel

# Connection settings for your Azure Document Intelligence Service instance.
# Both values are read from the environment so no secret is stored in the notebook.
DOC_INTEL_ENDPOINT = os.environ.get('FORM_RECOGNIZER_ENDPOINT')
DOC_INTEL_API_KEY = os.environ.get("FORM_RECOGNIZER_API_KEY")

# Build the credential and the client through the project's `docintel` helpers;
# the extraction cells below pass this client to every request.
documentIntelligenceCredential = docintel.getDocumentIntelligenceCredential(
    DOC_INTEL_API_KEY
)
documentIntelligenceClient = docintel.getDocumentIntelligenceClient(
    endpoint=DOC_INTEL_ENDPOINT,
    credential=documentIntelligenceCredential,
)


Got Azure Form Recognizer API Key from environment variable


In [7]:
# Local sample document to analyze. The commented-out line is the auto-insurance
# claim sample; switch it together with the matching model id below.
# NOTE(review): this is an absolute, user-specific path - point it at your own copy.
MY_TEST_DOCUMENT = r'C:\Users\tibarar\OneDrive - Microsoft\Desktop\DocAI - LocalDocs\CommercialInsuranceApplications\MacMohanFinancialAdvisor-COMMERCIAL INSURANCE APPLICATION.pdf'
#MY_TEST_DOCUMENT = r'C:\Users\tibarar\OneDrive - Microsoft\Desktop\DocAI - LocalDocs\AutoInsuranceClaims\InsuranceClaim-WilliamWordworth.pdf'

# The model id should match the custom model you have trained and deployed in
# the Azure Document Intelligence Service instance behind DOC_INTEL_ENDPOINT.
MY_CLAIMS_MODEL_ID = 'commercialInsuranceApplicationExtraction-v1'
#MY_CLAIMS_MODEL_ID = 'autoInsuranceClaimExtraction-v2'

# Run the custom extraction model against the local file.
fr_api_version, model_id, is_handwritten, result = docintel.extractResultFromLocalDocument(
    client=documentIntelligenceClient,
    model=MY_CLAIMS_MODEL_ID,
    filepath=MY_TEST_DOCUMENT,
)

# Adjacent f-strings (instead of backslash continuations inside one literal) keep
# the source indentation out of the printed report.
print(
    f'Document Intelligence API version = {fr_api_version}\n'
    f'Document Extraction Model Id = {model_id}\n'
    f'Does document have any hand written text? {is_handwritten}\n'
)

# Later cells guard against a missing/empty `documents`; do the same here so
# len() cannot fail on None.
doc_count = len(result.documents or [])
print(f'Document count = {doc_count}')

Document Intelligence API version = 2024-11-30
         Document Extraction Model Id = commercialInsuranceApplicationExtraction-v1
         Does document have any hand written text? True

Document count = 1


## Print returned document fields in API response

In [24]:
# Summarize each analyzed document, then list every extracted field with its type,
# confidence and value (the string value when present, otherwise the raw content).
for idx, document in enumerate(result.documents or []):
    print(f'Document {idx} ---------------')
    print(f"Document has type {document.doc_type}")
    print(f'Document extraction confidence = {document.confidence}')
    print(f"Document was analyzed with model with ID {result.model_id}")
    for name, field in (document.fields or {}).items():
        field_value = field.get("valueString") or field.content
        print("\t{}[type:{};confidence:{}] = '{}'".format(name, field.type, field.confidence, field_value))
print("-----------------------------------")


Document 0 ---------------
Document has type commercialInsuranceApplicationExtraction-v1
Document extraction confidence = 0.999
Document was analyzed with model with ID commercialInsuranceApplicationExtraction-v1
	FormType[type:DocumentFieldType.STRING;confidence:0.954] = 'COMMERCIAL INSURANCE APPLICATION APPLICANT INFORMATION SECTION'
	ApplicationDate[type:DocumentFieldType.STRING;confidence:0.981] = '11/20/2023'
	AgencyName[type:DocumentFieldType.STRING;confidence:0.932] = 'Alex & Chen Brokers Inc'
	AgencyAddress[type:DocumentFieldType.STRING;confidence:0.913] = '7268 Brookridge Central Blvd
Brooksville
FL 34613
FL USA'
	CarrierName[type:DocumentFieldType.STRING;confidence:0.978] = 'GCML'
	NAICCode[type:DocumentFieldType.STRING;confidence:0.668] = '123'
	CompanyName[type:DocumentFieldType.STRING;confidence:0.974] = 'MacMohan Financial Advisors LLC'
	ProgramCode[type:DocumentFieldType.STRING;confidence:0.982] = 'FA-2354'
	PolicyNumber[type:DocumentFieldType.STRING;confidence:0.983] = 

## Extract data from tables

In [50]:
from collections import Counter

def compose_table(header_names, table_data):
    """Return the table as a list of rows with the header row first.

    Args:
        header_names: Column titles. The list is stored as-is (not copied) as
            the first row of the result.
        table_data: Iterable of row lists. Falsy rows (``None`` or empty) are
            skipped.

    Returns:
        A new list of the form ``[header_names, row, row, ...]``.
    """
    # The previous implementation also pre-computed column widths for a print
    # layout that was commented out. That unused pass indexed every row by
    # column, so it raised on None, empty or short rows before the filter
    # below had a chance to drop them.
    table_with_header = [header_names]
    table_with_header.extend(row for row in table_data if row)
    return table_with_header

# Type tags and keys used by the document field payloads.
_ARRAY_FIELD_TYPE = "array"
_OBJECT_FIELD_TYPE = "object"
_VALUE_OBJECT_KEY = "valueObject"
_CELL_CONTENT_KEY = "content"


def _dynamic_table_parts(value_array):
    """Return ``(column names, rows)`` for a dynamic table stored as an array field.

    Column names come from the first array element; elements without a
    ``valueObject`` are ignored.
    """
    first_item = value_array[0]
    col_names = (
        list(first_item[_VALUE_OBJECT_KEY].keys())
        if _VALUE_OBJECT_KEY in first_item
        else []
    )
    rows = []
    for item in value_array:
        if _VALUE_OBJECT_KEY not in item:
            continue
        cells = item[_VALUE_OBJECT_KEY]
        # Cells that are missing, or that carry no content, are rendered as the
        # literal string "None" (unchanged from the original behavior).
        rows.append([
            cells[name].get(_CELL_CONTENT_KEY)
            if name in cells and _CELL_CONTENT_KEY in cells[name]
            else "None"
            for name in col_names
        ])
    return col_names, rows


def _fixed_table_parts(value_object):
    """Return ``(header, rows)`` for a fixed table, or ``None`` if not table-shaped.

    The field counts as a fixed table when every entry is itself an object field
    and all entries expose the same set of inner keys.
    """
    columns = list(value_object.values())

    def _inner_keys(column):
        # None marks an entry that is not an object field or has no valueObject,
        # so a malformed entry disqualifies the field instead of raising KeyError.
        if column.get("type") != _OBJECT_FIELD_TYPE:
            return None
        inner = column.get(_VALUE_OBJECT_KEY)
        return None if inner is None else Counter(inner.keys())

    key_sets = [_inner_keys(column) for column in columns]
    if any(keys is None or keys != key_sets[0] for keys in key_sets):
        return None

    # The first header cell is blank because each row starts with its own key.
    header = [""] + list(value_object.keys())
    rows_by_key = {}
    for column in columns:
        for row_key, cell in column[_VALUE_OBJECT_KEY].items():
            rows_by_key.setdefault(row_key, [row_key]).append(cell.get(_CELL_CONTENT_KEY))
    return header, list(rows_by_key.values())


def extract_tables(result):
    """Collect every table-shaped field of an analysis result.

    Args:
        result: Analysis result exposing ``documents``; each document exposes
            ``fields`` (a mapping of field name to field). Fields are read via
            ``.type``, ``.value_array`` and mapping-style access.

    Returns:
        A dict mapping ``"<n>-dynamicTable:<field>"`` or
        ``"<n>-fixedTable:<field>"`` to the output of ``compose_table`` (header
        row first). ``<n>`` is a running counter across all documents. Empty
        when there are no documents or no table-shaped fields.
    """
    extracted_tables = {}
    if not result.documents:
        return extracted_tables

    table_count = 0
    for doc in result.documents:
        if doc.fields is None:
            continue
        for field_name, field_value in doc.fields.items():
            if field_value.type == _ARRAY_FIELD_TYPE and field_value.value_array:
                # Dynamic table: cell information is stored as an array in the field.
                table_name = f'dynamicTable:{field_name}'
                col_names, rows = _dynamic_table_parts(field_value.value_array)
            elif (
                field_value.type == _OBJECT_FIELD_TYPE
                and _VALUE_OBJECT_KEY in field_value
                and field_value[_VALUE_OBJECT_KEY] is not None
            ):
                # Fixed table: cell information is stored as an object in the field.
                parts = _fixed_table_parts(field_value[_VALUE_OBJECT_KEY])
                if parts is None:
                    continue
                table_name = f'fixedTable:{field_name}'
                col_names, rows = parts
            else:
                continue

            extracted_tables[f'{table_count}-{table_name}'] = compose_table(col_names, rows)
            table_count += 1
    return extracted_tables

# Pull every table-shaped field out of the latest extraction result and print
# each table under its generated name.
theTables = extract_tables(result)

for tableName, tableRows in theTables.items():
    print(f'Table Name: {tableName}')
    print(tableRows)
    print("----------------------")

Table Name: 0-fixedTable:LOBTable
[['', 'ROW1', 'ROW2', 'ROW3', 'ROW4', 'ROW5', 'ROW6', 'ROW7'], ['LOBNameCol1', 'BOILER & MACHINERY', 'BUSINESS AUTO', 'BUSINESS OWNERS', 'COMMERCIAL GENERAL LIABILITY', 'COMMERCIAL INLAND MARINE', 'COMMERCIAL PROPERTY', 'CRIME'], ['LOBPremiumCol1', '$', '$', '$', '$ 9500', '$', '$ 3500', '$'], ['LOBIndicatorCol2', 'X', None, None, None, None, None, 'X'], ['LOBNameCol2', 'CYBER AND PRIVACY', 'FIDUCIARY LIABILITY', 'GARAGE AND DEALERS', 'LIQUOR LIABILITY', 'MOTOR CARRIER', 'TRUCKERS', 'UMBRELLA'], ['LOBPremiumCol2', '$ 6500', '$', '$', '$', '$', '$', '$ 1000'], ['LOBNameCol3', 'YACHT', None, None, None, None, None, None], ['LOBPremiumCol3', '$', '$', '$', '$', '$', '$', '$'], ['LOBIndicatorCol1', None, None, None, 'X', None, 'X', None], ['LOBIndicatorCol3', None, None, None, None, None, None, None]]
----------------------
Table Name: 1-fixedTable:SafetyCodeViolationHistory
[['', 'ROW1', 'ROW2'], ['OCCUR DATE', None, None], ['EXPLANATION', None, None], ['

## Extract documents from files in blob store

In [2]:
# URL of the test document in blob storage, supplied through the environment.
test_file_url = os.getenv('BLOB_TEST_FILE_JEAN_GENET_SAS_URL')

# NOTE(review): the saved output of this cell reports model
# 'autoInsuranceClaimExtraction-v2', while MY_CLAIMS_MODEL_ID is set to the
# commercial-insurance model above - confirm which model id this document needs.
fr_api_version, model_id, is_handwritten, result = docintel.extractResultFromOnlineDocument(
    client=documentIntelligenceClient,
    model=MY_CLAIMS_MODEL_ID,
    url=test_file_url,
)

# Adjacent f-strings (instead of backslash continuations inside one literal) keep
# the source indentation out of the printed report.
print(
    f'Document Intelligence API version = {fr_api_version}\n'
    f'Document Extraction Model Id = {model_id}\n'
    f'Does document have any hand written text? {is_handwritten}\n'
)

# Other cells guard against a missing/empty `documents`; do the same here so
# len() cannot fail on None.
doc_count = len(result.documents or [])
print(f'Document count = {doc_count}')

Document Intelligence API version = 2024-11-30
         Document Extraction Model Id = autoInsuranceClaimExtraction-v2
         Does document have any hand written text? True

Document count = 1


In [3]:
# Dump the raw `documents` payload of the most recent extraction for inspection.
print(f'documents:{result.documents}')

documents:[{'docType': 'autoInsuranceClaimExtraction-v2:autoInsuranceClaimExtraction-v2', 'boundingRegions': [{'pageNumber': 1, 'polygon': [0, 0, 8.2639, 0, 8.2639, 10.6389, 0, 10.6389]}, {'pageNumber': 2, 'polygon': [0, 0, 8.2639, 0, 8.2639, 10.625, 0, 10.625]}], 'fields': {'VIN': {'type': 'string', 'valueString': 'KLHAB2C24X3615045', 'content': 'KLHAB2C24X3615045', 'boundingRegions': [{'pageNumber': 1, 'polygon': [3.88, 8.31, 6.55, 8.31, 6.55, 8.55, 3.88, 8.55]}], 'confidence': 0.99, 'spans': [{'offset': 631, 'length': 17}]}, 'Address': {'type': 'string', 'valueString': '12 Independence Way, Boston, MA 11201', 'content': '12 Independence Way, Boston, MA 11201', 'boundingRegions': [{'pageNumber': 1, 'polygon': [1.735, 4.065, 6.575, 4.065, 6.575, 4.36, 1.735, 4.36]}], 'confidence': 0.99, 'spans': [{'offset': 115, 'length': 37}]}, 'DateSigned': {'type': 'string', 'valueString': '2/2/2023', 'content': '2/2/2023', 'boundingRegions': [{'pageNumber': 2, 'polygon': [5.885, 7.97, 7.135, 7.97,

## Post Processing after extraction, to fix errors

<b><u>Example</u></b>  
The <b>IncidentTime</b> extracted from the document shows - '11 pm <font color=red>BST</font>'  
The <b>IncidentLocation</b> extracted from the document shows - '2 Daffodil Street, New York City, NY 1002'  
The error here is in the time zone. It should be <b>EDT</b> instead of <b>BST</b>  
Let's fix it with GPT-4 using AOAI  


#### Load the AOAI keys and parameters

In [6]:
import aoai

# Azure OpenAI (AOAI) endpoint, API version and deployment names used for post-processing.
MY_AOAI_ENDPOINT = 'https://tr-non-prod-gpt4.openai.azure.com/'
MY_AOAI_VERSION = '2023-07-01-preview'
MY_GPT_ENGINE = 'tr-gpt4'
MY_AOAI_EMBEDDING_ENGINE = 'tr-embedding-ada'

# Configure the client through the project's `aoai` helper; this cell treats any
# positive status as success.
status = aoai.setupOpenai(
    aoai_endpoint=MY_AOAI_ENDPOINT,
    aoai_version=MY_AOAI_VERSION,
)
print("AOAI setup succeeded" if status > 0 else "AOAI setup failed")


Got OPENAI API Key from environment variable
AOAI setup succeeded


#### Ask GPT-4 to fix the error

In [7]:
# Values extracted from the claim document; the time carries the wrong timezone.
my_location = '2 Daffodil Street, New York City, NY 1002'
my_time = '11 pm BST'
my_date = '5/31/2023'

# Note: the backslash continuation keeps the second line's leading spaces
# inside the prompt text.
my_task = f'Replace the timezone only in {my_time} with the timezone of the location in {my_location}, \
            given the date {my_date}'

# Single-turn chat: the whole task is sent as one user message.
my_prompt = [{"role": "user", "content": my_task}]
tokens_used, finish_reason, aoai_answer = aoai.getChatCompletion(
    the_engine=MY_GPT_ENGINE,
    the_messages=my_prompt,
)

print(f"Tokens: {tokens_used}")
print(f"Finish Reason: {finish_reason}")
print(f"Answer: {aoai_answer}")

Tokens: 54
Finish Reason: stop
Answer: 11 pm EDT


## If you want to read all the raw OCR data from the extraction

#### View the extracted raw data pages, tables...

In [7]:
# Walk the raw OCR layout of the most recent extraction: text lines and selection
# marks per page, then every cell of every detected table.
# The collections are guarded with `or []` so a result that omits lines,
# selection marks or tables is skipped instead of raising on None.
for page in result.pages or []:
    for line_idx, line in enumerate(page.lines or []):
        # Print the text itself; encoding to UTF-8 first would print a bytes
        # repr (b'...') nested inside the quotes.
        print("...Line # {} has text content '{}'".format(line_idx, line.content))

    for selection_mark in page.selection_marks or []:
        print(
            "...Selection mark is '{}' and has a confidence of {}".format(
                selection_mark.state,
                selection_mark.confidence,
            )
        )

for table_idx, table in enumerate(result.tables or []):
    print(
        "Table # {} has {} rows and {} columns".format(
            table_idx, table.row_count, table.column_count
        )
    )

    for cell in table.cells or []:
        print(
            "...Cell[{}][{}] has content '{}'".format(
                cell.row_index,
                cell.column_index,
                cell.content,
            )
        )

print("----------------------------------------")

...Line # 0 has text content 'b'TR INSURED''
...Line # 1 has text content 'b'A Test P&C INSURANCE Company''
...Line # 2 has text content 'b'Auto Insurance Claim Document''
...Line # 3 has text content 'b'Customer Information''
...Line # 4 has text content 'b'Name William Wordsworth''
...Line # 5 has text content 'b'Address 39 Washington Street, New York City, NY 10003''
...Line # 6 has text content 'b'Phone Number +1 123 465 1637''
...Line # 7 has text content 'b'Email dummy3@3.com''
...Line # 8 has text content 'b'Policy Number TRI 813654329''
...Line # 9 has text content 'b'Incident Information''
...Line # 10 has text content 'b'Date of Incident 5/31/2023''
...Line # 11 has text content 'b'Time of Incident 11 pm BST''
...Line # 12 has text content 'b'Location of Incident 2 Daffodil Street, New York City, NY 1002''
...Line # 13 has text content 'b'Description of Incident Another Car changed lane and hit''
...Line # 14 has text content 'b'my car on the driver side.''
...Line # 15 has t