# Document AI Synchronous API
This notebook shows you how to use Python to make synchronous calls to the Document AI API.

You must replace the `processor_id` variable value in the second cell with the Processor ID of the Document AI processor that you want to use. The processor may not support all of the Document AI output properties; for example, entity data is only returned by processors that use specialized parsers.

In [None]:
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from prettytable import PrettyTable

# Notebook-wide settings used as defaults by the cells below.
# NOTE: `%system` is IPython magic syntax, so this cell only runs inside
# IPython/Jupyter. The result is indexed on the next line and only its first
# element is kept as the project ID.
project_id=%system gcloud config get-value core/project
project_id=project_id[0]
location = 'us'           # Replace with 'eu' if processor does not use 'us' location
file_path = 'form.pdf'    # Replace this with the name of the pdf file to process if necessary


In [None]:
# Set your Processor ID
# Run this cell before the cell that defines process_document(): that function
# reads `processor_id` as a default argument value when it is defined.
processor_id = '23c5443efe9f90b5' # TODO: Replace with a valid Processor ID 

In [None]:
# Process Document Function
def process_document(
        project_id=project_id, location=location,
        processor_id=processor_id,  file_path=file_path,
        mime_type="application/pdf"
):
    """Send a local file to a Document AI processor and return the result.

    The request is synchronous: the call blocks until the processor responds.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Location of the processor (for example 'us' or 'eu').
        processor_id: ID of the processor; it must already exist in the
            Cloud Console.
        file_path: Path of the local file to upload.
        mime_type: MIME type declared for the uploaded content. Defaults to
            "application/pdf", which matches the sample form.

    Returns:
        The ``document`` attribute of the processor response.
    """
    # Processors outside 'us' are served from a location-specific endpoint;
    # the library default is kept for 'us' so existing behaviour is unchanged.
    client_options = None
    if location != "us":
        client_options = {
            "api_endpoint": f"{location}-documentai.googleapis.com"
        }
    client = documentai.DocumentProcessorServiceClient(
        client_options=client_options
    )

    # Full resource name of the processor:
    # projects/<project-id>/locations/<location>/processors/<processor-id>
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as source_file:
        file_content = source_file.read()

    # Configure the process request
    document = {"content": file_content, "mime_type": mime_type}
    request = {"name": name, "document": document}

    # Use the Document AI client to process the file
    result = client.process_document(request=request)
    return result.document

In [None]:
# Process Document 
# Run the synchronous request using the defaults configured in the cells above.
document = process_document()
print("Document processing complete.")
# The detected text is printed for every processor type.
document_text = document.text
print("Text: {}".format(document_text))

In [None]:
# Get Text Function
# Define a function that returns the text a document element refers to
def get_text(doc_element, document) -> str:
    """Return the text that a document element points to.

    Document AI identifies form fields by their offsets in the document
    text. This function converts those offsets to text snippets.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``, where each
            segment has ``start_index`` and ``end_index`` attributes.
        document: Object whose ``text`` attribute holds the full document text.

    Returns:
        The concatenation of every referenced slice of ``document.text``, or
        an empty string when the element has no text segments.
    """
    full_text = document.text
    # An element that spans several lines is stored as several segments, so
    # the slices are joined in the order they are listed.
    return "".join(
        full_text[int(segment.start_index):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )

In [None]:
# Display Form Data 
# Form data is only detected by some processors such as those that use the General Form parser.
document_pages = document.pages
print("Form data detected:\n")
# Walk every page and print each form field as "<name><value>", followed by
# the confidence scores reported for the name and for the value.
for page in document_pages:
    print("Page Number:{}".format(page.page_number))
    for form_field in page.form_fields:
        name_layout = form_field.field_name
        value_layout = form_field.field_value
        field_name = get_text(name_layout, document)
        name_confidence = round(name_layout.confidence, 4)
        field_value = get_text(value_layout, document)
        value_confidence = round(value_layout.confidence, 4)
        scores = (
            "  (Confidence Scores: (Name) " + str(name_confidence)
            + ", (Value) " + str(value_confidence) + ")\n"
        )
        print(field_name + field_value + scores)


In [None]:
# Display Entity Data
# Entity data is only detected by specialized parsers, such as the Procurement Expense parser.
# For each entity print the key/value pair and their corresponding confidence scores.
# Checking for the attribute name with dir() only reflects the type of
# `document`, not whether any entities were returned, so test the contents.
entities = getattr(document, "entities", None)
if entities:
    table = PrettyTable(['Type', 'Value', 'Confidence'])
    for entity in entities:
        entity_type = entity.type_
        value = entity.mention_text
        confidence = round(entity.confidence, 4)
        table.add_row([entity_type, value, confidence])
    print(table)
else:
    # Reached when the processor returned no entities.
    print("Document does not contain entity data.")