In [None]:
!pip install google-cloud-documentai

Collecting google-cloud-documentai
  Downloading google_cloud_documentai-2.21.1-py2.py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: google-cloud-documentai
Successfully installed google-cloud-documentai-2.21.1


In [None]:
# Import the required modules
import contextlib
import sys
from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from google.colab import auth

In [None]:
# Authenticate this Colab session with Google Cloud (via google.colab.auth).
# This runs before any of the Document AI calls made further down the notebook.
auth.authenticate_user()

In [None]:
# Document AI request settings. These values are taken from the Google Cloud
# project and processor this notebook targets; the cells below pass them to
# process_document_form_sample unchanged.
project_id = "attendanceextractor"
location = "us"                                             # Format is "us" or "eu"
processor_id = "12bf41b9e4b98255"                           # Create processor before running sample
processor_version = "rc"                                    # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
file_path = "/content/MAHENDRAGARH_NEO_121023_4.jpeg"       # Local file that is read and sent for processing
mime_type = "image/jpeg"                                    # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types

In [None]:
def process_document_form_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> None:
    """Process a file with Document AI and print its tables and form fields.

    The file is sent through ``process_document``; for every page of the
    returned document this prints each table (one ``header: value`` line per
    cell, rows separated by a blank line) followed by the page's form fields.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location, e.g. "us" or "eu".
        processor_id: ID of the Document AI processor.
        processor_version: Processor version ID or alias.
        file_path: Path of the local file to process.
        mime_type: MIME type of the file.

    Returns:
        None. All results are written to standard output.
    """
    # Online processing request to Document AI
    document = process_document(
        project_id, location, processor_id, processor_version, file_path, mime_type
    )

    # Layouts reference their text as offsets into this full-document string.
    text = document.text
    print(f"There are {len(document.pages)} page(s) in this document.")

    for page in document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        print(f"\nFound {len(page.tables)} table(s):")
        # Header labels of the page's first table; later tables on the same
        # page reuse them as column names.
        header_labels = []
        for table_index, table in enumerate(page.tables):
            # A table may come back without header rows; fall back to the
            # first body row for the column count instead of raising.
            if table.header_rows:
                num_columns = len(table.header_rows[0].cells)
            elif table.body_rows:
                num_columns = len(table.body_rows[0].cells)
            else:
                num_columns = 0
            num_rows = len(table.body_rows)
            print(f"Table with {num_columns} columns and {num_rows} rows:")

            # Print header rows only if it's the first table on the page
            if table_index == 0:
                print("Columns:")
                header_cells = table.header_rows[0].cells if table.header_rows else []
                header_labels = [
                    repr(layout_to_text(cell.layout, text).strip())
                    for cell in header_cells
                ]
                print("".join(f"{label} | " for label in header_labels))

            # Print body rows
            print("Table body data:")
            for table_row in table.body_rows:
                row_values = [
                    repr(layout_to_text(cell.layout, text).strip())
                    for cell in table_row.cells
                ]
                # Pair cells with headers by position. Keeping them as lists
                # (rather than splitting a "|"-joined string) means a "|" in
                # the recognized text cannot shift columns, and duplicate
                # header labels are not merged. Cells beyond the known headers
                # get an empty label so their text is still printed.
                labels = header_labels + ["''"] * (len(row_values) - len(header_labels))
                for label, value in zip(labels, row_values):
                    print(f"{label}: {value}")
                # Blank line between rows so records stay visually separated.
                print()

        # Form fields belong to the page, not to an individual table, so they
        # are printed once per page even when the page has no tables.
        for field in page.form_fields:
            name = layout_to_text(field.field_name, text)
            value = layout_to_text(field.field_value, text)
            print(f"    * {repr(name.strip())}: {repr(value.strip())}")

In [None]:
def print_table_rows(
    table_rows: Sequence[documentai.Document.Page.Table.TableRow], text: str
) -> None:
    """Print each table row on one line, every cell followed by " | ".

    Args:
        table_rows: Rows whose cells are printed.
        text: Full document text that the cell layouts index into.
    """
    for row in table_rows:
        rendered_cells = (
            f"{repr(layout_to_text(cell.layout, text).strip())} | "
            for cell in row.cells
        )
        print("".join(rendered_cells))

In [None]:
# This function is not called in the current code, but it can be used to print each entity's confidence score and see how the model performs.
def print_entity(entity: documentai.Document.Entity) -> None:
    """Print an entity's type, text and confidence, plus its normalized text if set.

    For the fields each processor can detect, see the processor documentation:
    https://cloud.google.com/document-ai/docs/processors-list
    """
    entity_type = entity.type_

    # Value formats other than text are also available,
    # e.g. dates: `entity.normalized_value.date_value.year`
    raw_text = entity.text_anchor.content
    score = entity.confidence
    normalized_text = entity.normalized_value.text

    print(f"    * {repr(entity_type)}: {repr(raw_text)}({score:.1%} confident)")

    # Nothing more to report when no normalized text is present.
    if not normalized_text:
        return
    print(f"    * Normalized Value: {repr(normalized_text)}")

In [None]:
def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    """Send a local file to a Document AI processor version and return the result.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location; also used to build the API endpoint.
        processor_id: ID of the processor (must already exist).
        processor_version: Processor version ID or alias.
        file_path: Path of the file whose bytes are uploaded.
        mime_type: MIME type of the file.
        process_options: Optional extra processing options, forwarded as-is.

    Returns:
        The ``document`` attribute of the service response.
    """
    # The endpoint is derived from the location; it must be set explicitly
    # whenever a location other than "us" is used.
    endpoint_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(client_options=endpoint_options)

    # Full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    resource_name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Load the whole file into memory as raw bytes.
    with open(file_path, "rb") as source_file:
        raw_bytes = source_file.read()

    raw_document = documentai.RawDocument(content=raw_bytes, mime_type=mime_type)

    # `process_options` is only supported for the Document OCR processor.
    request = documentai.ProcessRequest(
        name=resource_name,
        raw_document=raw_document,
        process_options=process_options,
    )

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    return client.process_document(request=request).document

In [None]:
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Return the text a layout refers to.

    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document's text. This function converts
    those offsets into a string.
    """
    # A text segment that spans several lines is stored as multiple
    # segments, so collect every slice and concatenate them in order.
    pieces = []
    for segment in layout.text_anchor.text_segments:
        begin = int(segment.start_index)
        end = int(segment.end_index)
        pieces.append(text[begin:end])
    return "".join(pieces)

In [None]:
# Run process_document_form_sample with the settings defined above
# to see the output format.
process_document_form_sample(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    processor_version=processor_version,
    file_path=file_path,
    mime_type=mime_type,
)

There are 1 page(s) in this document.


**** Page 1 ****

Found 2 table(s):
Table with 9 columns and 9 rows:
Columns:
'SESSION\nDATE' | 'SESSION\nLOCATION' | 'SESSION\nCONDUCTED\nBY NAME' | 'ANC' | 'CLASS' | 'CLASS TYPE\nSNCU' | 'NBSU' | 'PEOPLE\nTRAINED' | 'PHOTO\nSENT' | 
Table body data:
'SESSION\nDATE': '9/08/23'
'SESSION\nLOCATION': 'PNC ward'
'SESSION\nCONDUCTED\nBY NAME': 'Mlo- Sarmila'
'ANC': ''
'CLASS': ''
'CLASS TYPE\nSNCU': ''
'NBSU': ''
'PEOPLE\nTRAINED': '42'
'PHOTO\nSENT': 'L'
: 
'SESSION\nDATE': '11/8/23'
'SESSION\nLOCATION': 'PNC Ward'
'SESSION\nCONDUCTED\nBY NAME': 'No.\nReena\nsapna'
'ANC': 'COO'
'CLASS': '~'
'CLASS TYPE\nSNCU': ''
'NBSU': ''
'PEOPLE\nTRAINED': ''
'PHOTO\nSENT': ''
: 
'SESSION\nDATE': '12/8/23'
'SESSION\nLOCATION': 'PNC ward'
'SESSION\nCONDUCTED\nBY NAME': 'Mo-Reeng'
'ANC': ''
'CLASS': ''
'CLASS TYPE\nSNCU': ''
'NBSU': '00'
'PEOPLE\nTRAINED': '30'
'PHOTO\nSENT': ''
: 
'SESSION\nDATE': '14/8/2'
'SESSION\nLOCATION': 'PNC ward'
'SESSION\nCONDUCTED\nBY NA

In [None]:
# Store the results in a text file.

# redirect_stdout sends print output to the file only while the block runs and
# then puts back whatever stream was active before, even if processing raises.
# (Assigning sys.__stdout__ by hand would not restore the stream that was
# active before the redirect, so later prints would go to a different stream.)
with open("text_output.txt", "w", encoding="utf-8") as f, contextlib.redirect_stdout(f):
    # Process the document; everything it prints goes to the file.
    process_document_form_sample(
        project_id,
        location,
        processor_id,
        processor_version,
        file_path,
        mime_type,
    )

# Print a message indicating the file has been created
print("Output has been saved to text_output.txt")
