In [1]:
!pip install google-cloud-documentai



In [2]:
#import necessary files
import contextlib
import sys
from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from google.colab import auth

In [3]:
# Authenticate this Colab session with Google Cloud, using the `auth` helper
# imported from google.colab, before any Document AI request is made.
auth.authenticate_user()

In [4]:
# Document AI settings. These values are taken from the Google Cloud project
# that hosts the processor; adjust them before running the notebook.
project_id = "attendanceextractor"
location = "us"                                             # Format is "us" or "eu"
processor_id = "12bf41b9e4b98255"                           # Create processor before running sample
processor_version = "rc"                                    # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
file_path = "/content/Copy of MAHENDRAGARH_NEO_121023_4.jpeg"  # File that is read and sent to the processor
mime_type = "image/jpeg"                                    # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types

In [5]:
# Prints the tables and form fields of a processed file and returns the
# Document AI `Document` object (it is not a plain dict).
def process_document_form_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> "documentai.Document":
    """Process a file with Document AI and print its tables and form fields.

    For every page, the header of the first table is printed once and every
    body row of every table on that page is printed as ``header: value``
    lines. The page's form fields are printed after its tables.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location passed to ``process_document``.
        processor_id: ID of the Document AI processor.
        processor_version: Processor version ID or alias.
        file_path: Path of the file to process.
        mime_type: MIME type of the file.

    Returns:
        The document returned by ``process_document``. It is always returned,
        including when the document has no pages or no tables.
    """
    # Online processing request to Document AI
    document = process_document(
        project_id, location, processor_id, processor_version, file_path, mime_type
    )

    # Read the table and form fields output from the processor.
    # Cells and form fields only hold offsets into the full document text.
    text = document.text
    print(f"There are {len(document.pages)} page(s) in this document.")

    for page in document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        # Header text of the page's first table; it labels the rows of every
        # table on the page. Reset per page so headers never leak across pages.
        header_row_text = ""

        for table_index, table in enumerate(page.tables):
            # Print header rows only if it's the first table on the page
            if table_index == 0 and table.header_rows:
                print("Columns:")
                for cell in table.header_rows[0].cells:
                    cell_text = layout_to_text(cell.layout, text)
                    header_row_text += f"{repr(cell_text.strip())} | "
                print(header_row_text)

            # Print body rows
            for table_row in table.body_rows:
                row_text = ""
                for cell in table_row.cells:
                    cell_text = layout_to_text(cell.layout, text)
                    row_text += f"{repr(cell_text.strip())} | "

                if not header_row_text:
                    # No header is available to label the cells: print the raw row.
                    print(row_text)
                    continue

                # Pair each header cell with the row cell at the same position.
                # Both strings end with "| ", so the last pair is ('', '') and
                # prints as a ": " line that separates consecutive rows.
                formatted_output = {
                    header_cell.strip(): row_cell.strip()
                    for header_cell, row_cell in zip(header_row_text.split('|'), row_text.split('|'))
                }

                # Print formatted output with line breaks
                print("\n".join([f"{key}: {value}" for key, value in formatted_output.items()]))

        # Form fields belong to the page, so they are printed once per page
        # rather than once per table.
        for field in page.form_fields:
            name = layout_to_text(field.field_name, text)
            value = layout_to_text(field.field_value, text)
            print(f"    * {repr(name.strip())}: {repr(value.strip())}")

    # Return only after every page has been printed.
    return document

## Functions for processing extracted text

In [6]:
def print_table_rows(
    table_rows: Sequence[documentai.Document.Page.Table.TableRow], text: str
) -> None:
    """Print every table row on its own line.

    Each cell is resolved to text with ``layout_to_text``, stripped, shown via
    ``repr`` and followed by ``" | "``.

    Args:
        table_rows: Rows whose ``cells`` each carry a ``layout``.
        text: Full document text that the cell layouts index into.
    """
    for row in table_rows:
        rendered_cells = [
            f"{layout_to_text(cell.layout, text).strip()!r} | "
            for cell in row.cells
        ]
        print("".join(rendered_cells))

In [7]:
# Not called anywhere in this notebook; useful for inspecting the confidence
# score the processor reports for each extracted entity.
def print_entity(entity: documentai.Document.Entity) -> None:
    """Print an entity's type, text and confidence, plus its normalized value if any.

    For the fields each processor can detect, see:
    https://cloud.google.com/document-ai/docs/processors-list

    Args:
        entity: Object exposing ``type_``, ``text_anchor.content``,
            ``confidence`` and ``normalized_value.text``.
    """
    entity_type = entity.type_
    mention = entity.text_anchor.content
    score = entity.confidence
    # Other value formats besides text are available as well,
    # e.g. dates: `entity.normalized_value.date_value.year`
    normalized = entity.normalized_value.text

    lines = [f"    * {entity_type!r}: {mention!r}({score:.1%} confident)"]
    if normalized:
        lines.append(f"    * Normalized Value: {normalized!r}")

    for line in lines:
        print(line)

In [8]:
def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    """Send a local file to a Document AI processor version and return the result.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location; also used to build the API endpoint.
        processor_id: ID of the processor (it must be created beforehand).
        processor_version: Processor version ID or alias.
        file_path: Path of the file, read in binary mode.
        mime_type: MIME type of the file.
        process_options: Optional processing options forwarded with the request.

    Returns:
        The ``document`` attribute of the processing response. For its
        attributes see
        https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    endpoint = f"{location}-documentai.googleapis.com"
    service = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    version_name = service.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Load the whole file into memory
    with open(file_path, "rb") as source_file:
        payload = source_file.read()

    raw_document = documentai.RawDocument(content=payload, mime_type=mime_type)
    request = documentai.ProcessRequest(
        name=version_name,
        raw_document=raw_document,
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    response = service.process_document(request=request)
    return response.document

In [9]:
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Return the part of the document text that a layout refers to.

    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document's text. This function converts
    those offsets to a string.
    """
    # Text spanning several lines is stored as several segments, so slice
    # each segment out of the full text and concatenate them in order.
    pieces = []
    for segment in layout.text_anchor.text_segments:
        begin = int(segment.start_index)
        finish = int(segment.end_index)
        pieces.append(text[begin:finish])
    return "".join(pieces)

In [59]:
# Run process_document_form_sample to see the printed output format.
# The returned value is kept in doc_output so later cells can convert it.
doc_output = process_document_form_sample(
                     project_id, location, processor_id, processor_version, file_path, mime_type,)

There are 1 page(s) in this document.


**** Page 1 ****
Columns:
'SESSION\nDATE' | 'SESSION\nLOCATION' | 'SESSION\nCONDUCTED\nBY NAME' | 'ANC' | 'CLASS' | 'CLASS TYPE\nSNCU' | 'NBSU' | 'PEOPLE\nTRAINED' | 'PHOTO\nSENT' | 
'SESSION\nDATE': '9/08/23'
'SESSION\nLOCATION': 'PNC ward'
'SESSION\nCONDUCTED\nBY NAME': 'Mlo- Sarmila'
'ANC': ''
'CLASS': ''
'CLASS TYPE\nSNCU': ''
'NBSU': ''
'PEOPLE\nTRAINED': '42'
'PHOTO\nSENT': 'L'
: 
'SESSION\nDATE': '11/8/23'
'SESSION\nLOCATION': 'PNC Ward'
'SESSION\nCONDUCTED\nBY NAME': 'No.\nReena\nsapna'
'ANC': 'COO'
'CLASS': '~'
'CLASS TYPE\nSNCU': ''
'NBSU': ''
'PEOPLE\nTRAINED': ''
'PHOTO\nSENT': ''
: 
'SESSION\nDATE': '12/8/23'
'SESSION\nLOCATION': 'PNC ward'
'SESSION\nCONDUCTED\nBY NAME': 'Mo-Reeng'
'ANC': ''
'CLASS': ''
'CLASS TYPE\nSNCU': ''
'NBSU': '00'
'PEOPLE\nTRAINED': '30'
'PHOTO\nSENT': ''
: 
'SESSION\nDATE': '14/8/2'
'SESSION\nLOCATION': 'PNC ward'
'SESSION\nCONDUCTED\nBY NAME': 'Mo- Lagwanti\nPinki, Reena'
'ANC': ''
'CLASS': ''
'CLASS TYPE\n

### Raw text file

In [None]:
# Store the original printed results in a text file.
#
# redirect_stdout sends print output to the file only inside the `with` block
# and then puts back whatever stream was active before, even if processing
# raises. Assigning sys.__stdout__ back by hand would not restore the stream
# the notebook was actually printing to.
# UTF-8 is set explicitly because the extracted text contains non-ASCII characters.
with open("text_output.txt", "w", encoding="utf-8") as f, contextlib.redirect_stdout(f):
    # Process the document; everything it prints goes to the file
    process_document_form_sample(
      project_id,
      location,
      processor_id,
      processor_version,
      file_path,
      mime_type,
    )

# Print a message indicating the file has been created
print("Output has been saved to text_output.txt")


In [20]:
# Convert doc_output (the processed document) to plain dictionaries and lists

def convert_document_to_dictionary(document):
    """Convert a processed document into nested dictionaries and lists.

    Args:
        document: Object exposing ``text`` and ``pages``. Each page exposes
            ``page_number``, ``tables`` (with ``header_rows`` / ``body_rows``)
            and ``form_fields`` (with ``field_name`` / ``field_value``).

    Returns:
        ``{"pages": [...]}`` with one dict per page holding ``page_number``,
        ``tables`` (each ``{"columns": [...], "rows": [[...], ...]}``) and
        ``form_fields`` (a list of single-item ``{name: value}`` dicts).
        All texts are stripped. A table without a header row gets an empty
        ``columns`` list.
    """
    full_text = document.text

    def cell_texts(cells):
        # Resolve every cell's layout to its stripped text.
        return [layout_to_text(cell.layout, full_text).strip() for cell in cells]

    converted_dict = {"pages": []}

    for page in document.pages:
        page_dict = {
            "page_number": page.page_number,
            "tables": [],
            "form_fields": [],
        }

        for table in page.tables:
            # A table can have no header row at all; indexing header_rows[0]
            # unconditionally would raise IndexError for such a table.
            header_cells = table.header_rows[0].cells if table.header_rows else []
            page_dict["tables"].append(
                {
                    "columns": cell_texts(header_cells),
                    "rows": [cell_texts(row.cells) for row in table.body_rows],
                }
            )

        for field in page.form_fields:
            name = layout_to_text(field.field_name, full_text).strip()
            value = layout_to_text(field.field_value, full_text).strip()
            page_dict["form_fields"].append({name: value})

        converted_dict["pages"].append(page_dict)

    return converted_dict


In [22]:
# converted_output is doc_output converted to plain dictionaries and lists
converted_output = convert_document_to_dictionary(doc_output)

print(converted_output)

{'pages': [{'page_number': 1, 'tables': [{'columns': ['SESSION\nDATE', 'SESSION\nLOCATION', 'SESSION\nCONDUCTED\nBY NAME', 'ANC', 'CLASS', 'CLASS TYPE\nSNCU', 'NBSU', 'PEOPLE\nTRAINED', 'PHOTO\nSENT'], 'rows': [['9/08/23', 'PNC ward', 'Mlo- Sarmila', '', '', '', '', '42', 'L'], ['11/8/23', 'PNC Ward', 'No.\nReena\nsapna', 'COO', '~', '', '', '', ''], ['12/8/23', 'PNC ward', 'Mo-Reeng', '', '', '', '00', '30', ''], ['14/8/2', 'PNC ward', 'Mo- Lagwanti\nPinki, Reena', '', '', '', '', '48', ''], ['16/8/23', 'PMCward', 'Sharmib,\nلسلف', '', '', '', '', '255er\n46', ''], ['21/8/23', 'PNC ward', 'Sharmila\nReang...', '', 'L', '', '', '45', ''], ['22/8/23', 'PNC ward', 'Sujata.', '', '', '', '', '42', ''], ['23/8/23', 'ANG', 'Safana.', '', '', '', '', '40', ''], ['24/8/23', 'New', 'vlo Kavita\nReena', '', '', '', '', '40', '']]}, {'columns': ['25/8/20', 'P.NC', 'Roly wants\n180 Kavit', '45', ''], 'rows': [['26/8/23', 'P.N.C.', 'Mo Lawanti\nAnita.', '42', ''], ['30/8/23', 'PMC', 'Laranti\nAle\

In [47]:
# Merge all tables of each page into one table, keeping every extracted column

def json_structure(dictionary):
    """Collapse the tables of each page into that page's first table.

    The first table supplies the ``columns``. For every later table, its
    ``columns`` list is appended as an ordinary row, followed by its rows.
    ``form_fields`` are carried over unchanged.

    Args:
        dictionary: ``{"pages": [...]}`` where each page has ``page_number``,
            ``tables`` (each with ``columns`` and ``rows``) and ``form_fields``.

    Returns:
        A new ``{"pages": [...]}`` dict in which each page has at most one table.
    """
    merged_pages = []

    for page in dictionary['pages']:
        page_number = page['page_number']
        merged_tables = []

        for position, table in enumerate(page['tables']):
            if position:
                # Later tables: their "header" is really just another data row.
                merged_tables[0]['rows'].append(table['columns'])
            else:
                merged_tables.append({'columns': table['columns'], 'rows': []})

            merged_tables[0]['rows'].extend(table['rows'])

        merged_pages.append(
            {
                'page_number': page_number,
                'tables': merged_tables,
                'form_fields': page['form_fields'],
            }
        )

    return {"pages": merged_pages}

In [61]:
# Merge all tables of each page of converted_output into a single table per page
structured_json = json_structure(converted_output)
print(structured_json)

{'pages': [{'page_number': 1, 'tables': [{'columns': ['SESSION\nDATE', 'SESSION\nLOCATION', 'SESSION\nCONDUCTED\nBY NAME', 'ANC', 'CLASS', 'CLASS TYPE\nSNCU', 'NBSU', 'PEOPLE\nTRAINED', 'PHOTO\nSENT'], 'rows': [['9/08/23', 'PNC ward', 'Mlo- Sarmila', '', '', '', '', '42', 'L'], ['11/8/23', 'PNC Ward', 'No.\nReena\nsapna', 'COO', '~', '', '', '', ''], ['12/8/23', 'PNC ward', 'Mo-Reeng', '', '', '', '00', '30', ''], ['14/8/2', 'PNC ward', 'Mo- Lagwanti\nPinki, Reena', '', '', '', '', '48', ''], ['16/8/23', 'PMCward', 'Sharmib,\nلسلف', '', '', '', '', '255er\n46', ''], ['21/8/23', 'PNC ward', 'Sharmila\nReang...', '', 'L', '', '', '45', ''], ['22/8/23', 'PNC ward', 'Sujata.', '', '', '', '', '42', ''], ['23/8/23', 'ANG', 'Safana.', '', '', '', '', '40', ''], ['24/8/23', 'New', 'vlo Kavita\nReena', '', '', '', '', '40', ''], ['25/8/20', 'P.NC', 'Roly wants\n180 Kavit', '45', ''], ['26/8/23', 'P.N.C.', 'Mo Lawanti\nAnita.', '42', ''], ['30/8/23', 'PMC', 'Laranti\nAle\nPoonam', '327\nчо', ''

In [58]:
# Restructure structured_json to keep only the required columns

def restructure_json(original_json):
    """Reduce every table row to date, location, conductor name and people trained.

    Args:
        original_json: ``{"pages": [...]}`` where each page has
            ``page_number``, ``tables`` (each with ``columns`` and ``rows``)
            and ``form_fields`` (a list of dicts, possibly empty).

    Returns:
        A new ``{"pages": [...]}`` dict. Each table keeps its first three
        column names plus ``'PeopleTrained'``, and each row becomes
        ``[session_date, session_location, conducted_by_name, people_trained]``.
        ``form_fields`` holds at most one single-item dict taken from the
        page's first form field; it is ``[]`` when the page has none.
    """
    restructured_data = {'pages': []}

    for page in original_json['pages']:
        restructured_page = {
            'page_number': page['page_number'],
            'tables': [],
            'form_fields': [],
        }

        for table in page['tables']:
            restructured_table = {'columns': table['columns'][:3] + ['PeopleTrained'], 'rows': []}

            for row in table['rows']:
                # Rows can be shorter than three cells; pad with empty strings
                # instead of failing on unpacking.
                leading = list(row[:3])
                leading += [''] * (3 - len(leading))
                rest = row[3:]

                # Heuristic: the first non-empty cell after the first three
                # columns is taken as the people-trained value (None if all
                # remaining cells are empty). Row widths vary between the
                # merged tables, so a fixed column index is not used.
                people_trained = next((value for value in rest if value), None)

                restructured_table['rows'].append(leading + [people_trained])

            restructured_page['tables'].append(restructured_table)

        # Only the first form field is kept. A page without form fields keeps
        # the empty list instead of raising IndexError on form_fields[0].
        form_fields = page['form_fields']
        if form_fields:
            for form_field, value in form_fields[0].items():
                restructured_page['form_fields'] = [{form_field: value}]

        restructured_data['pages'].append(restructured_page)

    return restructured_data

# Example usage:
# restructure the merged per-page tables held in `structured_json`
restructured_json = restructure_json(structured_json)

# Print the restructured data
print(restructured_json)


{'pages': [{'page_number': 1, 'tables': [{'columns': ['SESSION\nDATE', 'SESSION\nLOCATION', 'SESSION\nCONDUCTED\nBY NAME', 'PeopleTrained'], 'rows': [['9/08/23', 'PNC ward', 'Mlo- Sarmila', '42'], ['11/8/23', 'PNC Ward', 'No.\nReena\nsapna', 'COO'], ['12/8/23', 'PNC ward', 'Mo-Reeng', '00'], ['14/8/2', 'PNC ward', 'Mo- Lagwanti\nPinki, Reena', '48'], ['16/8/23', 'PMCward', 'Sharmib,\nلسلف', '255er\n46'], ['21/8/23', 'PNC ward', 'Sharmila\nReang...', 'L'], ['22/8/23', 'PNC ward', 'Sujata.', '42'], ['23/8/23', 'ANG', 'Safana.', '40'], ['24/8/23', 'New', 'vlo Kavita\nReena', '40'], ['25/8/20', 'P.NC', 'Roly wants\n180 Kavit', '45'], ['26/8/23', 'P.N.C.', 'Mo Lawanti\nAnita.', '42'], ['30/8/23', 'PMC', 'Laranti\nAle\nPoonam', '327\nчо'], ['31/8/27', 'PNC', 'N/- SujataKavita', '43'], ['1/09/23', 'PNC', 'Nh-Shamile', None], ['', '', 'Lajwanti\nPoonam', '383\n38'], ['02/09/27', 'PMC', 'Na Nirmala\n·Poo ham', '46'], ['04/9/23', 'Pnc', 'Sharmil\nNo.\nfornen.', '42'], ['4/9/23', 'PNC', 'W/o Reen

In [64]:
# Flatten restructured_json into a list with one dictionary per table row
def structured_dictionary(final_json):
    """Turn every four-cell table row into a dictionary with fixed keys.

    Args:
        final_json: ``{"pages": [...]}`` where each page has ``tables`` and
            each table has ``rows`` of exactly four values.

    Returns:
        A list of dicts with the keys ``SessionDate``, ``SessionLocation``,
        ``ConductedByName`` and ``PeopleTrained``, in page, table and row order.

    Raises:
        ValueError: If a row does not contain exactly four values.
    """
    return [
        {
            'SessionDate': session_date,
            'SessionLocation': session_location,
            'ConductedByName': conducted_by_name,
            'PeopleTrained': people_trained,
        }
        for page in final_json['pages']
        for table in page['tables']
        for session_date, session_location, conducted_by_name, people_trained in table['rows']
    ]

# Use restructured_json to build the final result.
# Note: despite its name, `Dictionary` is a list with one dictionary per table row.
Dictionary = structured_dictionary(restructured_json)

print(Dictionary)


[{'SessionDate': '9/08/23', 'SessionLocation': 'PNC ward', 'ConductedByName': 'Mlo- Sarmila', 'PeopleTrained': '42'}, {'SessionDate': '11/8/23', 'SessionLocation': 'PNC Ward', 'ConductedByName': 'No.\nReena\nsapna', 'PeopleTrained': 'COO'}, {'SessionDate': '12/8/23', 'SessionLocation': 'PNC ward', 'ConductedByName': 'Mo-Reeng', 'PeopleTrained': '00'}, {'SessionDate': '14/8/2', 'SessionLocation': 'PNC ward', 'ConductedByName': 'Mo- Lagwanti\nPinki, Reena', 'PeopleTrained': '48'}, {'SessionDate': '16/8/23', 'SessionLocation': 'PMCward', 'ConductedByName': 'Sharmib,\nلسلف', 'PeopleTrained': '255er\n46'}, {'SessionDate': '21/8/23', 'SessionLocation': 'PNC ward', 'ConductedByName': 'Sharmila\nReang...', 'PeopleTrained': 'L'}, {'SessionDate': '22/8/23', 'SessionLocation': 'PNC ward', 'ConductedByName': 'Sujata.', 'PeopleTrained': '42'}, {'SessionDate': '23/8/23', 'SessionLocation': 'ANG', 'ConductedByName': 'Safana.', 'PeopleTrained': '40'}, {'SessionDate': '24/8/23', 'SessionLocation': 'New

In [51]:
import json
# Write JSON-serializable data to a file
def save_to_json(structured_data, filename='outputjson.json'):
    """Serialize ``structured_data`` as JSON (2-space indent) into ``filename``.

    Args:
        structured_data: Any value that ``json.dump`` can serialize.
        filename: Destination path; the file is overwritten if it exists.
    """
    with open(filename, 'w') as handle:
        json.dump(obj=structured_data, fp=handle, indent=2)

In [65]:
# Save the final list of row dictionaries (`Dictionary`) to outputjson.json,
# the default filename of save_to_json
save_to_json(Dictionary)