In [1]:
!pip install --upgrade google-cloud-documentai google-cloud-storage

Collecting google-cloud-documentai
  Downloading google_cloud_documentai-3.7.0-py3-none-any.whl.metadata (9.8 kB)
Downloading google_cloud_documentai-3.7.0-py3-none-any.whl (303 kB)
Installing collected packages: google-cloud-documentai
Successfully installed google-cloud-documentai-3.7.0


In [None]:
# DOCUMENT OCR

In [11]:
## ONLINE PROCESSING

In [2]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from google.cloud import storage

def online_process_document(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    Send one local file to a Document AI processor and return the parsed document.

    Args:
        project_id: Project component of the processor resource name.
        location: Location component of the resource name; also selects the
            regional API endpoint (``{location}-documentai.googleapis.com``).
        processor_id: Processor component of the resource name.
        file_path: Path of the local file whose bytes are sent inline.
        mime_type: MIME type declared for those bytes.

    Returns:
        The ``document`` attribute of the ``process_document`` response.
    """
    # The client is bound to the endpoint that matches `location`.
    endpoint = f"{location}-documentai.googleapis.com"
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Fully qualified processor name used as the request target.
    processor_name = docai_client.processor_path(project_id, location, processor_id)

    # The whole file is read into memory and shipped inside the request.
    with open(file_path, "rb") as source_file:
        payload = source_file.read()

    request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=documentai.RawDocument(content=payload, mime_type=mime_type),
    )

    response = docai_client.process_document(request=request)
    return response.document


def save_text_to_gcs(
    bucket_name: str, destination_blob_name: str, text_content: str
):
    """Write ``text_content`` to the object ``destination_blob_name`` in ``bucket_name``."""
    # Resolve client -> bucket -> blob in one chain, then push the string.
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_string(text_content)

    print(f"File {destination_blob_name} uploaded to {bucket_name}.")


# TODO(developer): Set these variables before running the sample.
# Processor to call.
project_id = "qwiklabs-gcp-03-25b8e7edf7ec"
location = "us"  # e.g., "us" or "eu"
processor_id = "b617543603c0ec67"
# Local input file and the MIME type declared for it.
file_path = "sample-online-ocr.pdf"
mime_type = "application/pdf"
# Bucket and object name that receive the extracted text.
bucket_name = "qwiklabs-gcp-03-25b8e7edf7ec-cepf-documentai"
destination_blob_name = "cepf_online_ocr.txt"


# Step 1: run the local file through the processor.
document = online_process_document(
    project_id, location, processor_id, file_path, mime_type
)

# Step 2: store the recognized text in Cloud Storage.
save_text_to_gcs(bucket_name, destination_blob_name, document.text)



File cepf_online_ocr.txt uploaded to qwiklabs-gcp-03-25b8e7edf7ec-cepf-documentai.


In [12]:
## BATCH PROCESSING

In [13]:
### Initiate batch processing

In [8]:
from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import GoogleAPICallError
from google.cloud import documentai

def batch_process_documents(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_input_uri: str,
    gcs_output_bucket: str,
    gcs_output_uri_prefix: str,
    timeout: int = 400,
) -> None:
    """
    Submit one Cloud Storage PDF for batch processing and block until it finishes.

    Args:
        project_id: Project component of the processor resource name.
        location: Location component of the resource name; also selects the
            regional API endpoint.
        processor_id: Processor component of the resource name.
        gcs_input_uri: URI of the input document (declared as application/pdf).
        gcs_output_bucket: First part of the output URI.
        gcs_output_uri_prefix: Second part of the output URI, joined to the
            bucket with a single "/".
        timeout: Value passed to ``operation.result(timeout=...)``.

    A ``GoogleAPICallError`` raised while submitting or waiting is reported
    with ``print`` and not re-raised.
    """
    endpoint = f"{location}-documentai.googleapis.com"
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Fully qualified processor name used as the request target.
    processor_name = docai_client.processor_path(project_id, location, processor_id)

    # Input side: a single GCS document wrapped in the batch input config.
    source_document = documentai.GcsDocument(
        gcs_uri=gcs_input_uri, mime_type="application/pdf"
    )
    input_config = documentai.BatchDocumentsInputConfig(
        gcs_documents=documentai.GcsDocuments(documents=[source_document])
    )

    # Output side: results are written under <bucket>/<prefix>.
    destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}"
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
            gcs_uri=destination_uri
        )
    )

    request = documentai.BatchProcessRequest(
        name=processor_name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    try:
        operation = docai_client.batch_process_documents(request)
        print("Waiting for the batch processing operation to complete...")
        # Blocks until the long-running operation resolves or `timeout` elapses.
        operation.result(timeout=timeout)
        print("Batch processing complete.")
    except GoogleAPICallError as e:
        print(f"An error occurred during batch processing: {e}")


# TODO(developer): Set these variables before running the sample.
# Processor to call.
project_id = "qwiklabs-gcp-03-25b8e7edf7ec"
location = "us"  # e.g., "us" or "eu"
processor_id = "b617543603c0ec67"
# Output location: bucket URI plus a folder-like prefix inside it.
gcs_output_bucket = "gs://qwiklabs-gcp-03-25b8e7edf7ec-cepf-documentai"
gcs_output_uri_prefix = "batch-output" # A folder in your output bucket
# Input document; this must be a URI to a file in GCS.
gcs_input_uri = "gs://qwiklabs-gcp-03-25b8e7edf7ec-cepf-documentai/sample-batch-ocr.pdf"

# Submit the batch job and wait for it (default timeout of the function applies).
batch_process_documents(
    gcs_input_uri=gcs_input_uri,
    gcs_output_bucket=gcs_output_bucket,
    gcs_output_uri_prefix=gcs_output_uri_prefix,
    project_id=project_id,
    location=location,
    processor_id=processor_id,
)

Waiting for the batch processing operation to complete...
Batch processing complete.


In [14]:
### Consolidate output and save

In [10]:
import json
from google.cloud import storage
from google.cloud import documentai

def get_and_save_batch_results(
    gcs_output_bucket: str,
    gcs_output_uri_prefix: str,
    destination_blob_name: str,
):
    """
    Merge the text of every Document AI batch-output JSON file found under a
    prefix and upload the result as one text object in the same bucket.

    Args:
        gcs_output_bucket: Bucket name; a leading "gs://" and any trailing
            "/" are stripped before use.
        gcs_output_uri_prefix: Object-name prefix under which output files
            are listed.
        destination_blob_name: Object name for the consolidated text.

    The upload always happens, even when no JSON file is found (an empty
    object is written and a warning is printed).
    """
    import re  # local import: only needed for the shard ordering below

    def natural_key(blob_name: str) -> list:
        """Sort key comparing digit runs numerically, so "-2" sorts before "-10"."""
        # re.split with a capturing group alternates text / digits / text ...,
        # so every odd position is a digit run.
        parts = re.split(r"([0-9]+)", blob_name)
        return [int(part) if idx % 2 else part for idx, part in enumerate(parts)]

    storage_client = storage.Client()

    # Strip the scheme only when it is an actual prefix, and drop trailing "/".
    bucket_name = gcs_output_bucket
    if bucket_name.startswith("gs://"):
        bucket_name = bucket_name[len("gs://"):]
    bucket_name = bucket_name.rstrip("/")
    bucket = storage_client.bucket(bucket_name)

    # Keep only objects whose name really ends in ".json", and order them
    # numerically. Presumably the numeric suffixes are shard indices; plain
    # lexicographic listing order would put shard 10 ahead of shard 2.
    json_blobs = sorted(
        (
            blob
            for blob in bucket.list_blobs(prefix=gcs_output_uri_prefix)
            if blob.name.endswith(".json")
        ),
        key=lambda blob: natural_key(blob.name),
    )

    full_text = []

    print("Parsing output files...")
    for blob in json_blobs:
        # download_as_bytes() replaces the deprecated download_as_string().
        json_bytes = blob.download_as_bytes()
        document = documentai.Document.from_json(
            json_bytes, ignore_unknown_fields=True
        )
        full_text.append(document.text)

    if not full_text:
        print(
            f"Warning: no .json output files found under prefix "
            f"{gcs_output_uri_prefix!r}; uploading an empty file."
        )

    # Concatenate all text and upload to the final destination
    print(f"Uploading consolidated text to {destination_blob_name}...")
    output_blob = bucket.blob(destination_blob_name)
    output_blob.upload_from_string("".join(full_text))

    print("Consolidation and upload complete.")


# TODO(developer): Set these variables before running the sample.
# Name of the object that will hold the consolidated text.
final_destination_blob_name = "cepf_batch_ocr.txt"
# Output location of the batch job; keep in sync with the batch processing cell.
gcs_output_bucket = "gs://qwiklabs-gcp-03-25b8e7edf7ec-cepf-documentai"
gcs_output_uri_prefix = "batch-output"

# Collect every output file under the prefix into one text object.
get_and_save_batch_results(
    gcs_output_bucket, gcs_output_uri_prefix, final_destination_blob_name
)


Parsing output files...
Uploading consolidated text to cepf_batch_ocr.txt...
Consolidation and upload complete.


In [15]:
# FORM PARSER

In [16]:
!pip install pandas



In [17]:
import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from google.cloud import storage
from typing import Sequence


def process_and_extract_form_data(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_uri: str,
    output_bucket: str,
    output_filename: str,
):
    """
    Process a PDF form stored in Cloud Storage with Document AI, collect the
    form fields of every page as key/value pairs, and upload them as a CSV.

    Args:
        project_id: Project component of the processor resource name.
        location: Location component of the resource name; also selects the
            regional API endpoint.
        processor_id: Processor component of the resource name.
        gcs_uri: URI of the input document (declared as application/pdf).
        output_bucket: Name of the bucket that receives the CSV.
        output_filename: Object name of the CSV inside ``output_bucket``.

    The CSV always has the columns ``key`` and ``value``, including when no
    form field was found.
    """
    # 1. Instantiate Document AI Client
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )

    # 2. Configure and make the processing request
    resource_name = docai_client.processor_path(project_id, location, processor_id)

    # Specify that the file is in GCS
    gcs_document = documentai.GcsDocument(
        gcs_uri=gcs_uri, mime_type="application/pdf"
    )

    request = documentai.ProcessRequest(
        name=resource_name, gcs_document=gcs_document
    )

    print("Making request to Document AI API...")
    result = docai_client.process_document(request=request)
    document = result.document
    print("Request complete. Parsing form fields...")

    # 3. Define a helper function to get text from the document
    def get_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str:
        """Return the text an anchor points at, joining all of its segments."""
        # An anchor can reference several segments that are not adjacent in
        # the document text. Slicing from the first start to the last end
        # would also pull in whatever lies between them, so each segment is
        # sliced on its own and the pieces are concatenated.
        return "".join(
            text[int(segment.start_index):int(segment.end_index)]
            for segment in text_anchor.text_segments
        ).strip()

    # 4. Extract key/value pairs from all pages
    form_data = [
        {
            "key": get_text(field.field_name.text_anchor, document.text),
            "value": get_text(field.field_value.text_anchor, document.text),
        }
        for page in document.pages
        for field in page.form_fields
    ]

    print(f"Found {len(form_data)} key-value pairs.")

    # 5. Use pandas to create a DataFrame
    # Explicit columns keep the CSV header present when form_data is empty.
    df = pd.DataFrame(form_data, columns=["key", "value"])
    print("DataFrame created:")
    print(df.head())

    # 6. Save the DataFrame to a CSV file in Cloud Storage
    storage_client = storage.Client()
    bucket = storage_client.bucket(output_bucket)
    blob = bucket.blob(output_filename)

    # Convert DataFrame to CSV string and upload
    csv_data = df.to_csv(index=False)
    blob.upload_from_string(csv_data, content_type="text/csv")

    print(f"Successfully saved output to gs://{output_bucket}/{output_filename}")


# --- Main Execution ---
if __name__ == "__main__":
    # TODO(developer): Set these variables before running the sample.
    # Processor to call.
    project_id = "qwiklabs-gcp-03-25b8e7edf7ec"
    location = "us"  # e.g., "us"
    processor_id = "33261d5ae6798457"
    # One bucket serves as both the input location and the CSV destination.
    gcs_bucket = "qwiklabs-gcp-03-25b8e7edf7ec-cepf-documentai"
    output_filename = "cepf_form_parser.csv"
    gcs_uri = f"gs://{gcs_bucket}/sample-intake-form.pdf"

    process_and_extract_form_data(
        gcs_uri=gcs_uri,
        output_bucket=gcs_bucket,
        output_filename=output_filename,
        project_id=project_id,
        location=location,
        processor_id=processor_id,
    )

Making request to Document AI API...
Request complete. Parsing form fields...
Found 17 key-value pairs.
DataFrame created:
                                                 key  \
0  Are you currently taking any medication? (If y...   
1                                          _Phone #:   
2                                               Zip:   
3                                              City:   
4                                             State:   

                                 value  
0  Vyvanse (25mg) daily for attention.  
1     walker@cmail.com\n_Phone #: (906  
2                                07082  
3                               Towaco  
4                                   NJ  
Successfully saved output to gs://qwiklabs-gcp-03-25b8e7edf7ec-cepf-documentai/cepf_form_parser.csv


In [18]:
## Extract table data

In [20]:
import csv
import io
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from google.cloud import storage
from typing import Sequence


def extract_and_save_table_data(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_uri: str,
    output_bucket_name: str,
):
    """
    Process a PDF stored in Cloud Storage with Document AI and upload the
    first table of each page as ``sample-form-with-table-tb0.csv``.

    Args:
        project_id: Project component of the processor resource name.
        location: Location component of the resource name; also selects the
            regional API endpoint.
        processor_id: Processor component of the resource name.
        gcs_uri: URI of the input document (declared as application/pdf).
        output_bucket_name: Name of the bucket that receives the CSV.

    Only tables with index 0 on their page are uploaded; other tables are
    announced on stdout but not saved.
    """
    # 1. Instantiate Document AI Client
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )

    # 2. Configure and make the processing request
    resource_name = docai_client.processor_path(project_id, location, processor_id)
    gcs_document = documentai.GcsDocument(
        gcs_uri=gcs_uri, mime_type="application/pdf"
    )
    request = documentai.ProcessRequest(
        name=resource_name, gcs_document=gcs_document
    )

    print("Making request to Document AI API...")
    result = docai_client.process_document(request=request)
    document = result.document
    print("Request complete. Parsing tables...")

    # 3. Define helper functions to get text from the document
    def get_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str:
        """Return the single-line text an anchor points at, joining all segments."""
        # Segments of one anchor need not be adjacent in the document text;
        # slicing first-start..last-end would include the text in between.
        joined = "".join(
            text[int(segment.start_index):int(segment.end_index)]
            for segment in text_anchor.text_segments
        )
        # Replace newlines and tabs with spaces, and strip leading/trailing whitespace
        return joined.replace("\n", " ").replace("\t", " ").strip()

    def rows_to_text(rows) -> list:
        """Convert table rows into lists of cell strings."""
        return [
            [get_text(cell.layout.text_anchor, document.text) for cell in row.cells]
            for row in rows
        ]

    # 4. Iterate through pages and tables to extract data
    storage_client = storage.Client()
    output_bucket = storage_client.bucket(output_bucket_name)

    tables_found = False
    for page_num, page in enumerate(document.pages):
        for table_num, table in enumerate(page.tables):
            tables_found = True
            print(f"Parsing table {table_num} on page {page_num + 1}...")

            # Per the prompt, the output filename is specific to the first table (tb0).
            # Tables that are not uploaded are skipped before any CSV is built.
            if table_num != 0:
                continue

            # 5. Create CSV content in memory (header rows first, then body rows)
            csv_output = io.StringIO()
            writer = csv.writer(csv_output)
            writer.writerows(rows_to_text(table.header_rows))
            writer.writerows(rows_to_text(table.body_rows))

            # 6. Upload the CSV to Cloud Storage
            # NOTE(review): the object name carries no page number, so the
            # first table of a later page overwrites the one from an earlier
            # page. Confirm whether multi-page inputs are expected.
            output_filename = f"sample-form-with-table-tb{table_num}.csv"
            blob = output_bucket.blob(output_filename)
            blob.upload_from_string(
                csv_output.getvalue(), content_type="text/csv"
            )
            print(
                f"Successfully saved table {table_num} to gs://{output_bucket_name}/{output_filename}"
            )

    # Report only when no page had a table (not just when page 1 had none).
    if not tables_found:
        print("No tables found in the document.")

# --- Main Execution ---
if __name__ == "__main__":
    # TODO(developer): Set these variables before running the sample.
    # Processor to call.
    # It's recommended to use a Form Parser processor for this task.
    processor_id = "33261d5ae6798457"
    project_id = "qwiklabs-gcp-03-25b8e7edf7ec"
    location = "us"  # e.g., "us"

    # The same bucket holds the input PDF and receives the CSV output.
    gcs_bucket_name = "qwiklabs-gcp-03-25b8e7edf7ec-cepf-documentai"
    gcs_uri = f"gs://{gcs_bucket_name}/sample-form-with-table.pdf"

    extract_and_save_table_data(
        gcs_uri=gcs_uri,
        output_bucket_name=gcs_bucket_name,
        project_id=project_id,
        location=location,
        processor_id=processor_id,
    )

Making request to Document AI API...
Request complete. Parsing tables...
Parsing table 0 on page 1...
Successfully saved table 0 to gs://qwiklabs-gcp-03-25b8e7edf7ec-cepf-documentai/sample-form-with-table-tb0.csv
