In [None]:
from google.cloud import storage

def download_pdf_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Download one object from a Cloud Storage bucket to a local file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Name of the object (blob) inside the bucket.
        destination_file_name: Local path the object's contents are written to.
    """
    # Client -> bucket -> blob reference, built in a single chain.
    gcs_object = storage.Client().bucket(bucket_name).blob(source_blob_name)

    # Write the object's contents to the local destination path.
    gcs_object.download_to_filename(destination_file_name)

    print(f"Downloaded '{source_blob_name}' to '{destination_file_name}'.")

# Example usage: copy the sample PDF from the lab bucket into the working directory.
bucket_name = "qwiklabs-gcp-00-67ea0aa744cc-cepf-documentai"  # Change Bucket Name Here
# The local copy keeps the same file name as the object in the bucket.
source_blob_name = destination_file_name = "sample-online-ocr.pdf"

download_pdf_from_gcs(
    bucket_name=bucket_name,
    source_blob_name=source_blob_name,
    destination_file_name=destination_file_name,
)


Downloaded 'sample-online-ocr.pdf' to 'sample-online-ocr.pdf'.


In [3]:
pip install google-cloud-documentai

Collecting google-cloud-documentai
  Downloading google_cloud_documentai-3.7.0-py3-none-any.whl.metadata (9.8 kB)
Downloading google_cloud_documentai-3.7.0-py3-none-any.whl (303 kB)
Installing collected packages: google-cloud-documentai
Successfully installed google-cloud-documentai-3.7.0
Note: you may need to restart the kernel to use updated packages.


In [None]:

from google.cloud import documentai_v1 as documentai

# Processor-management client on the default Document AI endpoint.
client = documentai.DocumentProcessorServiceClient()

# Processors are created under a "projects/<project>/locations/<location>" parent.
parent = client.common_location_path("qwiklabs-gcp-00-67ea0aa744cc", "us")  #Change Project Id Here

# Describe the processor to create, then wrap it in the create request.
ocr_processor_spec = documentai.Processor(
    type_="OCR_PROCESSOR",
    display_name="cepf-ocr",
)
request = documentai.CreateProcessorRequest(processor=ocr_processor_spec, parent=parent)

processor = client.create_processor(request=request)

# The resource name printed here ends with the processor ID needed by later cells.
print("Created OCR processor:", processor.name, sep="\n")


Created OCR processor:
projects/688996142043/locations/us/processors/6615d38699ea6513


In [None]:

from typing import Optional

from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore




def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
) -> None:
    """Send page 1 of a local file to a Document AI processor and print its text.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; also used to build the API endpoint host.
        processor_id: ID of the processor to call.
        file_path: Local path of the document to process.
        mime_type: MIME type of the file, e.g. "application/pdf".
        field_mask: Optional field mask, forwarded unchanged to the request.
        processor_version_id: Optional processor version to target; when it is
            falsy the processor's own resource name is used.
    """
    # The endpoint host is derived from the location.
    # You must set the `api_endpoint` if you use a location other than "us".
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Address either one specific processor version or the processor itself.
    name = (
        client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
        if processor_version_id
        else client.processor_path(project_id, location, processor_id)
    )

    # Pull the whole file into memory as raw bytes.
    with open(file_path, "rb") as source_file:
        payload = source_file.read()

    # Only page 1 is selected for processing.
    first_page_only = documentai.ProcessOptions.IndividualPageSelector(pages=[1])

    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(content=payload, mime_type=mime_type),
        field_mask=field_mask,
        process_options=documentai.ProcessOptions(
            individual_page_selector=first_page_only
        ),
    )

    document = client.process_document(request=request).document

    # Show the text recognised by the processor.
    print("The document contains the following text:")
    print(document.text)


# Run online OCR on the local sample PDF with the OCR processor.
process_document_sample(
    file_path="sample-online-ocr.pdf",
    mime_type="application/pdf",
    location="us",
    project_id="qwiklabs-gcp-00-67ea0aa744cc",  # Change Project Id
    # Change Processor Id. You will get this from output of above cell
    processor_id="6615d38699ea6513",
)

The document contains the following text:
CHAPTER I
IN WHICH We Are Introduced to
Winnie-the-Pooh and Some
Bees, and the Stories Begin
HERE is Edward Bear, coming
downstairs now, bump, bump, bump, on the back
of his head, behind Christopher Robin. It is, as far
as he knows, the only way of coming downstairs,
but sometimes he feels that there really is another
way, if only he could stop bumping for a moment
and think of it. And then he feels that perhaps there
isn't. Anyhow, here he is at the bottom, and ready
to be introduced to you. Winnie-the-Pooh.
When I first heard his name, I said, just as you
are going to say, “But I thought he was a boy?"
“So did I," said Christopher Robin.
"Then you can't call him Winnie?"
"I don't."
"But you said-
"
"He's Winnie-ther-Pooh. Don't you know what
'ther' means?"
I
Digitized by
Google



In [None]:
from google.cloud import documentai_v1 as documentai
from google.cloud import storage
from google.api_core.client_options import ClientOptions

def process_document_and_save_to_gcs(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    gcs_bucket: str,
    output_filename: str
):
    """OCR a local PDF with Document AI and upload the text to Cloud Storage.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; also used to build the API endpoint host.
        processor_id: ID of the processor to call.
        file_path: Local path of the PDF to process.
        gcs_bucket: Name of the bucket that receives the text file.
        output_filename: Object name of the text file inside the bucket.
    """
    # Document AI client bound to the location-specific endpoint.
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )
    processor_name = docai_client.processor_path(project_id, location, processor_id)

    # Load the PDF bytes from disk.
    with open(file_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # Build and send the OCR request; only the recognised text is kept.
    ocr_request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=documentai.RawDocument(
            content=pdf_bytes,
            mime_type="application/pdf",
        ),
    )
    extracted_text = docai_client.process_document(request=ocr_request).document.text

    # Look up the destination bucket and upload the text as a plain-text object.
    target_blob = storage.Client().get_bucket(gcs_bucket).blob(output_filename)
    target_blob.upload_from_string(extracted_text, content_type="text/plain")

    print(f"Saved OCR text to gs://{gcs_bucket}/{output_filename}")


# Run the function: OCR the local sample PDF and store the text in the lab bucket.
process_document_and_save_to_gcs(
    file_path="sample-online-ocr.pdf",
    output_filename="cepf_online_ocr.txt",
    location="us",
    project_id="qwiklabs-gcp-00-67ea0aa744cc",  # Change Project Id Here
    processor_id="6615d38699ea6513",  # Change Processor Id
    gcs_bucket="qwiklabs-gcp-00-67ea0aa744cc-cepf-documentai",  # Change GCS Bucket Id
)


Saved OCR text to gs://qwiklabs-gcp-00-67ea0aa744cc-cepf-documentai/cepf_online_ocr.txt


In [None]:
import json
from google.cloud import documentai_v1 as documentai
from google.cloud import storage

# Configuration for the batch OCR run.
PROJECT_ID = "qwiklabs-gcp-00-67ea0aa744cc"  # Change Project Id Here
LOCATION = "us"
PROCESSOR_ID = "6615d38699ea6513"  # Change Processor Id

# The input PDF and the batch output both live in the "<project>-cepf-documentai" bucket.
INPUT_URI = "gs://" + PROJECT_ID + "-cepf-documentai/sample-batch-ocr.pdf"
OUTPUT_URI = "gs://" + PROJECT_ID + "-cepf-documentai/"

# Object name used for the combined text file.
FINAL_OUTPUT_FILE = "cepf_batch_ocr.txt"

# ---------------------------
# Batch processing function
# ---------------------------
def batch_process_pdf(project_id, location, processor_id, input_uri, output_uri, timeout=600):
    """Run a Document AI batch job on one PDF in Cloud Storage and wait for it.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; also used to build the API endpoint host.
        processor_id: ID of the processor to call.
        input_uri: "gs://" URI of the PDF to process.
        output_uri: "gs://" URI prefix under which the results are written.
        timeout: Seconds to wait for the long-running operation to finish.
            Defaults to 600, the value that used to be hard-coded; raise it
            for large PDFs.
    """
    client = documentai.DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    name = client.processor_path(project_id, location, processor_id)

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=documentai.BatchDocumentsInputConfig(
            gcs_documents=documentai.GcsDocuments(
                documents=[documentai.GcsDocument(gcs_uri=input_uri, mime_type="application/pdf")]
            )
        ),
        document_output_config=documentai.DocumentOutputConfig(
            gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
                gcs_uri=output_uri
            )
        )
    )

    # Announce before the call so the message is shown even if submission fails.
    print("Submitting batch OCR request...")
    operation = client.batch_process_documents(request)

    # Wait for the long-running operation, up to `timeout` seconds.
    operation.result(timeout=timeout)
    print("Batch OCR processing complete!")
# ---------------------------
# Extract text from batch output
# ---------------------------
def _ocr_texts_from_json(data):
    """Return the OCR text string(s) contained in one parsed batch-output JSON file."""
    # Skip JSON files that are not objects (e.g. unrelated files under the prefix).
    if not isinstance(data, dict):
        return []

    texts = []
    # Document AI writes each batch result file as a serialized Document whose
    # OCR text sits in the top-level "text" field.
    if isinstance(data.get("text"), str):
        texts.append(data["text"])
    # Still accept a wrapper object with a "documents" list, the only shape the
    # previous implementation read.
    for doc in data.get("documents") or []:
        texts.append(doc.get("text", ""))
    return texts


def extract_output_and_save(output_gcs_path, final_output_filename):
    """Combine the text of all batch OCR output files and upload it as one object.

    Every ".json" object under ``output_gcs_path`` is downloaded and its OCR
    text is appended, followed by a newline, to the combined result.

    Args:
        output_gcs_path: "gs://<bucket>/<prefix>" location holding the batch output.
        final_output_filename: Object name for the combined text file; it is
            written to the same bucket as the batch output.
    """
    storage_client = storage.Client()

    # "gs://bucket/a/b" splits into ["gs:", "", "bucket", "a", "b"].
    path_parts = output_gcs_path.split("/")
    bucket_name = path_parts[2]
    prefix = "/".join(path_parts[3:])

    bucket = storage_client.bucket(bucket_name)
    json_files = [b for b in bucket.list_blobs(prefix=prefix) if b.name.endswith(".json")]

    texts = []
    for blob in json_files:
        data = json.loads(blob.download_as_text())
        texts.extend(_ocr_texts_from_json(data))

    # Each piece of text is terminated by a newline, as before.
    full_text = "".join(text + "\n" for text in texts)

    # Save combined text back to Cloud Storage
    output_blob = bucket.blob(final_output_filename)
    output_blob.upload_from_string(full_text, content_type="text/plain")
    print(f"Saved final OCR text to gs://{bucket_name}/{final_output_filename}")

# Run the workflow: submit the batch job, then merge its output into one text file.
batch_process_pdf(
    project_id=PROJECT_ID,
    location=LOCATION,
    processor_id=PROCESSOR_ID,
    input_uri=INPUT_URI,
    output_uri=OUTPUT_URI,
)
extract_output_and_save(
    output_gcs_path=OUTPUT_URI,
    final_output_filename=FINAL_OUTPUT_FILE,
)

Submitting batch OCR request...
Batch OCR processing complete!
Saved final OCR text to gs://qwiklabs-gcp-00-67ea0aa744cc-cepf-documentai/cepf_batch_ocr.txt


In [None]:
from google.cloud import documentai_v1 as documentai

# Settings for the processor to create.
project_id = "qwiklabs-gcp-00-67ea0aa744cc"  # Change Project Id Here
location = "us"
processor_type = "FORM_PARSER_PROCESSOR"
processor_display_name = "cepf-form-parser"

client = documentai.DocumentProcessorServiceClient()

# Parent resource under which the processor is created.
parent = "projects/{}/locations/{}".format(project_id, location)

processor = documentai.Processor(
    display_name=processor_display_name,
    type_=processor_type,
)

created_processor = client.create_processor(
    request=documentai.CreateProcessorRequest(parent=parent, processor=processor)
)

# The processor ID is the last path component of the resource name.
processor_resource_name = created_processor.name
processor_short_id = processor_resource_name.rsplit("/", 1)[-1]

print(
    "Created processor:",
    f"Name: {processor_resource_name}",
    f"Type: {created_processor.type_}",
    f"ID: {processor_short_id}",
    sep="\n",
)


Created processor:
Name: projects/688996142043/locations/us/processors/651a4f839a410be1
Type: FORM_PARSER_PROCESSOR
ID: 651a4f839a410be1


In [None]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud import storage
import pandas as pd

# Config
PROJECT_ID = "qwiklabs-gcp-00-67ea0aa744cc"  # Change Project Id Here
LOCATION = "us"
PROCESSOR_ID = "651a4f839a410be1"  # Change Processor Id Here - Find Output of Above Cell
INPUT_URI = f"gs://{PROJECT_ID}-cepf-documentai/sample-intake-form.pdf"
OUTPUT_BUCKET = f"{PROJECT_ID}-cepf-documentai"
OUTPUT_FILENAME = "cepf_form_parser.csv"

# Clients: Document AI on the location-specific endpoint, plus Cloud Storage.
opts = ClientOptions(api_endpoint=f"{LOCATION}-documentai.googleapis.com")
docai_client = documentai.DocumentProcessorServiceClient(client_options=opts)
storage_client = storage.Client(project=PROJECT_ID)

# Load the PDF from GCS: split "gs://<bucket>/<object>" into its two parts.
bucket_name, blob_name = INPUT_URI.replace("gs://", "").split("/", 1)
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(blob_name)
pdf_bytes = blob.download_as_bytes()

# Process the document with the form parser.
name = docai_client.processor_path(PROJECT_ID, LOCATION, PROCESSOR_ID)
raw_document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
request = documentai.ProcessRequest(name=name, raw_document=raw_document)
response = docai_client.process_document(request=request)

document = response.document
full_text = document.text  # complete text of the document

# Extract key/value pairs. Each form field's name and value point into
# full_text through text-anchor segments; the referenced slices are joined
# and stripped, producing one [key, value] row per field.
data = [
    [
        "".join(
            full_text[int(segment.start_index):int(segment.end_index)]
            for segment in anchor.text_segments
        ).strip()
        for anchor in (field.field_name.text_anchor, field.field_value.text_anchor)
    ]
    for page in document.pages
    for field in page.form_fields
]

df = pd.DataFrame(data, columns=["key", "value"])

# Save to GCS as a CSV object.
output_bucket = storage_client.bucket(OUTPUT_BUCKET)
output_blob = output_bucket.blob(OUTPUT_FILENAME)
csv_payload = df.to_csv(index=False)
output_blob.upload_from_string(csv_payload, content_type="text/csv")

print(f"CSV saved to gs://{OUTPUT_BUCKET}/{OUTPUT_FILENAME}")


CSV saved to gs://qwiklabs-gcp-00-67ea0aa744cc-cepf-documentai/cepf_form_parser.csv


In [None]:
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud import storage
import pandas as pd

# ---------------------------
# Config
# ---------------------------
PROJECT_ID = "qwiklabs-gcp-00-67ea0aa744cc" # Change Project Id Here
LOCATION = "us"
PROCESSOR_ID = "651a4f839a410be1"           # Change Processor Id Here 
INPUT_URI = f"gs://{PROJECT_ID}-cepf-documentai/sample-form-with-table.pdf"
OUTPUT_BUCKET = f"{PROJECT_ID}-cepf-documentai"
OUTPUT_FILENAME = "sample-form-with-table-tb0.csv"


def _layout_text(layout, text):
    """Return the stripped text a layout refers to, joining all of its segments.

    A layout with no text segments (for example an empty table cell) yields ""
    instead of raising IndexError, and text spread over several segments is
    kept whole rather than cut off after the first segment.
    """
    return "".join(
        text[int(segment.start_index):int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    ).strip()


# ---------------------------
# Initialize clients
# ---------------------------
opts = ClientOptions(api_endpoint=f"{LOCATION}-documentai.googleapis.com")
docai_client = documentai.DocumentProcessorServiceClient(client_options=opts)
storage_client = storage.Client(project=PROJECT_ID)

# ---------------------------
# Load PDF from GCS
# ---------------------------
# Split "gs://<bucket>/<object>" into bucket name and object name.
bucket_name, blob_name = INPUT_URI.replace("gs://", "").split("/", 1)
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(blob_name)
pdf_bytes = blob.download_as_bytes()

# ---------------------------
# Process document
# ---------------------------
name = docai_client.processor_path(PROJECT_ID, LOCATION, PROCESSOR_ID)
raw_document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
request = documentai.ProcessRequest(name=name, raw_document=raw_document)
response = docai_client.process_document(request=request)

document = response.document
full_text = document.text

# ---------------------------
# Extract first table (tb0)
# ---------------------------
if not document.pages or not document.pages[0].tables:
    raise ValueError("No tables found in the document")

table = document.pages[0].tables[0]  # tb0

# One list of cell strings per header row / body row.
headers = [
    [_layout_text(cell.layout, full_text) for cell in row.cells]
    for row in table.header_rows
]
body = [
    [_layout_text(cell.layout, full_text) for cell in row.cells]
    for row in table.body_rows
]

# Combine headers and body; only the first header row supplies column names.
df = pd.DataFrame(body, columns=headers[0] if headers else None)

# ---------------------------
# Save to GCS
# ---------------------------
output_bucket = storage_client.bucket(OUTPUT_BUCKET)
output_blob = output_bucket.blob(OUTPUT_FILENAME)
output_blob.upload_from_string(df.to_csv(index=False), content_type="text/csv")

print(f"Table CSV saved to gs://{OUTPUT_BUCKET}/{OUTPUT_FILENAME}")


Table CSV saved to gs://qwiklabs-gcp-00-67ea0aa744cc-cepf-documentai/sample-form-with-table-tb0.csv
