In [20]:
from typing import Optional,Sequence

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import RetryError
from google.cloud import documentai
from google.cloud import storage


import re
import uuid

In [2]:
from extractor import OnlineDocumentExtractor

In [3]:
# --- Document AI processor settings -----------------------------------------
project_id = "gsd-ai-mx-ulises"
processor_id = "d2621db9b6cb255b"
# Processor version to call (original note: optional for batch processing).
processor_version_id = "pretrained-foundation-model-v1.0-2023-08-22"
# Processor location; other supported locations such as 'eu' may be used.
location = "us"

# --- Input document ----------------------------------------------------------
mime_type = "application/pdf"
file_path = "AutoLabel/W2_XL_input_clean_2895.pdf"

In [None]:
# Online processing
# Build the extractor from the processor settings defined in the configuration
# cell (project, location, processor id and processor version).
online_extractor = OnlineDocumentExtractor(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    processor_version_id=processor_version_id
)
# Run the extractor on the configured file. `process_document` is implemented in
# the local `extractor` module — presumably a synchronous Document AI request;
# confirm there.
online_document = online_extractor.process_document(file_path, mime_type)
# NOTE(review): this prints the full `entities` collection of the result at once.
print("Online Processed Document:", online_document.entities)

In [69]:
def print_page_dimensions(dimension: documentai.Document.Page.Dimension) -> None:
    """Print the width and height of a page, one indented line each.

    Args:
        dimension: Object exposing ``width`` and ``height`` attributes.
    """
    width_text = str(dimension.width)
    print(f"    Width: {width_text}")
    height_text = str(dimension.height)
    print(f"    Height: {height_text}")

def print_detected_languages(detected_languages: Sequence[documentai.Document.Page.DetectedLanguage]) -> None:
    """Print a header followed by one line per detected language.

    Args:
        detected_languages: Iterable of objects exposing ``language_code`` and
            a numeric ``confidence`` (rendered as a percentage with one decimal).
    """
    print("    Detected languages:")
    for detected in detected_languages:
        code = detected.language_code
        certainty = detected.confidence
        print(f"        {code} ({certainty:.1%} confidence)")

def print_image_quality_scores(image_quality_scores: documentai.Document.Page.ImageQualityScores) -> None:
    """Print the overall quality score and each detected defect.

    Args:
        image_quality_scores: Object exposing a numeric ``quality_score`` and an
            iterable ``detected_defects`` whose items have ``type_`` and
            ``confidence``. Numeric values are rendered as percentages.
    """
    overall = image_quality_scores.quality_score
    print(f"    Quality score: {overall:.1%}")
    print("    Detected defects:")
    for defect in image_quality_scores.detected_defects:
        defect_name = defect.type_
        defect_confidence = defect.confidence
        print(f"        {defect_name}: {defect_confidence:.1%}")

def print_entity(entity: documentai.Document.Entity) -> None:
    """Print one extracted entity as an indented bullet block.

    The raw value is taken from ``entity.text_anchor.content`` and falls back
    to ``entity.mention_text`` when the anchor content is empty. The normalized
    value line is printed only when ``entity.normalized_value.text`` is
    non-empty.

    Every entity is terminated by a blank line so that consecutive entities
    stay visually separated; previously the separator was emitted only
    together with the normalized value, so entities without one ran together.

    Args:
        entity: Object exposing ``type_``, ``text_anchor.content``,
            ``mention_text``, ``confidence`` and ``normalized_value.text``.
    """
    key = entity.type_
    text_value = entity.text_anchor.content or entity.mention_text
    confidence = entity.confidence
    normalized_value = entity.normalized_value.text
    print(f"    * Entity: {key!r}")
    print(f"    * Raw Value: {text_value!r} ({confidence:.1%} confidence)")
    if normalized_value:
        print(f"    * Normalized Value: {normalized_value!r}")
    # Always close the entity with a blank line, with or without a normalized value.
    print()

In [70]:
# Report the page count first, then per-page details, then the entities.
print(f"There are {len(online_document.pages)} page(s) in this document.\n")

# Looping over an empty page collection does nothing, so no separate
# emptiness check is needed before the loop.
for page in online_document.pages:
    print(f"Page {page.page_number}:")
    print_page_dimensions(page.dimension)
    print_detected_languages(page.detected_languages)
    # Quality scores are only printed when the page carries them.
    if page.image_quality_scores:
        print_image_quality_scores(page.image_quality_scores)

# The entity header is skipped entirely when nothing was extracted.
if online_document.entities:
    print(f"\nFound {len(online_document.entities)} entities:\n")
    for entity in online_document.entities:
        print_entity(entity)

There are 1 page(s) in this document.

Page 1:
    Width: 1758.0
    Height: 2275.0
    Detected languages:
        en (80.1% confidence)

Found 16 entities:

    * Entity: 'wages_tips_other_compensation'
    * Raw Value: '210325.45' (100.0% confidence)
    * Normalized Value: '210325.45'

    * Entity: 'wages_tips_other_compensation'
    * Raw Value: '210325.45' (100.0% confidence)
    * Normalized Value: '210325.45'

    * Entity: 'employees_social_security_number'
    * Raw Value: '399-60-1133' (100.0% confidence)
    * Entity: 'employees_social_security_number'
    * Raw Value: '399-60-1133' (100.0% confidence)
    * Entity: 'employer_identification_number'
    * Raw Value: '27-5426623' (100.0% confidence)
    * Entity: 'employer_identification_number'
    * Raw Value: '27-5426623' (100.0% confidence)
    * Entity: 'control_number'
    * Raw Value: '4568872' (100.0% confidence)
    * Normalized Value: '4568872'

    * Entity: 'control_number'
    * Raw Value: '4568872' (100.0% conf

In [None]:
# Generic summarization prompt.
# NOTE(review): the next cell assigns `prompt` again (W-2 field extraction), so
# this value is only in effect if that cell is skipped.
prompt = """
You are a very professional document summarization specialist.
Please summarize the given document.
"""

In [65]:
# W-2 field-extraction prompt sent to the generative model together with the PDF.
# The example JSON is wrapped in a Markdown code fence; the fence is closed
# explicitly so the example is a well-formed block (it was previously left open).
prompt = """
Task: Extract W-2 Tax Form Information

Objective: 
Accurately identify and extract the following fields from the provided document, which is expected to be a W-2 tax form or a representation thereof:

* Employee's Social Security Number
* Employer Identification Number (EIN)
* Employee's Name
* Employer's Name
* Employer's Address
* Control Number (if present)
* Wages, Tips, and Other Compensation (Box 1)
* Federal Income Tax Withheld (Box 2)
* Social Security Wages (Box 3)
* Social Security Tax Withheld (Box 4)
* Medicare Wages and Tips (Box 5)
* Medicare Tax Withheld (Box 6)
* Social Security Tips (Box 7)
* Allocated Tips (Box 8)
* Dependent Care Benefits (Box 10)
* Nonqualified Plan Contributions (Box 11)
* State and Local Information (Boxes 15-20):
    * State
    * Employer's State ID Number
    * State Wages, Tips, Etc.
    * State Income Tax Withheld
    * Local Wages, Tips, Etc.
    * Local Income Tax Withheld
    * Locality Name

Guidelines:

* Prioritize accuracy. If a field cannot be confidently extracted, indicate it as "Not Found" or a similar placeholder.
* Handle variations in document formatting and layout.
* If the document contains multiple W-2 forms, extract information for each one separately.
* Format the extracted data in a structured manner, such as a JSON object or a table, for easy further processing.

Example Output (JSON):

```
{
  "Employee's Social Security Number": "***-**-****",
  "Employer Identification Number (EIN)": "**-*******",
  "Employee's Name": "John Doe",
  "Employer's Name": "Acme Corporation",
  "Employer's Address": "123 Main Street, Anytown, USA",
  "Control Number": "12345",
  "Wages, Tips, and Other Compensation (Box 1)": "50000.00",
  "Federal Income Tax Withheld (Box 2)": "5000.00",
  "Social Security Wages (Box 3)": "45000.00",
  "Social Security Tax Withheld (Box 4)": "2800.00",
  "Medicare Wages and Tips (Box 5)": "50000.00",
  "Medicare Tax Withheld (Box 6)": "725.00",
  "Social Security Tips (Box 7)": "0.00",
  "Allocated Tips (Box 8)": "0.00",
  "Dependent Care Benefits (Box 10)": "0.00",
  "Nonqualified Plan Contributions (Box 11)": "0.00",
  "State": "CA",
  "Employer's State ID Number": "123456789",
  "State Wages, Tips, Etc.": "50000.00",
  "State Income Tax Withheld": "2000.00",
  "Local Wages, Tips, Etc.": "0.00",
  "Local Income Tax Withheld": "0.00",
  "Locality Name": "" 
}
```
"""

In [66]:
import vertexai

from vertexai.generative_models import GenerativeModel, Part


# Initialise the Vertex AI SDK for the same project as the earlier cells; the
# region is hard-coded here rather than taken from `location`.
vertexai.init(project=project_id, location="us-central1")

# Generative model used for the prompt-based extraction.
model = GenerativeModel("gemini-1.5-flash-001")

# NOTE(review): this rebinds `file_path`, which the configuration cell set to a
# local path, to a Cloud Storage URI — re-running the online-processing cell
# afterwards would pass this URI instead.
file_path = "gs://test-gemini-extraction/W2_XL_input_clean_2895.pdf"
# The MIME type literal repeats the value assigned to `mime_type` in the
# configuration cell.
pdf_file = Part.from_uri(file_path, mime_type="application/pdf")
# The PDF part and the text prompt are passed together as the request contents.
contents = [pdf_file, prompt]

response = model.generate_content(contents)
print(response.text)

```json
{
  "Employee's Social Security Number": "399-60-1133",
  "Employer Identification Number (EIN)": "27-5426623",
  "Employee's Name": "Christopher Gonzales",
  "Employer's Name": "Black LLC Ltd",
  "Employer's Address": "513 Aaron Bypass Apt. 233\nSherrystad MD 02652-4962",
  "Control Number": "4568872",
  "Wages, Tips, and Other Compensation (Box 1)": "210325.45",
  "Federal Income Tax Withheld (Box 2)": "62172.84",
  "Social Security Wages (Box 3)": "247797.13",
  "Social Security Tax Withheld (Box 4)": "18956.48",
  "Medicare Wages and Tips (Box 5)": "220635.43",
  "Medicare Tax Withheld (Box 6)": "6398.43",
  "Social Security Tips (Box 7)": "247797.13",
  "Allocated Tips (Box 8)": "220635.43",
  "Dependent Care Benefits (Box 10)": "152",
  "Nonqualified Plan Contributions (Box 11)": "249",
  "State": "KS",
  "Employer's State ID Number": "271-31-788",
  "State Wages, Tips, Etc.": "105342.78",
  "State Income Tax Withheld": "8206.88",
  "Local Wages, Tips, Etc.": "156177.25",