In [1]:
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import RetryError
from google.cloud import documentai
from google.cloud import storage

import re
import uuid

from extractor import BatchDocumentExtractor
from extractor import OnlineDocumentExtractor

from entity_processor import DocumentAIEntityExtractor, ModelBasedEntityExtractor

from prompts_module import get_extract_entities_prompt,get_compare_entities_prompt
from temp_file_uploader import TempFileUploader
# Batch processing

import vertexai
from vertexai.generative_models import GenerativeModel

In [2]:
# Notebook configuration. The values below are placeholders; replace them
# with real project/processor identifiers and bucket URIs before running.
project_id = "project-id"
location = "us"  # Or other supported locations like 'eu'
processor_id = "processor-id"
processor_version_id = "processor-version-id"  # Optional for batch processing
# File to process
file_path = "test_file.pdf"
mime_type = "application/pdf"

gcs_output_uri = "gs://bucket-output"  # GCS URI for output
gcs_temp_uri = "gs://bucket-temp"  # GCS URI for temporary uploads (handed to TempFileUploader)

In [3]:

# Process the local file with the online extractor (presumably a synchronous
# Document AI request; confirm in extractor.py).
online_extractor = OnlineDocumentExtractor(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    # Version pinning is disabled here; which processor version is used in
    # that case is decided inside OnlineDocumentExtractor (TODO confirm).
    # processor_version_id=processor_version_id
)
online_document = online_extractor.process_document(file_path, mime_type)

# Extract the entities from the processed document. The result is displayed
# and fed into the comparison prompt by later cells.
docai_entity_extractor = DocumentAIEntityExtractor(online_document)
docai_entities = docai_entity_extractor.extract_entities()

In [7]:
# Display the entities extracted by the online processor.
# NOTE(review): the rendered output includes an SSN-like value; confirm the
# sample document is synthetic before sharing or committing this notebook.
docai_entities

{'employees_social_security_number': '399-60-1133',
 'social_security_wages': '247797.13',
 'control_number': '4568872',
 'wages_tips_other_compensation': '210325.45',
 'federal_income_tax_withheld': '62172.84',
 'employer_identification_number': '27-5426623',
 'employers_name_address_and_zip_code': 'Black LLC Ltd\n513 Aaron Bypass Apt. 233\nSherrystad\nMD 02652-4962',
 'social_security_tax_withheld': '18956.48'}

In [5]:
# Stage the local file under the temporary GCS location so the model-based
# extractor can be given a URI instead of a local path.
temp_file_uploader = TempFileUploader(gcs_temp_uri)
gcs_input_uri = temp_file_uploader.upload_file(file_path)

try:
    # Build the extraction prompt and run the Gemini-based extractor on the
    # staged file.
    prompt_extract = get_extract_entities_prompt()
    model_extractor = ModelBasedEntityExtractor(
        "gemini-1.5-flash-001", prompt_extract, gcs_input_uri
    )
    gemini_entities = model_extractor.extract_entities()
finally:
    # Always clean up the staged upload, even when prompt construction or
    # extraction raises; otherwise the file would be left in the temp bucket.
    # The upload itself stays outside the try block so a failed upload does
    # not trigger a delete of something that was never created.
    temp_file_uploader.delete_file()

In [8]:
# Fill the comparison template with both extraction results (stringified, as
# the template expects plain text), then ask the model to compare them.
compare_prompt = get_compare_entities_prompt().format(
    docai_output=str(docai_entities),
    gemini_output=str(gemini_entities),
)

model = GenerativeModel("gemini-1.5-flash-001")
docai_gemini_response_analysis = model.generate_content(compare_prompt)
print(docai_gemini_response_analysis.text)

## DocAI vs. Gemini Output Analysis

Here's a breakdown of the similarities and differences between the DocAI and Gemini outputs:

**Similarities:**

* **Entities with matching values:** 
    * `employees_social_security_number` (DocAI) / `Employee's Social Security Number` (Gemini): '399-60-1133'
    * `employer_identification_number` (DocAI) / `Employer Identification Number (EIN)` (Gemini): '27-5426623'
    * `control_number` (DocAI) / `Control Number` (Gemini): '4568872'
    * `wages_tips_other_compensation` (DocAI) / `Wages, Tips, and Other Compensation (Box 1)` (Gemini): '210325.45'
    * `federal_income_tax_withheld` (DocAI) / `Federal Income Tax Withheld (Box 2)` (Gemini): '62172.84'
    * `social_security_wages` (DocAI) / `Social Security Wages (Box 3)` (Gemini): '247797.13'
    * `social_security_tax_withheld` (DocAI) / `Social Security Tax Withheld (Box 4)` (Gemini): '18956.48'
* **Similar data structure:** Both outputs are dictionaries with key-value pairs.


**Differences: