# Google Document AI PDF Parser

This notebook demonstrates how to use Google Cloud Document AI to parse PDF documents, extract text, tables, form fields, and entities, and estimate the processing cost.


In [1]:
import os
import json
from typing import Optional, List, Dict
import pandas as pd
from google.cloud import documentai_v1 as documentai
from google.oauth2 import service_account
import time
from pathlib import Path


## Document AI Parser Class with Cost Estimation


In [2]:
class DocumentAIParser:
    """Parse PDFs with a Document AI processor and keep a running cost estimate.

    The instance owns a ``DocumentProcessorServiceClient`` plus the fully
    qualified processor name, and accumulates counters in ``processing_stats``
    and an estimated dollar amount in ``total_cost`` across calls.
    """

    def __init__(
        self,
        project_id: str,
        location: str,
        processor_id: str,
        credentials_path: str,
        cost_per_page: float = 0.0015,
        cost_per_1000_chars: float = 0.0001,
    ):
        """Initialize Document AI client with credentials.

        Args:
            project_id: Google Cloud project that owns the processor.
            location: Processor location, e.g. "us" or "eu".
            processor_id: ID of the Document AI processor to call.
            credentials_path: Path to a service-account JSON key file.
            cost_per_page: Estimated USD charged per processed page.
            cost_per_1000_chars: Estimated USD charged per 1000 extracted
                characters.

        NOTE(review): the default rates are this notebook's own assumptions;
        confirm them against the official pricing for your processor type.
        """
        # Set up credentials
        credentials = service_account.Credentials.from_service_account_file(
            credentials_path
        )

        # Initialize client against the endpoint of the processor's location.
        # Without an explicit endpoint, requests for processors outside "us"
        # (e.g. "eu") are sent to the wrong region.
        self.client = documentai.DocumentProcessorServiceClient(
            credentials=credentials,
            client_options={
                "api_endpoint": f"{location}-documentai.googleapis.com"
            },
        )

        # Set processor path
        self.processor_name = self.client.processor_path(
            project_id, location, processor_id
        )

        self.project_id = project_id
        self.location = location

        # Cost tracking: rates are configurable, counters start at zero
        self.cost_per_page = cost_per_page
        self.cost_per_1000_chars = cost_per_1000_chars
        self.reset_cost_tracking()

    def parse_pdf(self, file_path: str) -> documentai.Document:
        """Parse a PDF file using Document AI with cost tracking.

        Reads the file, sends it to the processor as ``application/pdf``,
        updates the page/character counters and ``total_cost``, prints a short
        report, and returns the processed document.

        Args:
            file_path: Path of the PDF file to send.

        Returns:
            The ``document`` attribute of the processing response.
        """
        # The timer starts before the file is read, so the reported duration
        # covers local I/O as well as the API round trip.
        start_time = time.time()

        # Read file and get size info
        file_size = os.path.getsize(file_path)
        print(f"Processing file: {file_path} ({file_size:,} bytes)")

        with open(file_path, 'rb') as file:
            file_content = file.read()

        # Configure the process request
        request = documentai.ProcessRequest(
            name=self.processor_name,
            raw_document=documentai.RawDocument(
                content=file_content,
                mime_type='application/pdf'
            )
        )

        # Process the document
        print("Sending request to Document AI...")
        result = self.client.process_document(request=request)

        processing_time = time.time() - start_time
        pages = len(result.document.pages)
        characters = len(result.document.text)

        # Update stats
        self.processing_stats['pages_processed'] += pages
        self.processing_stats['characters_processed'] += characters

        # Calculate cost for this document and add it to the running total
        total_doc_cost = self._page_cost(pages) + self._character_cost(characters)
        self.total_cost += total_doc_cost

        print(f"\nProcessing completed in {processing_time:.2f} seconds")
        print(f"Pages processed: {pages}")
        print(f"Characters extracted: {characters:,}")
        print(f"Estimated cost: ${total_doc_cost:.4f}")

        return result.document

    def extract_text(self, document: documentai.Document) -> str:
        """Extract all text from parsed document."""
        return document.text

    def extract_tables(self, document: documentai.Document) -> List[pd.DataFrame]:
        """Extract tables as pandas DataFrames.

        Every table with at least one body row becomes one DataFrame. The first
        header row (when present) supplies the column labels; further header
        rows are not used. Tables without body rows are skipped.

        Header and body rows may differ in width. In that case missing header
        labels are filled with ``column_<n>`` and short body rows are padded
        with ``None`` instead of letting pandas raise ``ValueError``.

        Returns:
            The DataFrames in page order, then table order within a page.
        """
        tables = []

        for page in document.pages:
            for table in page.tables:
                header_rows = [
                    self._row_texts(row, document.text) for row in table.header_rows
                ]
                body_rows = [
                    self._row_texts(row, document.text) for row in table.body_rows
                ]

                df = self._build_table_frame(header_rows, body_rows)
                if df is not None:
                    tables.append(df)

        self.processing_stats['tables_extracted'] += len(tables)
        return tables

    def _row_texts(self, row, text: str) -> List[str]:
        """Return the stripped text of every cell in a table row."""
        return [self._get_text_from_layout(cell.layout, text) for cell in row.cells]

    @staticmethod
    def _build_table_frame(
        header_rows: List[List[str]],
        body_rows: List[List[str]]
    ) -> Optional[pd.DataFrame]:
        """Build a DataFrame from extracted rows, or None when there is no body."""
        if not body_rows:
            return None
        if not header_rows:
            return pd.DataFrame(body_rows)

        columns = list(header_rows[0])
        width = max(len(columns), max(len(row) for row in body_rows))

        # Body wider than header: invent positional labels for the extra columns.
        columns += [f"column_{n}" for n in range(len(columns) + 1, width + 1)]
        # Header wider than a body row: pad the row so every row has `width` cells.
        padded_rows = [list(row) + [None] * (width - len(row)) for row in body_rows]

        return pd.DataFrame(padded_rows, columns=columns)

    def extract_form_fields(self, document: documentai.Document) -> Dict[str, str]:
        """Extract form fields as key-value pairs.

        Field names are used as dictionary keys, so when the same name occurs
        more than once the last value seen wins. The stats counter is increased
        by the number of distinct names returned.
        """
        fields = {}

        for page in document.pages:
            for field in page.form_fields:
                field_name = self._get_text_from_layout(
                    field.field_name, document.text
                )
                field_value = self._get_text_from_layout(
                    field.field_value, document.text
                )
                fields[field_name] = field_value

        self.processing_stats['form_fields_extracted'] += len(fields)
        return fields

    def extract_entities(self, document: documentai.Document) -> List[Dict]:
        """Extract entities identified by Document AI.

        Returns:
            One dict per entity with the keys ``type``, ``text`` and
            ``confidence``. ``text`` is the entity's ``mention_text``; when that
            is empty, the text covered by the entity's text anchor is used
            instead so the entry is not blank.
        """
        entities = []

        for entity in document.entities:
            mention = entity.mention_text or self._get_text_from_anchor(
                entity.text_anchor, document.text
            )
            entities.append({
                'type': entity.type_,
                'text': mention,
                'confidence': entity.confidence
            })

        self.processing_stats['entities_extracted'] += len(entities)
        return entities

    def _get_text_from_layout(
        self,
        layout: documentai.Document.Page.Layout,
        text: str
    ) -> str:
        """Extract text from layout element."""
        return self._get_text_from_anchor(layout.text_anchor, text)

    @staticmethod
    def _get_text_from_anchor(text_anchor, text: str) -> str:
        """Join the slices of ``text`` named by the anchor's segments, stripped."""
        pieces = []

        for segment in text_anchor.text_segments:
            # A falsy start_index (unset or 0) means the segment starts at 0.
            start_index = int(segment.start_index) if segment.start_index else 0
            end_index = int(segment.end_index)
            pieces.append(text[start_index:end_index])

        return "".join(pieces).strip()

    def _page_cost(self, pages: int) -> float:
        """Estimated cost in USD for the given number of pages."""
        return pages * self.cost_per_page

    def _character_cost(self, characters: int) -> float:
        """Estimated cost in USD for the given number of characters."""
        return (characters / 1000) * self.cost_per_1000_chars

    def get_cost_summary(self) -> Dict:
        """Get detailed cost and processing summary.

        Returns:
            A dict with ``total_cost_usd`` (rounded to 4 decimals), a copy of
            ``processing_stats``, and a ``cost_breakdown`` recomputed from the
            accumulated page and character counters.
        """
        return {
            'total_cost_usd': round(self.total_cost, 4),
            'processing_stats': self.processing_stats.copy(),
            'cost_breakdown': {
                'pages_cost': round(
                    self._page_cost(self.processing_stats['pages_processed']), 4
                ),
                'characters_cost': round(
                    self._character_cost(self.processing_stats['characters_processed']), 4
                )
            }
        }

    def reset_cost_tracking(self):
        """Reset cost tracking for new session."""
        self.total_cost = 0.0
        self.processing_stats = {
            'pages_processed': 0,
            'characters_processed': 0,
            'tables_extracted': 0,
            'entities_extracted': 0,
            'form_fields_extracted': 0
        }


## Configuration

**IMPORTANT**: Update these configuration values with your actual Google Cloud credentials and settings.


In [16]:
# Configuration - UPDATE THESE VALUES, or export the matching environment
# variables so that project details and key-file locations do not have to be
# edited into (and committed with) the notebook. Each value falls back to the
# literal below when its environment variable is not set.
PROJECT_ID = os.environ.get("DOCUMENTAI_PROJECT_ID", "marine-actor-473300-h8")  # Your Google Cloud Project ID
LOCATION = os.environ.get("DOCUMENTAI_LOCATION", "us")  # or "eu" - your processor location
PROCESSOR_ID = os.environ.get("DOCUMENTAI_PROCESSOR_ID", "2258be222035edb3")  # Your Document AI processor ID
CREDENTIALS_PATH = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/divyanshmac/Documents/Google Cloud/credentials.json",
)  # Path to your service account key file

# PDF file to process - UPDATE THIS PATH (or set DOCUMENTAI_PDF_PATH)
PDF_PATH = os.environ.get("DOCUMENTAI_PDF_PATH", "../data/GCP_EXTRACTION.pdf")  # Path to the PDF you want to parse

print("Configuration:")
print(f"Project ID: {PROJECT_ID}")
print(f"Location: {LOCATION}")
print(f"Processor ID: {PROCESSOR_ID}")
print(f"Credentials: {CREDENTIALS_PATH}")
print(f"PDF Path: {PDF_PATH}")


Configuration:
Project ID: marine-actor-473300-h8
Location: us
Processor ID: 2258be222035edb3
Credentials: /Users/divyanshmac/Documents/Google Cloud/credentials.json
PDF Path: ../data/GCP_EXTRACTION.pdf


## Initialize Parser


In [17]:
# Build the parser from the values defined in the configuration cell.
# Any failure is reported instead of raised so the notebook keeps running.
try:
    parser = DocumentAIParser(
        credentials_path=CREDENTIALS_PATH,
        project_id=PROJECT_ID,
        location=LOCATION,
        processor_id=PROCESSOR_ID,
    )
    print("✅ Document AI Parser initialized successfully!")
except Exception as init_error:
    print(
        f"❌ Error initializing parser: {init_error}",
        "Please check your configuration values and credentials.",
        sep="\n",
    )


✅ Document AI Parser initialized successfully!


E0000 00:00:1758851588.521690 20886014 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


## Parse PDF Document


In [18]:
# Send the configured PDF to Document AI; later cells read `document`.
# Any failure is reported instead of raised so the notebook keeps running.
try:
    print("Starting PDF parsing...")
    document = parser.parse_pdf(PDF_PATH)
    print("✅ PDF parsed successfully!")
except Exception as parse_error:
    print(
        f"❌ Error parsing PDF: {parse_error}",
        "Please check your PDF path and processor configuration.",
        sep="\n",
    )


Starting PDF parsing...
Processing file: ../data/GCP_EXTRACTION.pdf (308,151 bytes)
Sending request to Document AI...

Processing completed in 5.90 seconds
Pages processed: 5
Characters extracted: 12,824
Estimated cost: $0.0088
✅ PDF parsed successfully!


## Extract Text Content


In [19]:
# Pull the full text out of the parsed document, preview it, and save it.
if 'document' not in locals():
    print("❌ No document available. Please run the parsing cell first.")
else:
    text = parser.extract_text(document)
    print(f"Extracted {len(text):,} characters of text")
    print(f"First 500 characters:\n{text[:500]}...")

    # Make sure the output folder exists, then write the text as UTF-8
    output_dir = Path("../data/parsed/documentai_output")
    output_dir.mkdir(parents=True, exist_ok=True)

    with (output_dir / "extracted_text.txt").open("w", encoding="utf-8") as f:
        f.write(text)
    print(f"✅ Text saved to {output_dir / 'extracted_text.txt'}")


Extracted 12,824 characters of text
First 500 characters:
PARTI
Item 1B, 1C
ITEM 1B. UNRESOLVED STAFF COMMENTS
We have received no written comments regarding our periodic or current reports from the staff of the Securities and
Exchange Commission that were issued 180 days or more preceding the end of our fiscal year 2024 that remain
unresolved.
ITEM 1C. CYBERSECURITY
RISK MANAGEMENT AND STRATEGY
Microsoft plays a central role in the world's digital ecosystem. We have made it the top corporate priority to protect the
computing environment used by our cu...
✅ Text saved to ../data/parsed/documentai_output/extracted_text.txt


## Extract Tables


In [20]:
# Extract tables, preview each one, and save it as CSV
if 'document' in locals():
    tables = parser.extract_tables(document)
    print(f"Found {len(tables)} tables")

    if tables:
        # Save tables to CSV files. The directory is created here as well so
        # this cell does not depend on the text-extraction cell having run.
        output_dir = Path("../data/parsed/documentai_output")
        output_dir.mkdir(parents=True, exist_ok=True)

        for table_number, table in enumerate(tables, start=1):
            print(f"\nTable {table_number}:")
            print(f"Shape: {table.shape}")
            print(table.head())

            # Save to CSV
            csv_path = output_dir / f"table_{table_number}.csv"
            table.to_csv(csv_path, index=False)
            print(f"✅ Saved to {csv_path}")
    else:
        print("No tables found in the document.")
else:
    print("❌ No document available. Please run the parsing cell first.")


Found 3 tables

Table 1:
Shape: (2, 1)
                      Location\nOwned\nLeased\nTotal
0  U.S.\n30\n20\n50\nFi\n30\nInternational\n20\n1...
1                                  Total\n40\n45\n85
✅ Saved to ../data/parsed/documentai_output/table_1.csv

Table 2:
Shape: (2, 5)
  Declaration Date      Record Date        Payment Date Dividend\nPer Share  \
0                                                                             
1    June 12, 2024  August 15, 2024  September 12, 2024                0.75   

          Amount  
0  (In millions)  
1       $\n5,575  
✅ Saved to ../data/parsed/documentai_output/table_2.csv

Table 3:
Shape: (5, 5)
                           Period Total Number\nof Shares\nPurchased  \
0                                                                      
1  April 1, 2024 - April 30, 2024                          2,444,905   
2      May 1, 2024 - May 31, 2024                          2,233,450   
3    June 1, 2024 - June 30, 2024                          

## Extract Form Fields


In [21]:
# Extract form fields, list them, and save them as JSON
if 'document' in locals():
    fields = parser.extract_form_fields(document)

    if fields:
        print(f"Found {len(fields)} form fields:")
        for key, value in fields.items():
            print(f"  {key}: {value}")

        # Save form fields to JSON. The directory is created here as well so
        # this cell does not depend on the text-extraction cell having run.
        output_dir = Path("../data/parsed/documentai_output")
        output_dir.mkdir(parents=True, exist_ok=True)
        with open(output_dir / "form_fields.json", "w", encoding="utf-8") as f:
            json.dump(fields, f, indent=2)
        print(f"\n✅ Form fields saved to {output_dir / 'form_fields.json'}")
    else:
        print("No form fields found in the document.")
else:
    print("❌ No document available. Please run the parsing cell first.")


Found 5 form fields:
  Secure by Design:: Security comes first when designing any product or service.
  Declaration Date: June 12, 2024
  Dividend
Per Share: Amount
  Record Date: Declaration Date
  (In millions): 5,575

✅ Form fields saved to ../data/parsed/documentai_output/form_fields.json


## Extract Entities


In [22]:
# Extract entities, show a per-type preview, and save them as JSON
if 'document' in locals():
    entities = parser.extract_entities(document)

    if entities:
        print(f"Found {len(entities)} entities:")

        # Group entities by type, keeping first-seen order of the types
        entity_types = {}
        for entity in entities:
            entity_types.setdefault(entity['type'], []).append(entity)

        for entity_type, type_entities in entity_types.items():
            print(f"\n{entity_type} ({len(type_entities)} found):")
            for entity in type_entities[:5]:  # Show first 5 of each type
                print(f"  - {entity['text']} (confidence: {entity['confidence']:.2f})")
            if len(type_entities) > 5:
                print(f"  ... and {len(type_entities) - 5} more")

        # Save entities to JSON. The directory is created here as well so
        # this cell does not depend on the text-extraction cell having run.
        output_dir = Path("../data/parsed/documentai_output")
        output_dir.mkdir(parents=True, exist_ok=True)
        with open(output_dir / "entities.json", "w", encoding="utf-8") as f:
            json.dump(entities, f, indent=2)
        print(f"\n✅ Entities saved to {output_dir / 'entities.json'}")
    else:
        print("No entities found in the document.")
else:
    print("❌ No document available. Please run the parsing cell first.")


Found 4 entities:

generic_entities (4 found):
  -  (confidence: 0.00)
  -  (confidence: 0.00)
  -  (confidence: 0.00)
  -  (confidence: 0.00)

✅ Entities saved to ../data/parsed/documentai_output/entities.json


## Cost Summary


In [23]:
# Get cost summary, print it, and save it as JSON.
# Guarded like the extraction cells: if the initialization cell failed there is
# no `parser`, and a message is clearer than a NameError.
if 'parser' in locals():
    cost_summary = parser.get_cost_summary()
    stats = cost_summary['processing_stats']
    breakdown = cost_summary['cost_breakdown']

    print("📊 Processing Summary:")
    print(f"Total Cost: ${cost_summary['total_cost_usd']}")
    print(f"Pages Processed: {stats['pages_processed']}")
    print(f"Characters Processed: {stats['characters_processed']:,}")
    print(f"Tables Extracted: {stats['tables_extracted']}")
    print(f"Form Fields Extracted: {stats['form_fields_extracted']}")
    print(f"Entities Extracted: {stats['entities_extracted']}")

    print("\n💰 Cost Breakdown:")
    print(f"Pages Cost: ${breakdown['pages_cost']}")
    print(f"Characters Cost: ${breakdown['characters_cost']}")

    # Save cost summary. The directory is created here as well so this cell
    # does not depend on the text-extraction cell having run.
    output_dir = Path("../data/parsed/documentai_output")
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "cost_summary.json", "w", encoding="utf-8") as f:
        json.dump(cost_summary, f, indent=2)
    print(f"\n✅ Cost summary saved to {output_dir / 'cost_summary.json'}")
else:
    print("❌ No parser available. Please run the initialization cell first.")


📊 Processing Summary:
Total Cost: $0.0088
Pages Processed: 5
Characters Processed: 12,824
Tables Extracted: 3
Form Fields Extracted: 5
Entities Extracted: 4

💰 Cost Breakdown:
Pages Cost: $0.0075
Characters Cost: $0.0013

✅ Cost summary saved to ../data/parsed/documentai_output/cost_summary.json


## Reset for New Document

Run this cell to reset cost tracking before processing a new document.


In [None]:
# Reset cost tracking.
# Guarded like the extraction cells: if the initialization cell failed there is
# no `parser`, and a message is clearer than a NameError.
if 'parser' in locals():
    parser.reset_cost_tracking()
    print("✅ Cost tracking reset. Ready for new document processing.")
else:
    print("❌ No parser available. Please run the initialization cell first.")


## Notes

### Setup Requirements:
1. **Google Cloud Project**: Create a project in Google Cloud Console
2. **Enable Document AI API**: Enable the Document AI API in your project
3. **Create Processor**: Create a Document AI processor (Form Parser, OCR, etc.)
4. **Service Account**: Create a service account and download the JSON key file
5. **Update Configuration**: Update the configuration cell with your actual values

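### Cost Information:
- **Per Page**: $0.0015 USD (the `cost_per_page` rate used by `DocumentAIParser`)
- **Per 1000 Characters**: $0.0001 USD (the `cost_per_1000_chars` rate used by `DocumentAIParser`)
- Costs are rough estimates based on 2024 pricing assumptions. Actual Document AI charges depend on the processor type, so check the official Google Cloud Document AI pricing page before relying on these figures.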

### Output Files:
All extracted data is saved to `../data/parsed/documentai_output/`:
- `extracted_text.txt`: Full text content
- `table_*.csv`: Extracted tables as CSV files
- `form_fields.json`: Form fields as key-value pairs
- `entities.json`: Extracted entities with confidence scores
- `cost_summary.json`: Processing statistics and cost breakdown
