# Textract EDS Basic Processing

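This notebook processes PDF documents using the AWS Textract QUERIES feature. Note: despite the "basic" name, the code below attaches the custom adapter configured in `CUSTOM_ADAPTER_ID` to every request. It: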
1. Reads the zero-shot results CSV to identify documents with forms
2. Extracts the lowest page number from each document's form_pages
3. Sends individual pages to Textract using standard QUERIES feature
4. Extracts three specific pieces of information from EDS forms


In [34]:
import json
import os
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, List, Optional

import boto3
import pandas as pd
import PyPDF2

## Configuration

In [36]:
# Configuration
# Every constant in this cell is read by later cells; edit values here rather
# than inline.
CLOBBER = False  # Set to True to overwrite existing results, False to skip already processed files

# File paths. These are relative paths, so they resolve against the current
# working directory of the kernel.
CSV_PATH = "../../code/preprocessing/zero_shot_results_full_corpus.csv"
CONTRACTS_DIR = "../../data/raw/_contracts/"
OUTPUT_DIR = "../../data/intermediate_products/eds_forms_textract_textblurbs/"

# AWS Textract configuration
# CUSTOM_ADAPTER_ID and ADAPTER_VERSION are sent as the 'AdapterId' and
# 'Version' of the request's AdaptersConfig by process_with_textract_basic.
REGION_NAME = "us-east-1"  # Update with your region
CUSTOM_ADAPTER_ID = "4403a7771cbe"  # Update with your custom adapter ID
ADAPTER_VERSION = "2"  # Update with your adapter version

# Feature types - only QUERIES works with custom adapters
# NOTE(review): the statement above is the original author's; confirm against
# the current Textract adapter documentation before adding other features.
FEATURE_TYPES = ['QUERIES']

# Three specific queries for EDS basic processing.
# Used as the default query list when process_with_textract_basic is called
# without an explicit `queries` argument.
EDS_QUERIES = [
    "EDS Number",
    "What text is provided under 'Description of work and justification for spending money'?",
    "What text is provided under 'Justification of vendor selection and determination of price reasonableness'?"
]

# Initialize AWS client (module-level; process_with_textract_basic calls
# analyze_document on this object)
textract_client = boto3.client('textract', region_name=REGION_NAME)

## Load and Filter Data

In [38]:
# Read the zero-shot results table from disk
df = pd.read_csv(CSV_PATH)
print(f"Total documents in CSV: {len(df)}")

# Keep only rows flagged as containing a form. The result is copied so that
# columns added to forms_df later never write back into df.
has_form = df['contains_form'] == True
forms_df = df.loc[has_form].copy()
print(f"Documents with forms: {len(forms_df)}")

# Preview the columns the rest of the notebook relies on
preview_columns = ['filename', 'form_pages', 'num_form_pages']
print("\nSample of documents with forms:")
print(forms_df[preview_columns].head())

Total documents in CSV: 42490
Documents with forms: 26109

Sample of documents with forms:
                               filename      form_pages  num_form_pages
804                           0-001.pdf  1,2,4,10,15,16               6
956   0000000000000000000021689-004.pdf               5               1
1660  0000000000000000000025661-003.pdf               5               1
2895  0000000000000000000029856-000.pdf               1               1
3996  0000000000000000000031808-000.pdf               1               1


In [39]:
# Limit the run to the first 100 form documents (sample/debug cap; remove this
# cell to process the full corpus). The explicit .copy() makes forms_df an
# independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning for writing to a slice.
forms_df = forms_df.iloc[:100].copy()

## Helper Functions

In [41]:
def get_lowest_page_number(form_pages_str: str) -> Optional[int]:
    """
    Extract the lowest page number from the form_pages string.

    Empty tokens (e.g. from a trailing comma) are ignored, and whole numbers
    written in float form (e.g. "5.0", or a float cell value such as 5.0) are
    accepted.

    Args:
        form_pages_str: Page numbers as a single value or a comma-separated
            list (e.g., "1,2,4,10,15,16" or "5"). Missing values (None/NaN)
            and blank strings are allowed.

    Returns:
        The lowest page number, or None when the value is missing or contains
        no page numbers.

    Raises:
        ValueError: If a token is not a whole number (e.g. "abc" or "2.5").
    """
    if pd.isna(form_pages_str):
        return None

    page_numbers = []
    for token in str(form_pages_str).split(','):
        token = token.strip()
        if not token:
            # Skip blanks such as the empty piece after a trailing comma.
            continue
        try:
            page_numbers.append(int(token))
        except ValueError:
            # Tolerate float-formatted whole numbers ("5.0"); anything that is
            # not a whole number re-raises the original ValueError.
            as_float = float(token)
            if not as_float.is_integer():
                raise
            page_numbers.append(int(as_float))

    if not page_numbers:
        return None

    return min(page_numbers)

def get_output_filename(filename: str, page_number: int) -> str:
    """
    Build the name of the JSON result file for one page of a PDF.

    The name is the PDF's stem (final path component without its last
    suffix) followed by "_page_<page_number>_textract_basic.json".

    Args:
        filename: Original PDF filename
        page_number: Page number

    Returns:
        str: Output JSON filename
    """
    stem = Path(filename).stem
    return "{}_page_{}_textract_basic.json".format(stem, page_number)

def file_already_processed(filename: str, page_number: int, output_dir: Path) -> bool:
    """
    Report whether a result file for this PDF page is already on disk.

    Args:
        filename: Original PDF filename
        page_number: Page number
        output_dir: Output directory path

    Returns:
        bool: True if the expected JSON file exists and is non-empty
    """
    candidate = output_dir / get_output_filename(filename, page_number)
    if not candidate.exists():
        return False
    # A zero-byte file does not count as a finished result.
    return candidate.stat().st_size > 0

def extract_single_page_pdf(pdf_path: str, page_number: int) -> bytes:
    """
    Extract a single page from a PDF file.

    Args:
        pdf_path: Path to the PDF file
        page_number: Page number to extract (1-indexed)

    Returns:
        bytes: PDF content of the single page

    Raises:
        ValueError: If page_number is less than 1. Without this check a value
            of 0 or below would index from the end of the document and
            silently return the wrong page.
        IndexError: If page_number is greater than the number of pages.
    """
    if page_number < 1:
        raise ValueError(
            f"page_number must be >= 1 (1-indexed), got {page_number}"
        )

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        total_pages = len(pdf_reader.pages)
        if page_number > total_pages:
            raise IndexError(
                f"page_number {page_number} is out of range; "
                f"{pdf_path} has {total_pages} page(s)"
            )

        pdf_writer = PyPDF2.PdfWriter()
        # PyPDF2 uses 0-based indexing, so subtract 1
        pdf_writer.add_page(pdf_reader.pages[page_number - 1])

        # Serialize while the source file is still open.
        with BytesIO() as output_buffer:
            pdf_writer.write(output_buffer)
            return output_buffer.getvalue()

def process_with_textract_basic(pdf_bytes: bytes, adapter_id: str, queries: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
    """
    Run Textract analyze_document with the QUERIES feature on one PDF page.

    When adapter_id is non-empty, the request includes an AdaptersConfig
    naming that adapter at ADAPTER_VERSION. When adapter_id is None or "",
    AdaptersConfig is omitted and a plain QUERIES request is sent.

    Args:
        pdf_bytes: PDF content as bytes
        adapter_id: Custom adapter ID, or None/"" to call Textract without
            an adapter
        queries: List of queries (uses EDS_QUERIES if not provided)

    Returns:
        dict: Textract response, or None if building or sending the request
        raised an exception (the error is printed, not re-raised)
    """
    try:
        # Use provided queries or default EDS queries
        if queries is None:
            queries = EDS_QUERIES

        request_params = {
            'Document': {'Bytes': pdf_bytes},
            'FeatureTypes': FEATURE_TYPES,
            'QueriesConfig': {
                'Queries': [{'Text': query} for query in queries]
            },
        }

        # Attach the adapter only when one was supplied.
        if adapter_id:
            request_params['AdaptersConfig'] = {
                'Adapters': [
                    {
                        'AdapterId': adapter_id,
                        'Version': ADAPTER_VERSION
                    }
                ]
            }

        return textract_client.analyze_document(**request_params)
    except Exception as e:
        # Deliberate best-effort: callers treat None as "processing failed".
        print(f"Error processing with Textract: {str(e)}")
        return None

In [50]:
# Compute, for every document, the first page that contains a form
forms_df['lowest_page'] = forms_df['form_pages'].apply(get_lowest_page_number)

# Drop documents with no usable page number, then store the column as
# integers
forms_df = forms_df.dropna(subset=['lowest_page'])
forms_df = forms_df.astype({'lowest_page': int})

print(f"Documents with valid page numbers: {len(forms_df)}")
print("\nSample with lowest page numbers:")
sample_columns = ['filename', 'form_pages', 'lowest_page']
print(forms_df[sample_columns].head(10))

Documents with valid page numbers: 100

Sample with lowest page numbers:
                               filename      form_pages  lowest_page
804                           0-001.pdf  1,2,4,10,15,16            1
956   0000000000000000000021689-004.pdf               5            5
1660  0000000000000000000025661-003.pdf               5            5
2895  0000000000000000000029856-000.pdf               1            1
3996  0000000000000000000031808-000.pdf               1            1
4488  0000000000000000000034656-000.pdf               1            1
5734  0000000000000000000036599-001.pdf               4            4
5749  0000000000000000000036618-001.pdf               4            4
5841  0000000000000000000036347-001.pdf            3,10            3
7128  0000000000000000000038668-000.pdf              14           14


## Process Documents

In [52]:
# Create output directory if it doesn't exist
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(parents=True, exist_ok=True)

# Per-run bookkeeping, read by the summary / display / error-analysis cells
results = []
errors = []
skipped = []

# Check existing results if CLOBBER is False
if not CLOBBER:
    print("CLOBBER = False: Checking for existing results...")
    already_processed = []
    for idx, row in forms_df.iterrows():
        filename = row['filename']
        # Plain int so the value stays JSON-serializable in the summary
        lowest_page = int(row['lowest_page'])
        if file_already_processed(filename, lowest_page, output_dir):
            already_processed.append(filename)
            # Record the skip here: these rows are filtered out below and so
            # never reach the per-document skip check in the main loop.
            skipped.append({'filename': filename, 'page_number': lowest_page})

    if already_processed:
        print(f"Found {len(already_processed)} already processed files. Skipping these.")
        # Filter out already processed files
        forms_df = forms_df[~forms_df['filename'].isin(already_processed)]
        print(f"Remaining files to process: {len(forms_df)}")
    else:
        print("No existing results found. Processing all files.")
else:
    print("CLOBBER = True: Processing all files (will overwrite existing results)")

# Process each document
for idx, row in forms_df.iterrows():
    filename = row['filename']
    lowest_page = int(row['lowest_page'])

    pdf_path = os.path.join(CONTRACTS_DIR, filename)

    # Check if file exists
    if not os.path.exists(pdf_path):
        error_msg = f"File not found: {filename}"
        print(error_msg)
        errors.append({'filename': filename, 'error': error_msg})
        continue

    # Double-check if file already processed (in case of race conditions)
    if not CLOBBER and file_already_processed(filename, lowest_page, output_dir):
        print(f"⏭ Skipping {filename} (already processed)")
        skipped.append({'filename': filename, 'page_number': lowest_page})
        continue

    try:
        print(f"Processing {filename}, page {lowest_page}...")

        # Extract the specific page
        page_pdf_bytes = extract_single_page_pdf(pdf_path, lowest_page)

        # Process with Textract using custom adapter and 3 queries
        textract_response = process_with_textract_basic(page_pdf_bytes, CUSTOM_ADAPTER_ID)

        if textract_response:
            # Save result immediately to avoid losing work
            json_filename = get_output_filename(filename, lowest_page)
            json_path = output_dir / json_filename

            # Write to a temporary sibling file and rename it into place. If
            # the run is interrupted mid-write, no partial (non-empty) .json
            # file is left behind to be mistaken for a finished result. The
            # ".tmp" suffix keeps the temp file out of "*.json" globs.
            tmp_path = json_path.with_name(json_path.name + '.tmp')
            with open(tmp_path, 'w') as f:
                json.dump(textract_response, f, indent=2, default=str)
            os.replace(tmp_path, json_path)

            result = {
                'filename': filename,
                'page_number': lowest_page,
                'json_file': json_filename,
                'status': 'success'
            }
            results.append(result)
            print(f"✓ Successfully processed {filename} → {json_filename}")
        else:
            error_msg = f"Textract processing failed for {filename}"
            print(f"✗ {error_msg}")
            errors.append({'filename': filename, 'error': error_msg})

    except Exception as e:
        # Keep going: one bad document must not abort the whole batch.
        error_msg = f"Error processing {filename}: {str(e)}"
        print(f"✗ {error_msg}")
        errors.append({'filename': filename, 'error': error_msg})

print(f"\nProcessing complete!")
print(f"Successfully processed: {len(results)} documents")
print(f"Skipped (already processed): {len(skipped)} documents")
print(f"Errors: {len(errors)} documents")

CLOBBER = False: Checking for existing results...
No existing results found. Processing all files.
Processing 0-001.pdf, page 1...
✓ Successfully processed 0-001.pdf → 0-001_page_1_textract_basic.json
Processing 0000000000000000000021689-004.pdf, page 5...
✓ Successfully processed 0000000000000000000021689-004.pdf → 0000000000000000000021689-004_page_5_textract_basic.json
Processing 0000000000000000000025661-003.pdf, page 5...
✓ Successfully processed 0000000000000000000025661-003.pdf → 0000000000000000000025661-003_page_5_textract_basic.json
Processing 0000000000000000000029856-000.pdf, page 1...
✓ Successfully processed 0000000000000000000029856-000.pdf → 0000000000000000000029856-000_page_1_textract_basic.json
Processing 0000000000000000000031808-000.pdf, page 1...
✓ Successfully processed 0000000000000000000031808-000.pdf → 0000000000000000000031808-000_page_1_textract_basic.json
Processing 0000000000000000000034656-000.pdf, page 1...
✓ Successfully processed 0000000000000000000034

KeyboardInterrupt: 

In [None]:
# Persist a run summary and, separately, any errors
if results or skipped:
    # When not clobbering, count the result JSON files already in the output
    # directory (the summary file itself is excluded)
    existing_files = []
    if not CLOBBER:
        existing_files = [
            json_file.name
            for json_file in output_dir.glob("*.json")
            if json_file.name != "processing_summary.json"
        ]

    # Summary of this run (including the count of existing files)
    newly_processed = [
        {key: r[key] for key in ('filename', 'page_number', 'json_file')}
        for r in results
    ]
    summary = {
        'processing_type': 'custom_adapter_limited_queries',
        'queries_used': EDS_QUERIES,
        'clobber_mode': CLOBBER,
        'newly_processed_count': len(results),
        'skipped_count': len(skipped),
        'error_count': len(errors),
        # existing_files stays empty when CLOBBER is True, so this is 0 then
        'total_existing_files': len(existing_files),
        'newly_processed_files': newly_processed,
        'skipped_files': skipped
    }

    summary_path = output_dir / 'processing_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)

# Write the error table only when something failed
if errors:
    errors_df = pd.DataFrame(errors)
    errors_csv_path = output_dir / 'processing_errors.csv'
    errors_df.to_csv(errors_csv_path, index=False)
    print(f"Errors saved to: {errors_csv_path}")

print(f"Results saved to: {output_dir}")
print(f"Newly processed JSON files: {len(results)}")
if skipped and not CLOBBER:
    print(f"Skipped existing files: {len(skipped)}")
print(f"Summary file: processing_summary.json")

In [None]:
# Display summary statistics
total_processed = len(results) + len(skipped)
if total_processed > 0:
    print("Processing Summary:")
    print(f"- Processing type: Custom adapter with limited queries")
    print(f"- Queries used: {len(EDS_QUERIES)}")
    for i, query in enumerate(EDS_QUERIES, 1):
        print(f"  {i}. {query}")
    print(f"- Total documents in scope: {total_processed}")
    print(f"- Newly processed: {len(results)}")
    if not CLOBBER and skipped:
        print(f"- Skipped (already processed): {len(skipped)}")
    print(f"- Total errors: {len(errors)}")
    # Success rate is over attempted documents only (skips are excluded)
    if len(results) + len(errors) > 0:
        print(f"- Success rate: {len(results)/(len(results)+len(errors))*100:.1f}%")

    # Show sample of extracted answers from first result
    if len(results) > 0:
        sample_result = results[0]
        # Fixed: the original line ended with an extra ')' (a SyntaxError)
        print(f"\nSample extraction from {sample_result['filename']} (page {sample_result['page_number']}):")

        # Load the JSON file to get Textract response
        json_path = output_dir / sample_result['json_file']
        with open(json_path, 'r') as f:
            textract_response = json.load(f)

        # Extract query results from Textract response
        textract_blocks = textract_response.get('Blocks', [])
        query_blocks = [block for block in textract_blocks if block['BlockType'] == 'QUERY']

        # Index blocks by Id once (first occurrence wins) so each answer
        # lookup does not rescan the whole block list
        blocks_by_id = {}
        for block in textract_blocks:
            blocks_by_id.setdefault(block['Id'], block)

        print("Query responses:")
        for query_block in query_blocks:
            query_text = query_block.get('Query', {}).get('Text', 'Unknown query')

            # Find the corresponding answer block: within each ANSWER
            # relationship, use the first referenced block that has 'Text'
            answer_text = "No answer found"
            if 'Relationships' in query_block:
                for relationship in query_block['Relationships']:
                    if relationship['Type'] == 'ANSWER':
                        answer_ids = relationship['Ids']
                        for answer_id in answer_ids:
                            answer_block = blocks_by_id.get(answer_id)
                            if answer_block and 'Text' in answer_block:
                                answer_text = answer_block['Text']
                                break

            print(f"Q: {query_text}")
            print(f"A: {answer_text}")
            print()
else:
    print("No files were processed.")

In [None]:
# Persist a run summary and, separately, any errors
if results or skipped:
    # When not clobbering, count the result JSON files already in the output
    # directory (the summary file itself is excluded)
    existing_files = []
    if not CLOBBER:
        existing_files = [
            json_file.name
            for json_file in output_dir.glob("*.json")
            if json_file.name != "processing_summary.json"
        ]

    # Summary of this run (including the count of existing files)
    newly_processed = [
        {key: r[key] for key in ('filename', 'page_number', 'json_file')}
        for r in results
    ]
    summary = {
        'processing_type': 'textract_eds_adapter_textblurbs',
        'queries_used': EDS_QUERIES,
        'clobber_mode': CLOBBER,
        'newly_processed_count': len(results),
        'skipped_count': len(skipped),
        'error_count': len(errors),
        # existing_files stays empty when CLOBBER is True, so this is 0 then
        'total_existing_files': len(existing_files),
        'newly_processed_files': newly_processed,
        'skipped_files': skipped
    }

    summary_path = output_dir / 'processing_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)

# Write the error table only when something failed
if errors:
    errors_df = pd.DataFrame(errors)
    errors_csv_path = output_dir / 'processing_errors.csv'
    errors_df.to_csv(errors_csv_path, index=False)
    print(f"Errors saved to: {errors_csv_path}")

print(f"Results saved to: {output_dir}")
print(f"Newly processed JSON files: {len(results)}")
if skipped and not CLOBBER:
    print(f"Skipped existing files: {len(skipped)}")
print(f"Summary file: processing_summary.json")

## Display Sample Results

In [None]:
# Print run statistics and one sample extraction
total_processed = len(results) + len(skipped)
if total_processed <= 0:
    print("No files were processed.")
else:
    print("Processing Summary:")
    print(f"- Processing type: Basic Textract (no custom adapter)")
    print(f"- Queries used: {len(EDS_QUERIES)}")
    for i, query in enumerate(EDS_QUERIES, 1):
        print(f"  {i}. {query}")
    print(f"- Total documents in scope: {total_processed}")
    print(f"- Newly processed: {len(results)}")
    if skipped and not CLOBBER:
        print(f"- Skipped (already processed): {len(skipped)}")
    print(f"- Total errors: {len(errors)}")
    # Success rate is over attempted documents only (skips are excluded)
    attempted = len(results) + len(errors)
    if attempted > 0:
        print(f"- Success rate: {len(results)/attempted*100:.1f}%")

    # Show the answers Textract returned for the first successful document
    if results:
        sample_result = results[0]
        print(f"\nSample extraction from {sample_result['filename']} (page {sample_result['page_number']}):")

        # Re-read the saved Textract response from disk
        json_path = output_dir / sample_result['json_file']
        with open(json_path, 'r') as f:
            textract_response = json.load(f)

        textract_blocks = textract_response.get('Blocks', [])

        print("Query responses:")
        for query_block in textract_blocks:
            if query_block['BlockType'] != 'QUERY':
                continue

            query_text = query_block.get('Query', {}).get('Text', 'Unknown query')

            # Within each ANSWER relationship, use the first referenced block
            # that carries 'Text'; a later ANSWER relationship overrides an
            # earlier one
            answer_text = "No answer found"
            for relationship in query_block.get('Relationships', []):
                if relationship['Type'] != 'ANSWER':
                    continue
                for answer_id in relationship['Ids']:
                    answer_block = next((b for b in textract_blocks if b['Id'] == answer_id), None)
                    if answer_block and 'Text' in answer_block:
                        answer_text = answer_block['Text']
                        break

            print(f"Q: {query_text}")
            print(f"A: {answer_text}")
            print()

## Error Analysis

In [None]:
# Summarize the errors collected during processing
if not errors:
    print("No errors to analyze!")
else:
    print("Error Analysis:")
    errors_df = pd.DataFrame(errors)
    print(errors_df.head(10))

    # Tally how often each distinct error message occurred
    error_types = errors_df['error'].value_counts()
    print("\nError type distribution:")
    print(error_types)