# Textract EDS Adapter

This notebook processes PDF documents using AWS Textract with a custom adapter. It:
1. Reads the zero-shot results CSV to identify documents with forms
2. Extracts the lowest page number from each document's form_pages
3. Sends that single page of each document (extracted as a one-page PDF) to Textract using the custom adapter


In [None]:
import base64
import json
import os
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, List, Optional

import boto3
import pandas as pd
import PyPDF2

## Configuration

In [None]:
# Input locations (relative to the notebook's working directory)
CSV_PATH = "../../code/preprocessing/zero_shot_results_full_corpus.csv"  # CSV read in the load step
CONTRACTS_DIR = "../../data/raw/_contracts/"  # directory the PDF filenames are joined onto

# AWS Textract settings -- both values below are placeholders to edit
REGION_NAME = "us-east-1"  # Update with your region
CUSTOM_ADAPTER_ID = "your-custom-adapter-id"  # Update with your custom adapter ID

# One shared Textract client, reused by every request in this notebook
textract_client = boto3.client(service_name='textract', region_name=REGION_NAME)

## Load and Filter Data

In [None]:
# Read the results CSV into a DataFrame
df = pd.read_csv(CSV_PATH)
print(f"Total documents in CSV: {len(df)}")

# Keep only the rows flagged as containing a form; take a copy so that
# columns added later do not write back into df
has_form = df['contains_form'].eq(True)
forms_df = df.loc[has_form].copy()
print(f"Documents with forms: {len(forms_df)}")

# Preview the columns that drive the rest of the notebook
preview_columns = ['filename', 'form_pages', 'num_form_pages']
print("\nSample of documents with forms:")
print(forms_df[preview_columns].head())

## Helper Functions

In [None]:
def get_lowest_page_number(form_pages_str: str) -> Optional[int]:
    """
    Extract the lowest page number from a form_pages value.

    Args:
        form_pages_str: Page numbers as a single value or a comma-separated
            list (e.g., "1,2,4,10,15,16" or "5"). Numeric cells such as
            5 or 5.0 are accepted as well, because pandas yields floats for
            a numeric column that contains missing values.

    Returns:
        Optional[int]: The lowest page number, or None when the value is
        missing, empty, or contains no page tokens.

    Raises:
        ValueError: If a token is not a whole number (e.g. "abc" or "2.5").
    """
    if pd.isna(form_pages_str):
        return None

    page_numbers = []
    for token in str(form_pages_str).split(','):
        token = token.strip()
        if not token:
            # Tolerate blank input and stray separators such as "1,2,"
            continue
        try:
            page = int(token)
        except ValueError:
            # Accept whole-number floats like "5.0"; float() itself raises
            # ValueError for genuinely non-numeric tokens
            as_float = float(token)
            if not as_float.is_integer():
                raise ValueError(f"Invalid page number: {token!r}")
            page = int(as_float)
        page_numbers.append(page)

    return min(page_numbers) if page_numbers else None

def extract_single_page_pdf(pdf_path: str, page_number: int) -> bytes:
    """
    Extract a single page from a PDF file as a standalone one-page PDF.

    Args:
        pdf_path: Path to the PDF file
        page_number: Page number to extract (1-indexed)

    Returns:
        bytes: PDF content of the single page

    Raises:
        ValueError: If page_number is less than 1.
        IndexError: If page_number is greater than the document's page count.
    """
    # Reject 0 and negatives up front: page_number - 1 would otherwise become
    # a negative index and silently select a page counted from the end.
    if page_number < 1:
        raise ValueError(f"page_number must be >= 1, got {page_number}")

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        total_pages = len(pdf_reader.pages)
        if page_number > total_pages:
            raise IndexError(
                f"page_number {page_number} is out of range; "
                f"document has {total_pages} page(s)"
            )

        pdf_writer = PyPDF2.PdfWriter()
        # PyPDF2 uses 0-based indexing, so subtract 1
        pdf_writer.add_page(pdf_reader.pages[page_number - 1])

        # Serialise while the source file is still open
        output_buffer = BytesIO()
        pdf_writer.write(output_buffer)
        return output_buffer.getvalue()

def process_with_textract(
    pdf_bytes: bytes,
    adapter_id: str,
    adapter_version: str = '1',
) -> Optional[Dict[str, Any]]:
    """
    Process a PDF page with Textract using a custom adapter.

    Args:
        pdf_bytes: PDF content as bytes
        adapter_id: Custom adapter ID
        adapter_version: Adapter version to use; defaults to '1', the value
            that was previously hard-coded.

    Returns:
        Optional[dict]: The Textract response, or None if the request raised
        an exception (the error is printed, not re-raised).
    """
    # NOTE(review): AWS documents adapters in connection with the QUERIES
    # feature type -- confirm that FORMS/TABLES without a QueriesConfig is
    # accepted together with AdaptersConfig for this adapter.
    adapters_config = {
        'Adapters': [
            {
                'AdapterId': adapter_id,
                'Version': adapter_version,
            }
        ]
    }

    try:
        return textract_client.analyze_document(
            Document={'Bytes': pdf_bytes},
            FeatureTypes=['FORMS', 'TABLES'],
            AdaptersConfig=adapters_config,
        )
    except Exception as e:
        # Deliberate best-effort: callers treat None as "this page failed"
        # and continue with the next document.
        print(f"Error processing with Textract: {str(e)}")
        return None

## Process Documents

In [None]:
# Compute the lowest form page per document and store it as a new column
lowest_pages = forms_df['form_pages'].apply(get_lowest_page_number)
forms_df['lowest_page'] = lowest_pages

# Discard documents with no usable page number, then make the column integer
forms_df = forms_df.dropna(subset=['lowest_page'])
forms_df = forms_df.astype({'lowest_page': int})

print(f"Documents with valid page numbers: {len(forms_df)}")
print("\nSample with lowest page numbers:")
sample_columns = ['filename', 'form_pages', 'lowest_page']
print(forms_df[sample_columns].head(10))

In [None]:
# Process each document: extract its lowest form page and send it to Textract.
# Successes accumulate in `results`, failures in `errors`.
results = []
errors = []

# Only two columns are needed, so iterate them directly rather than
# materialising a full row Series (and an unused index) per document.
for filename, lowest_page in zip(forms_df['filename'], forms_df['lowest_page']):
    # Normalise to a built-in int so the value is safe to JSON-serialise later
    lowest_page = int(lowest_page)

    pdf_path = os.path.join(CONTRACTS_DIR, filename)

    # Check if file exists
    if not os.path.exists(pdf_path):
        error_msg = f"File not found: {filename}"
        print(error_msg)
        errors.append({'filename': filename, 'error': error_msg})
        continue

    try:
        print(f"Processing {filename}, page {lowest_page}...")

        # Extract the specific page
        page_pdf_bytes = extract_single_page_pdf(pdf_path, lowest_page)

        # Process with Textract (returns None when the request failed)
        textract_response = process_with_textract(page_pdf_bytes, CUSTOM_ADAPTER_ID)
    except Exception as e:
        # Per-document boundary: record the failure and keep going
        error_msg = f"Error processing {filename}: {str(e)}"
        print(f"✗ {error_msg}")
        errors.append({'filename': filename, 'error': error_msg})
        continue

    if not textract_response:
        error_msg = f"Textract processing failed for {filename}"
        print(f"✗ {error_msg}")
        errors.append({'filename': filename, 'error': error_msg})
        continue

    results.append({
        'filename': filename,
        'page_number': lowest_page,
        'textract_response': textract_response,
        'status': 'success',
    })
    print(f"✓ Successfully processed {filename}")

print("\nProcessing complete!")
print(f"Successfully processed: {len(results)} documents")
print(f"Errors: {len(errors)} documents")

## Save Results

In [None]:
# Create output directory if it doesn't exist
output_dir = Path("textract_results")
output_dir.mkdir(exist_ok=True)

summary_path = output_dir / 'processing_summary.json'

# Save successful results
if results:
    processed_files = []

    # Save one JSON file per result; the filename is built once and reused
    # for the summary so the two can never disagree.
    for result in results:
        filename_base = Path(result['filename']).stem
        json_filename = f"{filename_base}_page_{result['page_number']}_textract.json"

        with open(output_dir / json_filename, 'w', encoding='utf-8') as f:
            # default=str stringifies any value json cannot encode natively
            json.dump(result['textract_response'], f, indent=2, default=str)

        processed_files.append({
            'filename': result['filename'],
            # int() keeps the summary serialisable even for numpy integers
            'page_number': int(result['page_number']),
            'json_file': json_filename,
        })

    # Save summary of all results
    summary = {
        'processed_count': len(results),
        'error_count': len(errors),
        'processed_files': processed_files,
    }

    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

# Save errors if any
if errors:
    errors_df = pd.DataFrame(errors)
    errors_df.to_csv(output_dir / 'processing_errors.csv', index=False)
    print(f"Errors saved to: {output_dir / 'processing_errors.csv'}")

print(f"Results saved to: {output_dir}")
print(f"Individual JSON files: {len(results)}")
# The summary is only written when at least one document succeeded, so only
# announce it in that case.
if results:
    print("Summary file: processing_summary.json")

## Display Sample Results

In [None]:
# Display summary statistics and a text preview of the first successful result
if results:
    print("Processing Summary:")
    print(f"- Total documents processed: {len(results)}")
    print(f"- Total errors: {len(errors)}")
    # results is non-empty here, so the denominator cannot be zero
    print(f"- Success rate: {len(results)/(len(results)+len(errors))*100:.1f}%")

    # Show sample of extracted text from first result
    sample_result = results[0]
    print(f"\nSample extraction from {sample_result['filename']} (page {sample_result['page_number']}):")

    # Extract text blocks from Textract response
    textract_blocks = sample_result['textract_response'].get('Blocks', [])
    text_blocks = [
        block.get('Text', '')
        for block in textract_blocks
        if block.get('BlockType') == 'LINE'
    ]

    print("First 10 lines of detected text:")
    for i, text in enumerate(text_blocks[:10], start=1):
        print(f"{i:2d}: {text}")

    # Show key-value pairs if detected. Textract emits two KEY_VALUE_SET
    # blocks per pair (one KEY, one VALUE), so count only the KEY blocks to
    # avoid reporting twice the real number of pairs.
    key_blocks = [
        block for block in textract_blocks
        if block.get('BlockType') == 'KEY_VALUE_SET'
        and 'KEY' in block.get('EntityTypes', [])
    ]
    if key_blocks:
        print(f"\nDetected {len(key_blocks)} key-value pairs in the form.")
else:
    print("No results to display.")

## Error Analysis

In [None]:
# Summarise failures: show the first few and how often each message occurs
if not errors:
    print("No errors to analyze!")
else:
    print("Error Analysis:")
    errors_df = pd.DataFrame(errors)
    print(errors_df.head(10))

    # Frequency of each distinct error message
    error_types = errors_df['error'].value_counts()
    print("\nError type distribution:")
    print(error_types)