In [39]:
# Install required packages
#!pip install pdf2image requests openai pydantic

# Configuration options
CLOBBER = False  # Set to True to overwrite existing results
TEST = False     # Set to True to run test with limited documents
TEST_DOCS = 1  


import os
import re
import json
import base64
import requests
import time
from pdf2image import convert_from_path
from typing import Dict, Optional, List

def extract_page_from_pdf(pdf_path: str, page_number: int) -> Optional[bytes]:
    """Extract a specific page from PDF and convert to image"""
    try:
        images = convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
        if images:
            # Save to bytes
            from io import BytesIO
            img_byte_arr = BytesIO()
            images[0].save(img_byte_arr, format='JPEG')
            return img_byte_arr.getvalue()
        return None
    except Exception as e:
        print(f"Error extracting page {page_number} from {pdf_path}: {e}")
        return None

def encode_image(image_bytes: bytes) -> str:
    """Encode image bytes to base64 string"""
    return base64.b64encode(image_bytes).decode('utf-8')

def process_page_with_openai(image_bytes: bytes, api_key: str) -> Dict:
    """Send image to OpenAI API for processing"""
    base64_image = encode_image(image_bytes)
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }



    prompt = """Please analyze this front page of the document and extract the following information:
            - title
            - date
            - id_number (formatted similarly to `EPA-452/R-03-10`)
            For any of these items that you cannot identify in the image, please return 'NA'. Provide the results as a json file."""

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "high"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 1000
    }
    
    try:
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload
        )
        
        # Check if the request was successful
        response.raise_for_status()
        
        # Get the JSON response
        response_data = response.json()
        
        # Check if we have the expected structure
        if 'choices' not in response_data:
            print(f"Unexpected API response structure: {response_data}")
            if 'error' in response_data:
                print(f"API Error: {response_data['error']}")
            raise ValueError(f"API response missing 'choices': {response_data}")
            
        if not response_data['choices']:
            raise ValueError("API returned empty choices array")
            
        return response_data
        
    except requests.exceptions.RequestException as e:
        print(f"API request failed: {e}")
        if hasattr(e.response, 'text'):
            print(f"Response text: {e.response.text}")
        raise
    except json.JSONDecodeError as e:
        print(f"Failed to parse API response: {e}")
        print(f"Response text: {response.text}")
        raise
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise


def process_document(pdf_path: str, api_key: str, max_retries: int = 3) -> Dict:
    """Process a document by checking pages until all information is found or max pages reached"""
    results = {
        "title": "NA",
        "date": "NA",
        "id_number": "NA"
    }
    
    # Process first page
    page_image = extract_page_from_pdf(pdf_path, 1)
    if page_image:
        for attempt in range(max_retries):
            try:
                response = process_page_with_openai(page_image, api_key)
                content = response['choices'][0]['message']['content']
                
                try:
                    page_results = json.loads(content)
                except json.JSONDecodeError:
                    print(f"Failed to parse JSON from API response: {content}")
                    # Try to extract information using regex if JSON parsing fails
                    page_results = {}
                    patterns = {
                        'title': r'"title":\s*"([^"]*)",?',
                        'date': r'"date":\s*"([^"]*)",?',
                        'id_number': r'"id_number":\s*"([^"]*)",?'
                    }
                    for key, pattern in patterns.items():
                        match = re.search(pattern, content)
                        if match:
                            page_results[key] = match.group(1)
                
                # Update results from first page
                for key in results:
                    if page_results.get(key, "NA") != "NA":
                        results[key] = page_results[key]
                
                break  # Break retry loop if successful
                
            except Exception as e:
                print(f"Error on attempt {attempt + 1} for page 1 of {pdf_path}: {e}")
                if attempt == max_retries - 1:
                    print(f"Failed all {max_retries} attempts for page 1")
                else:
                    time.sleep(2 ** attempt)
                    continue
    
    # Check if we need to process more pages
    na_items = [key for key, value in results.items() if value == "NA"]
    if na_items:
        print(f"Still looking for: {', '.join(na_items)}")
        
        # Process subsequent pages only if we have NA items
        for page_num in range(2, 5):  # Check pages 2-4
            if not na_items:  # Stop if we've found everything
                break
                
            print(f"Checking page {page_num} for: {', '.join(na_items)}")
            page_image = extract_page_from_pdf(pdf_path, page_num)
            if not page_image:
                continue
                
            for attempt in range(max_retries):
                try:
                    response = process_page_with_openai(page_image, api_key)
                    content = response['choices'][0]['message']['content']
                    
                    try:
                        page_results = json.loads(content)
                    except json.JSONDecodeError:
                        print(f"Failed to parse JSON from API response: {content}")
                        page_results = {}
                        patterns = {
                            'title': r'"title":\s*"([^"]*)",?',
                            'date': r'"date":\s*"([^"]*)",?',
                            'id_number': r'"id_number":\s*"([^"]*)",?'
                        }
                        for key, pattern in patterns.items():
                            match = re.search(pattern, content)
                            if match:
                                page_results[key] = match.group(1)
                    
                    # Only update NA items
                    for key in na_items[:]:  # Create a copy to modify during iteration
                        if page_results.get(key, "NA") != "NA":
                            results[key] = page_results[key]
                            na_items.remove(key)
                            print(f"Found {key} on page {page_num}")
                    
                    break  # Break retry loop if successful
                    
                except Exception as e:
                    print(f"Error on attempt {attempt + 1} for page {page_num} of {pdf_path}: {e}")
                    if attempt == max_retries - 1:
                        print(f"Failed all {max_retries} attempts for page {page_num}")
                    else:
                        time.sleep(2 ** attempt)
                        continue
    
    return results
  # Number of documents to process per directory in test mode

# Load API key
with open('../../../../openai_eis_key.txt', encoding="utf-8") as f:
    api_key = f.read().strip()

# List of input directories containing PDFs
pdf_dirs = [
    '../../ria_documents/epa_corpus/epa_pdfs/',
    '../../ria_documents/epa_corpus/epa_air_pdfs/',
    '../../ria_documents/epa_corpus/epa_pdfs_eias/',
    '../../ria_documents/epa_corpus/epa_air_eias_pdfs/'
]
output_dir = '../../ria_documents/epa_corpus/processed_front_matter/'

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Keep track of processed files and errors
processed_files = 0
error_files = []
skipped_files = []

print("\nStarting processing with settings:")
print(f"CLOBBER: {CLOBBER} ({'Overwriting' if CLOBBER else 'Preserving'} existing results)")
print(f"TEST: {TEST} ({TEST_DOCS} documents per directory if enabled)")

# Process PDFs from all directories
for pdf_dir in pdf_dirs:
    print(f"\nProcessing files from directory: {pdf_dir}")
    
    # Skip if directory doesn't exist
    if not os.path.exists(pdf_dir):
        print(f"Directory not found: {pdf_dir}")
        continue
        
    # Process all PDFs in current directory
    pdf_files = [f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf')]
    
    # If in test mode, limit the number of files
    if TEST:
        pdf_files = pdf_files[:TEST_DOCS]
        print(f"TEST MODE: Processing {len(pdf_files)} files from {pdf_dir}")
    
    total_files = len(pdf_files)
    print(f"Found {total_files} PDF files to process in {pdf_dir}")
    
    for i, pdf_file in enumerate(pdf_files, 1):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        print(f"\nProcessing file {i}/{total_files}: {pdf_file}")
        
        # Extract just the final directory name for the output structure
        dir_name = os.path.basename(os.path.normpath(pdf_dir))
        specific_output_dir = os.path.join(output_dir, dir_name)
        os.makedirs(specific_output_dir, exist_ok=True)
        
        # Check if file has already been processed
        output_file = os.path.join(specific_output_dir, f"{re.sub('pdf$|PDF$', '.json',pdf_file)}")
        if os.path.exists(output_file) and not CLOBBER:
            print(f"Skipping {pdf_file} - already processed (use CLOBBER=True to overwrite)")
            skipped_files.append(pdf_file)
            continue
        
        try:
            # Process the document
            results = process_document(pdf_path, api_key)
            
            # Save results
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=4)
            
            print(f"Completed processing {pdf_file}")
            processed_files += 1
            
            # Add a small delay between files to avoid rate limiting
            time.sleep(1)
            
        except Exception as e:
            error_message = f"Error processing {pdf_file}: {str(e)}"
            print(error_message)
            error_files.append((pdf_file, str(e)))
            continue
        
        # Print progress
        if i % 10 == 0 or i == total_files:
            print(f"\nProgress update for {dir_name}:")
            print(f"Processed {processed_files} files successfully")
            print(f"Skipped {len(skipped_files)} existing files")
            print(f"Encountered errors in {len(error_files)} files")
            print(f"Current directory: {i}/{total_files} files processed")

# Print final summary
print("\n" + "="*50)
print("PROCESSING COMPLETE")
print("="*50)
print(f"Mode: {'TEST' if TEST else 'FULL'}")
print(f"Clobber: {'ON' if CLOBBER else 'OFF'}")
print(f"\nSuccessfully processed: {processed_files} files")
print(f"Skipped existing files: {len(skipped_files)}")
print(f"Errors encountered: {len(error_files)}")

# Save detailed report
report_path = os.path.join(output_dir, 'processing_report.json')
report = {
    'configuration': {
        'test_mode': TEST,
        'test_docs_per_directory': TEST_DOCS if TEST else None,
        'clobber': CLOBBER,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    },
    'statistics': {
        'total_processed': processed_files,
        'total_skipped': len(skipped_files),
        'total_errors': len(error_files)
    },
    'skipped_files': skipped_files,
    'error_files': [{
        'file': file,
        'error': error
    } for file, error in error_files]
}

with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(report, f, ensure_ascii=False, indent=4)
print(f"\nDetailed report saved to: {report_path}")


Starting processing with settings:
CLOBBER: False (Preserving existing results)
TEST: False (1 documents per directory if enabled)

Processing files from directory: ../../ria_documents/epa_corpus/epa_pdfs/
Found 322 PDF files to process in ../../ria_documents/epa_corpus/epa_pdfs/

Processing file 1/322: ZyPDF.cgi?Dockey=P100TFH1.PDF
Skipping ZyPDF.cgi?Dockey=P100TFH1.PDF - already processed (use CLOBBER=True to overwrite)

Processing file 2/322: P10175J2.PDF?Dockey=P10175J2.PDF
Skipping P10175J2.PDF?Dockey=P10175J2.PDF - already processed (use CLOBBER=True to overwrite)

Processing file 3/322: ZyPDF.cgi?Dockey=P100SFJ0.PDF
Skipping ZyPDF.cgi?Dockey=P100SFJ0.PDF - already processed (use CLOBBER=True to overwrite)

Processing file 4/322: ZyPDF.cgi?Dockey=P1004LNN.PDF
Skipping ZyPDF.cgi?Dockey=P1004LNN.PDF - already processed (use CLOBBER=True to overwrite)

Processing file 5/322: ZyPDF.cgi?Dockey=P100U02B.PDF
Skipping ZyPDF.cgi?Dockey=P100U02B.PDF - already processed (use CLOBBER=True t