In [1]:
import boto3
import time
import json
import logging
import os

In [2]:
# Configure the root logger: INFO and above, formatted as
# "<timestamp> - <LEVEL> - <message>". logging.basicConfig leaves an
# already-configured root logger (one that has handlers) untouched.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [9]:
# --- CONFIGURATION ---
# Values a reader is expected to adjust before running the notebook.
# S3 bucket that is searched for PDF files.
S3_BUCKET_NAME = "resume-parser-waijian-20250525"
# Let's target one specific folder first
S3_PREFIX = "resume/ACCOUNTANT/"
# NOTE(review): not referenced by any cell visible in this notebook —
# presumably intended for a Textract role/notification setup; confirm or remove.
TEXTRACT_ROLE_ARN = 'arn:aws:iam::747549824523:role/TextractS3AccessRole'
# Region passed to both the S3 and the Textract client.
AWS_REGION = "ap-southeast-1"
# --- END CONFIGURATION ---

In [4]:
# Build the S3 and Textract clients in one pass; both are bound to AWS_REGION
# and are read as module-level globals by the helper functions in later cells.
s3_client, textract_client = (
    boto3.client(service_name, region_name=AWS_REGION)
    for service_name in ('s3', 'textract')
)

# Record which bucket/prefix this run is pointed at.
logging.info(f"Setup complete. Bucket: {S3_BUCKET_NAME}, Prefix: {S3_PREFIX}")

2025-05-26 01:18:00,346 - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2025-05-26 01:18:00,414 - INFO - Setup complete. Bucket: resume-parser-waijian-20250525, Prefix: resume/ACCOUNTANT/


In [5]:
def _iter_pdf_keys(bucket, prefix):
    """Yield each key under ``prefix`` in ``bucket`` that ends in ``.pdf`` (case-insensitive)."""
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # A page without matching objects has no 'Contents' entry.
        for item in page.get('Contents', []):
            key = item['Key']
            if key.lower().endswith('.pdf'):
                yield key


def list_s3_files(bucket, prefix, max_keys=10):
    """Lists PDF files in an S3 bucket under a given prefix.

    Args:
        bucket: Name of the S3 bucket to search.
        prefix: Key prefix ("folder") to search under.
        max_keys: Maximum number of PDF keys to return.

    Returns:
        A list of at most ``max_keys`` object keys ending in ``.pdf``
        (case-insensitive). Returns an empty list if the listing fails;
        the error is logged rather than raised.
    """
    files = []
    try:
        # The limit is applied AFTER the .pdf filter. Passing it to S3 as
        # MaxKeys would cap the number of objects listed, so any non-PDF keys
        # under the prefix would use up the quota and hide PDFs that exist.
        if max_keys > 0:
            for key in _iter_pdf_keys(bucket, prefix):
                files.append(key)
                if len(files) >= max_keys:
                    break  # stop paginating as soon as we have enough
        logging.info(f"Found {len(files)} PDFs in S3 under {prefix} (showing max {max_keys}).")
        return files
    except Exception as e:
        # Best-effort helper for interactive use: report and return nothing.
        logging.error(f"Error listing S3 files: {e}")
        return []

# Keep the first run small: ask for a limit of 5.
pdf_files_to_process = list_s3_files(S3_BUCKET_NAME, S3_PREFIX, max_keys=5)
print("Files to process:", pdf_files_to_process, sep="\n")

2025-05-26 01:22:54,231 - INFO - Found 5 PDFs in S3 under resume/ACCOUNTANT/ (showing max 5).


Files to process:
['resume/ACCOUNTANT/10554236.pdf', 'resume/ACCOUNTANT/10674770.pdf', 'resume/ACCOUNTANT/11163645.pdf', 'resume/ACCOUNTANT/11759079.pdf', 'resume/ACCOUNTANT/12065211.pdf']


In [16]:
def start_textract_job(bucket, document_key):
    """Submit one S3 object to Textract's asynchronous text-detection API.

    Args:
        bucket: Name of the S3 bucket holding the document.
        document_key: Key of the document inside ``bucket``.

    Returns:
        The JobId from the Textract response, or None when ``document_key``
        is empty or the request raises (the error is logged).
    """
    if not document_key:
        logging.error("No document key provided.")
        return None

    logging.info(f"Starting Textract job for: {document_key}")
    s3_object = {'Bucket': bucket, 'Name': document_key}
    try:
        started = textract_client.start_document_text_detection(
            DocumentLocation={'S3Object': s3_object}
        )
        new_job_id = started['JobId']
        logging.info(f"Started Job with ID: {new_job_id}")
        return new_job_id
    except Exception as exc:
        logging.error(f"Error starting Textract job for {document_key}: {exc}")
        return None

# Kick off Textract for only the first listed PDF; current_job_id stays None
# when the listing came back empty.
current_job_id = None
if not pdf_files_to_process:
    logging.warning("No PDFs found to process.")
else:
    first_pdf = pdf_files_to_process[0]
    current_job_id = start_textract_job(S3_BUCKET_NAME, first_pdf)

2025-05-26 01:34:22,662 - INFO - Starting Textract job for: resume/ACCOUNTANT/10554236.pdf
2025-05-26 01:34:24,070 - INFO - Started Job with ID: 2491be29f3283a5ac36b2ae181dc30c4b850d7971040eee5d9aac3f0451c28ab


In [18]:
def check_textract_job_status(job_id):
    """Look up the current status of a Textract text-detection job.

    Args:
        job_id: JobId of the job to query.

    Returns:
        The 'JobStatus' value from the Textract response; None when
        ``job_id`` is empty; the string "FAILED" when the request raises.
    """
    if not job_id:
        logging.error("No Job ID provided.")
        return None

    logging.info(f"Checking status for Job ID: {job_id}")
    try:
        job_info = textract_client.get_document_text_detection(JobId=job_id)
        current = job_info['JobStatus']
        logging.info(f"Current Status: {current}")
        return current
    except Exception as exc:
        logging.error(f"Error checking job status {job_id}: {exc}")
        # A request error is reported to the caller the same way as a failed job.
        return "FAILED"

def wait_for_job_completion(job_id, delay=5, timeout=300):
    """Waits for a Textract job to complete by polling.

    Args:
        job_id: JobId of the Textract job to poll. A falsy value returns
            False immediately without polling.
        delay: Seconds to pause between status checks.
        timeout: Maximum total seconds to keep polling.

    Returns:
        True only when the job reports SUCCEEDED. False when it reports
        FAILED or PARTIAL_SUCCESS (check_textract_job_status also reports
        request errors as "FAILED"), when ``timeout`` elapses first, or when
        no ``job_id`` is given.
    """
    if not job_id:
        return False

    # Use the monotonic clock for the deadline: wall-clock time can jump
    # (NTP sync, manual changes), which would stretch or cut short the timeout.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        status = check_textract_job_status(job_id)
        if status == 'SUCCEEDED':
            logging.info(f"Job {job_id} SUCCEEDED!")
            return True
        if status in ('FAILED', 'PARTIAL_SUCCESS'):
            logging.error(f"Job {job_id} finished with status: {status}")
            return False

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline, so the total wait honours `timeout`
        # instead of overshooting it by up to `delay` seconds.
        pause = min(delay, remaining)
        logging.info(f"Waiting for {pause:g} seconds...")
        time.sleep(pause)

    logging.error(f"Job {job_id} timed out after {timeout} seconds.")
    return False


# Block until the job started above finishes; with no job there is nothing to wait for.
if not current_job_id:
    job_succeeded = False
    logging.warning("No job was started.")
else:
    job_succeeded = wait_for_job_completion(current_job_id)

2025-05-26 01:37:32,109 - INFO - Checking status for Job ID: 2491be29f3283a5ac36b2ae181dc30c4b850d7971040eee5d9aac3f0451c28ab
2025-05-26 01:37:32,768 - INFO - Current Status: SUCCEEDED
2025-05-26 01:37:32,769 - INFO - Job 2491be29f3283a5ac36b2ae181dc30c4b850d7971040eee5d9aac3f0451c28ab SUCCEEDED!


In [20]:
def get_textract_results(job_id):
    """Collect the blocks from every result page of a Textract job.

    Follows 'NextToken' from page to page until a response carries none.

    Args:
        job_id: JobId of the Textract job whose results to fetch.

    Returns:
        A list with the 'Blocks' entries of all pages concatenated in order,
        or None when ``job_id`` is empty or any request raises (logged).
    """
    if not job_id:
        return None

    logging.info(f"Retrieving results for Job ID: {job_id}")
    collected = []
    request = {'JobId': job_id}

    try:
        while True:
            page = textract_client.get_document_text_detection(**request)
            collected.extend(page.get('Blocks', []))

            token = page.get('NextToken')
            if not token:
                break  # last page reached
            # Ask for the following page on the next iteration.
            request['NextToken'] = token

        logging.info(f"Retrieved {len(collected)} blocks for Job ID: {job_id}")
        return collected

    except Exception as exc:
        logging.error(f"Error retrieving results for {job_id}: {exc}")
        return None

def extract_text_from_blocks(blocks):
    """Join the text of every LINE block into one newline-separated string.

    Args:
        blocks: Iterable of Textract block dicts; each must have a
            'BlockType' key, and LINE blocks must have a 'Text' key.

    Returns:
        The 'Text' of each block whose 'BlockType' is 'LINE', in input order,
        joined with "\\n". An empty string when ``blocks`` is empty or None.
    """
    if not blocks:
        return ""

    return "\n".join(
        entry['Text'] for entry in blocks if entry['BlockType'] == 'LINE'
    )

# Fetch the finished job's blocks and preview the extracted text.
if not (job_succeeded and current_job_id):
    logging.warning("Job did not succeed or no job was run. Cannot get results.")
else:
    textract_blocks = get_textract_results(current_job_id)

    if not textract_blocks:
        logging.error("Failed to retrieve Textract blocks.")
    else:
        # Optional: Save the full JSON for inspection
        # with open(f'{current_job_id}.json', 'w') as f:
        #     json.dump(textract_blocks, f, indent=4)
        # logging.info(f"Saved full JSON to {current_job_id}.json")

        # Show only the first 1000 characters so the cell output stays short.
        extracted_text = extract_text_from_blocks(textract_blocks)
        print(
            "\n--- EXTRACTED TEXT (First 1000 chars) ---",
            extracted_text[:1000],
            "----------------------------------------",
            sep="\n",
        )

2025-05-26 01:42:59,275 - INFO - Retrieving results for Job ID: 2491be29f3283a5ac36b2ae181dc30c4b850d7971040eee5d9aac3f0451c28ab
2025-05-26 01:43:00,283 - INFO - Retrieved 3732 blocks for Job ID: 2491be29f3283a5ac36b2ae181dc30c4b850d7971040eee5d9aac3f0451c28ab



--- EXTRACTED TEXT (First 1000 chars) ---
ACCOUNTANT
Summary
Financial Accountant specializing in financial planning, reporting and analysis within the Department of Defense.
Highlights
Account reconciliations
Results-oriented
Accounting operations professional
Financial reporting
Analysis of financial systems
Critical thinking
ERP (Enterprise Resource Planning) software.
Excellent facilitator
Accomplishments
Served on a tiger team which identified and resolved General Ledger postings in DEAMS totaling $360B in accounting adjustments. This allowed
for the first successful fiscal year-end close for 2012.
In collaboration with DFAS Europe, developed an automated tool that identified duplicate obligations. This tool allowed HQ USAFE to
deobligate over $5M in duplicate obligations.
Experience
Company Name July 2011 to November 2012 Accountant
City, State
Enterprise Resource Planning Office (ERO)
In this position as an Accountant assigned to the Defense Enterprise Accounting and Management