## Analyzes text in a document stored in an S3 bucket

Displays boxes around the elements that Amazon Textract detects in each document (for PDF files, only the first page is analyzed). The script draws the document image with the following colored bounding boxes:
- Red: KEY Block objects
- Green: VALUE Block objects
- Blue: TABLE Block objects
- Yellow: CELL Block objects
- Cyan: LINE objects
- Blue (filled): selected SELECTION_ELEMENT objects

In [None]:
import json
import io
import math
import sys
from pathlib import Path
from io import BytesIO

import boto3
import PIL
from PIL import Image, ImageDraw, ImageFont
from pdf2image import convert_from_path

# Name of the S3 bucket whose objects are analyzed by this notebook.
bucket_name = 'my-bucket'

# Working directories used by the cells below: 'output' receives the results,
# 'tmp' temporarily holds downloaded PDF files.
for directory in ('output', 'tmp'):
    Path(directory).mkdir(exist_ok=True)

In [None]:
def DrawDetectedElements(image: 'PIL.Image.Image', response: dict) -> 'PIL.Image.Image':
    """Draw the elements detected by Amazon Textract onto the image.

    The image is modified in place and the same object is returned.
    Bounding boxes are outlined by block type: KEY in red, VALUE in green,
    TABLE in blue, CELL in yellow and LINE in cyan. SELECTION_ELEMENT blocks
    get a blue polygon outline and, when selected, a filled blue box.

    Args:
        image (PIL image): A PIL image of the original document.
        response (dict): A dictionary returned from the Amazon Textract
            Python API; its 'Blocks' list is read.
    Return:
        image (PIL image): The PIL image with the detected elements drawn on it.
    """
    blocks = response['Blocks']
    width, height = image.size
    # One drawing context serves every block; it does not need to be rebuilt.
    draw = ImageDraw.Draw(image)

    # Block types that are simply outlined in a fixed color.
    outline_colors = {'TABLE': 'blue', 'CELL': 'yellow', 'LINE': 'cyan'}

    for block in blocks:
        block_type = block['BlockType']

        if block_type == 'KEY_VALUE_SET':
            # The first entity type tells whether this block is the key or the value.
            color = 'red' if block['EntityTypes'][0] == 'KEY' else 'green'
            ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, color)

        elif block_type in outline_colors:
            ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width, height,
                            outline_colors[block_type])

        elif block_type == 'SELECTION_ELEMENT':
            if block['SelectionStatus'] == 'SELECTED':
                ShowSelectedElement(draw, block['Geometry']['BoundingBox'], width, height, 'blue')

            # Polygon coordinates are ratios of the page size; scale them to pixels.
            # Collect every vertex first and draw the outline once, so that no
            # partial (1-, 2- or 3-point) polygon is ever drawn.
            points = [(width * vertex['X'], height * vertex['Y'])
                      for vertex in block['Geometry']['Polygon']]
            draw.polygon(points, outline='blue')

    return image

In [None]:
def ShowBoundingBox(draw: 'PIL.ImageDraw.ImageDraw', box: dict, width: int, height: int, boxColor: str) -> None:
    """Outline a bounding box on a given ImageDraw.

    Args:
        draw (PIL.ImageDraw.ImageDraw): Drawing context of the document image.
        box (dict): A dict which has 'Width', 'Height', 'Left', 'Top' as keys.
            The values are multiplied by the image size, i.e. they are
            treated as ratios of the image width/height.
        width, height (int): Width and height of the document image.
        boxColor (str): Color for the box outline.
    """
    # Scale the relative box to pixel coordinates.
    left = width * box['Left']
    top = height * box['Top']
    right = left + width * box['Width']
    bottom = top + height * box['Height']
    draw.rectangle([left, top, right, bottom], outline=boxColor)

In [None]:
def ShowSelectedElement(draw: 'PIL.ImageDraw.ImageDraw', box: dict, width: int, height: int, boxColor: str) -> None:
    """Draw a filled box for a selected element on a given ImageDraw.

    Args:
        draw (PIL.ImageDraw.ImageDraw): Drawing context of the document image.
        box (dict): A dict which has 'Width', 'Height', 'Left', 'Top' as keys.
            The values are multiplied by the image size, i.e. they are
            treated as ratios of the image width/height.
        width, height (int): Width and height of the document image.
        boxColor (str): Color used to fill the box.
    """
    # Scale the relative box to pixel coordinates.
    left = width * box['Left']
    top = height * box['Top']
    right = left + width * box['Width']
    bottom = top + height * box['Height']
    draw.rectangle([left, top, right, bottom], fill=boxColor)

In [None]:
# List the keys of every object stored in the bucket.
s3_connection = boto3.resource('s3')
bucket = s3_connection.Bucket(bucket_name)
objs = bucket.meta.client.list_objects_v2(Bucket=bucket.name)

# 'Contents' is absent from the response when nothing matches (e.g. an empty
# bucket), so fall back to an empty list instead of iterating over None.
filelist = [obj.get('Key') for obj in objs.get('Contents', [])]

# A single call returns only one page of keys; keep following the
# continuation token until the listing is complete.
while objs.get('IsTruncated'):
    objs = bucket.meta.client.list_objects_v2(
        Bucket=bucket.name,
        ContinuationToken=objs['NextContinuationToken'],
    )
    filelist.extend(obj.get('Key') for obj in objs.get('Contents', []))

print(filelist)

In [None]:
# For every object in the bucket: load it as an image, run Textract on it,
# save the raw response as JSON and save the annotated image as PNG.
s3_client = boto3.client('s3')
# One Textract client is enough for the whole run.
textract = boto3.client('textract')

for file in filelist:
    print('start text detection in ' + file)

    # Compare suffixes case-insensitively so keys such as 'SCAN.JPG' are handled.
    suffix = Path(file).suffix.lower()

    # Jpeg or png files are converted to binary image objects
    if suffix in ('.jpg', '.jpeg', '.png'):
        s3_object = s3_connection.Object(bucket_name, file)
        s3_response = s3_object.get()
        stream = io.BytesIO(s3_response['Body'].read())
        image = Image.open(stream)
        image_binary = stream.getvalue()

    # PDF files are converted to binary image objects
    elif suffix == '.pdf':
        # Use only the basename for the temporary copy so that keys with a
        # prefix ('dir/doc.pdf') do not require sub-directories under tmp/.
        tmp_path = Path('tmp') / Path(file).name
        s3_client.download_file(bucket_name, file, str(tmp_path))
        try:
            # Only the first page is analyzed, so only that page is rendered.
            image = convert_from_path(str(tmp_path), first_page=1, last_page=1)[0]
        finally:
            # Always remove the temporary copy, even when the conversion
            # fails, so that tmp/ can be removed at the end.
            tmp_path.unlink()

        output = io.BytesIO()
        image.save(output, format='JPEG')
        image_binary = output.getvalue()

    else:
        print('Unexpected file type found ' + file)
        continue

    # Analyze the document
    response = textract.analyze_document(Document={'Bytes': image_binary}, FeatureTypes=["TABLES", "FORMS"])

    # Save the response as json file
    response_path = 'output' / Path(file).with_suffix('.json')
    # Keys with a prefix map to sub-directories of output/; create them on demand.
    response_path.parent.mkdir(parents=True, exist_ok=True)
    with response_path.open(mode="w") as f:
        print('saving the response as ' + str(response_path))
        json.dump(response, f, indent=4)

    # Draw detected elements on the original file and save the file as png
    drawed_image = DrawDetectedElements(image, response)
    image_path = 'output' / Path(file).with_suffix('.png')
    drawed_image.save(image_path)
    print('saving the image file as ' + str(image_path))

Path('tmp').rmdir()