In [None]:
!pip install --force-reinstall amazon-textract-textractor==1.7.1
!pip install pillow
!pip install pymupdf

<img src="images/classifier.png" width="800" height="700"/>

In [2]:
import base64
import json
import os
import random
import re
import time

import boto3
import pandas as pd
from botocore.config import Config
from botocore.exceptions import ClientError
from PIL import Image
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.visualizers.entitylist import EntityList
# Create the bedrock runtime to invoke LLM
# Client-side settings for the Bedrock runtime client.
config = Config(
    # Maximum time (secs) the client waits for data to be received from the server.
    read_timeout=600,
    # Maximum number of retry attempts that will be made on a single request.
    retries={"max_attempts": 5},
)
region = "us-east-1"
bedrock_runtime = boto3.client(
    service_name='bedrock-runtime',
    region_name=region,
    config=config,
)

Utility function to invoke Bedrock with retries (exponential backoff when the request is throttled)

In [3]:
def _invoke_bedrock_with_retries(system_message, prompt, model_id,image_path=None):
    """Call ``bedrock_claude_`` and retry with capped exponential backoff when throttled.

    Args:
        system_message: System prompt forwarded to ``bedrock_claude_``.
        prompt: User prompt forwarded to ``bedrock_claude_``.
        model_id: Bedrock model identifier forwarded to ``bedrock_claude_``.
        image_path: Optional image path or list of paths forwarded to ``bedrock_claude_``.

    Returns:
        The ``(response, input_token, output_token)`` tuple produced by ``bedrock_claude_``.

    Raises:
        ClientError: Immediately for any error code other than ``ThrottlingException``,
            or for ``ThrottlingException`` once ``max_retries`` retries have been used.
    """
    max_retries = 5
    backoff_base = 2
    max_backoff = 3  # Maximum backoff time in seconds
    retries = 0
    while True:
        try:
            response, input_token, output_token = bedrock_claude_(system_message, prompt,model_id,image_path)
            return response, input_token, output_token
        except ClientError as e:
            error_code = e.response.get('Error', {}).get('Code')
            # Anything that is not throttling, or throttling after the retry budget
            # is spent, is re-raised with its original traceback.
            if error_code != 'ThrottlingException' or retries >= max_retries:
                raise
            # Throttling: exponential backoff with up to 1s of random jitter, capped at max_backoff.
            sleep_time = min(max_backoff, backoff_base ** retries + random.uniform(0, 1))
            time.sleep(sleep_time)
            retries += 1

Utility functions to call Bedrock Claude model

In [4]:
def bedrock_streemer(response):
    """Consume a Bedrock response stream, echoing text as it arrives.

    Each event's ``chunk`` bytes are decoded as JSON. Text found under
    ``delta.text`` is printed and accumulated; token counts are read from the
    ``amazon-bedrock-invocationMetrics`` entry when a chunk carries one.

    Args:
        response: Mapping whose ``body`` entry is an iterable of stream events.

    Returns:
        ``(answer, input_tokens, output_tokens)``. The token counts are 0 when the
        response has no body or no chunk carried invocation metrics.
    """
    stream = response.get('body')
    answer = ""
    # Initialise the counters so the return statement is always valid, even when
    # the stream is missing/empty or never delivers a metrics chunk.
    input_tokens = 0
    output_tokens = 0
    if stream:
        for event in stream:
            chunk = event.get('chunk')
            if not chunk:
                continue
            chunk_obj = json.loads(chunk.get('bytes').decode())
            if "delta" in chunk_obj:
                delta = chunk_obj['delta']
                if "text" in delta:
                    text = delta['text']
                    print(text, end="")
                    answer += str(text)
            if "amazon-bedrock-invocationMetrics" in chunk_obj:
                metrics = chunk_obj['amazon-bedrock-invocationMetrics']
                input_tokens = metrics['inputTokenCount']
                output_tokens = metrics['outputTokenCount']
                print(f"\nInput Tokens: {input_tokens}\nOutput Tokens: {output_tokens}")
    return answer,input_tokens, output_tokens

def bedrock_claude_(system_message, prompt,model_id,image_path=None):
    """Send a prompt, optionally with images, to a Claude model on Bedrock and stream the reply.

    The user message starts with ``prompt`` as a text item. For every image, a text
    item holding the image's base name and a base64 image item are appended.

    Args:
        system_message: Text placed in the request's ``system`` field.
        prompt: User text placed first in the message content.
        model_id: Value passed as ``modelId`` to the Bedrock runtime client.
        image_path: Optional path or list of paths. A path matching ``s3://bucket/key``
            is downloaded from S3; anything else is opened as a local file.

    Returns:
        ``(answer, input_tokens, output_tokens)`` as returned by ``bedrock_streemer``.
    """
    content=[{
        "type": "text",
        "text": prompt
            }]
    if image_path:
        if not isinstance(image_path, list):
            image_path=[image_path]
        s3 = None  # created on first use, and only when an image actually lives in S3
        for img in image_path:
            image_name=os.path.basename(img)
            # Normalise the extension before comparing so ".JPG" is handled like ".jpg";
            # the media type sent for JPEG files must be "image/jpeg", not "image/jpg".
            ext=os.path.splitext(image_name)[1].lower().replace('.','')
            if ext == "jpg":
                ext="jpeg"
            match = re.match("s3://(.+?)/(.+)", img)
            if match:
                if s3 is None:
                    s3 = boto3.client('s3')
                obj = s3.get_object(Bucket=match.group(1), Key=match.group(2))
                binary_data = obj['Body'].read()
            else:
                with open(img, "rb") as image_file:
                    binary_data = image_file.read()
            base64_string = base64.b64encode(binary_data).decode('utf-8')
            content.extend([{"type":"text","text":image_name},{
              "type": "image",
              "source": {
                "type": "base64",
                "media_type": f"image/{ext}",
                "data": base64_string
              }
            }])

    # Keep the request body under its own name instead of overwriting `prompt`.
    request_body = json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2500,
        "temperature": 0.5,
        "system":system_message,
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ]
    })
    response = bedrock_runtime.invoke_model_with_response_stream(body=request_body, modelId=model_id, accept="application/json", contentType="application/json")
    answer,input_tokens,output_tokens=bedrock_streemer(response)
    return answer, input_tokens, output_tokens

Use the [Amazon Textract Linearization Library](https://aws-samples.github.io/amazon-textract-textractor/examples.html) to call Textract and convert the results into properly ordered text.

In [5]:
from textractor.data.text_linearization_config import TextLinearizationConfig

# Options passed to get_text(config=...) when turning Textract results into text.
configs = TextLinearizationConfig(
    # Wrap tables in explicit tags.
    table_prefix="<table>",
    table_suffix="</table>",
    # Layout toggles: header is not hidden; figure, footer and page-number layouts are.
    hide_header_layout=False,
    hide_figure_layout=True,
    hide_footer_layout=True,
    hide_page_num_layout=True,
)

In [6]:
use_claude_image=False #Use Claude 3 Image process as opposed to Textract

doc_path="docs" # local path to documents
# os.listdir returns names in arbitrary order; sort them so documents are processed
# (and therefore appear in the prompt) in a reproducible order.
files=sorted(os.listdir(doc_path))
# S3 location to store pdf if passing local pdf file to Textract api call
# (placeholder: replace with the name of a real bucket before running the PDF branch)
textract_output_bucket="BUCKET NAME"

if not use_claude_image:
    # Use the `region` already configured for the Bedrock client instead of repeating the literal.
    extractor = Textractor(region_name=region)
    # Classify each page in a PDF (True) or treat the entire PDF as a class (False)
    label_per_page=True
    # Collect one tagged section per document/page and join once at the end.
    doc_parts=[]
    for file in files:
        file_name=os.path.basename(file)
        lower_name=file.lower()
        if lower_name.endswith(('.png', '.jpg', '.jpeg')):
            # Images go through the synchronous API.
            document = extractor.analyze_document(
                file_source=f"{doc_path}/{file}",
                features=[TextractFeatures.LAYOUT,],#TextractFeatures.FORMS,TextractFeatures.TABLES],
                save_image=False,
            )
            doc_parts.append(f"<{file_name}>\n{document.get_text(config=configs)}\n</{file_name}>\n")
        elif lower_name.endswith('.pdf'):
            # PDFs go through the asynchronous API; the local file is uploaded to S3 first.
            document = extractor.start_document_analysis(
                    file_source=f"{doc_path}/{file}",
                    features=[TextractFeatures.LAYOUT,],#TextractFeatures.TABLES,TextractFeatures.FORMS],
                    save_image=False,
                    s3_upload_path = f"s3://{textract_output_bucket}/textract/" # Comment this line if you are passing pdf from s3
                )
            if label_per_page:
                # One tagged section per page, named <file>_Page<N> with N starting at 1.
                for page in range(len(document.pages)):
                    page_tag=f"{file_name}_Page{str(page+1)}"
                    doc_parts.append(f"<{page_tag}>\n{document.pages[page].get_text(config=configs)}\n</{page_tag}>\n")
            else:
                doc_parts.append(f"<{file_name}>\n{document.get_text(config=configs)}\n</{file_name}>\n")
    doc="".join(doc_parts)


In [7]:
# Manifest of candidate labels: class name -> short description of that document type
possible_class = dict([
    ("drivers license", "This is a US drivers license"),
    ("W2", "This is a tax reporting form"),
    ("Bank Statement", "This is personal bank document"),
    ("PayStub", "This is an individual's pay info"),
])

In [8]:
image_list=[]
# System Prompt (explicit encoding so the templates read the same on every platform)
with open("prompt/system.txt","r",encoding="utf-8") as f:
    system_template=f.read()
labels_json=json.dumps(possible_class)
if use_claude_image:
    # Claude 3 image prompt: the images themselves are passed alongside the prompt
    with open("prompt/sorter_image.txt","r",encoding="utf-8") as f:
        prompt=f.read()
    prompt=prompt.replace("{label}",labels_json)
    image_list=[f"{doc_path}/{x}" for x in files if x.lower().endswith(('.png', '.jpg', '.jpeg'))]
else:
    # Textract with Claude 3 prompt
    with open("prompt/sorter.txt","r",encoding="utf-8") as f:
        prompt=f.read()
    # Fill {label} before {doc}: the extracted document text is inserted last, so a
    # literal "{label}" appearing inside a document is left untouched.
    prompt=prompt.replace("{label}",labels_json).replace("{doc}",doc)

In [9]:
# Bedrock model used for classification
modell="anthropic.claude-3-sonnet-20240229-v1:0"
# Run the classification prompt; returns the reply text plus input/output token counts
response, iput_t, output_t=_invoke_bedrock_with_retries(
    system_message=system_template,
    prompt=prompt,
    model_id=modell,
    image_path=image_list,
)

{
 "1":{"document name":"johndoc4.pdf0.jpg",
      "label":"Bank Statement"},
 "2":{"document name":"johndoc3.pdf_Page1",
      "label":"Bank Statement"},
 "3":{"document name":"janedoc1.png",
      "label":"PayStub"},
 "4":{"document name":"timothydoc1.PNG",
      "label":"drivers license"},
 "5":{"document name":"timothydoc5.PNG",
      "label":"drivers license"},
 "6":{"document name":"timothydoc3.PNG",
      "label":"drivers license"},
 "7":{"document name":"janedoc3.jpg",
      "label":"W2"},
 "8":{"document name":"john-doc1.PNG",
      "label":"drivers license"},
 "9":{"document name":"johndoc2.pdf0.jpg",
      "label":"Bank Statement"},
 "10":{"document name":"johndoc4.pdf_Page1",
       "label":"Bank Statement"},
 "11":{"document name":"janedoc2.jpg",
       "label":"W2"},
 "12":{"document name":"sarahdoc1.jpg",
       "label":"W2"},
 "13":{"document name":"5937faca67a8e.jpg",
       "label":"PayStub"},
 "14":{"document name":"jandedoc4.jpg",
       "label":"W2"},
 "15":{"docum

In [126]:
# Parse the reply text stored in `response` as JSON; raises if the text is not valid JSON.
json.loads(response)

{'1': {'document name': 'johndoc4.pdf0.jpg', 'label': 'Bank Statement'},
 '2': {'document name': 'johndoc3.pdf', 'label': 'Bank Statement'},
 '3': {'document name': 'janedoc1.png', 'label': 'PayStub'},
 '4': {'document name': 'timothydoc1.PNG', 'label': 'drivers license'},
 '5': {'document name': 'timothydoc5.PNG', 'label': 'drivers license'},
 '6': {'document name': 'timothydoc3.PNG', 'label': 'drivers license'},
 '7': {'document name': 'janedoc3.jpg', 'label': 'W2'},
 '8': {'document name': 'john-doc1.PNG', 'label': 'drivers license'},
 '9': {'document name': 'johndoc2.pdf0.jpg', 'label': 'Bank Statement'},
 '10': {'document name': 'johndoc4.pdf', 'label': 'Bank Statement'},
 '11': {'document name': 'janedoc2.jpg', 'label': 'W2'},
 '12': {'document name': 'sarahdoc1.jpg', 'label': 'W2'},
 '13': {'document name': '5937faca67a8e.jpg', 'label': 'PayStub'},
 '14': {'document name': 'jandedoc4.jpg', 'label': 'W2'},
 '15': {'document name': 'johndoc3.pdf0.jpg', 'label': 'Bank Statement'},
