In [None]:
!pip install --force-reinstall amazon-textract-textractor==1.7.1

In [2]:
import base64
import json
import os
import random
import re
import time

import boto3
import pandas as pd
from botocore.config import Config
from botocore.exceptions import ClientError
from PIL import Image
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.visualizers.entitylist import EntityList

# Create the bedrock runtime to invoke LLM
# AWS region used for the Bedrock runtime client.
region = "us-east-1"

# botocore settings handed to the Bedrock runtime client below.
config = Config(
    # Maximum time (secs) the client waits for data to be received from the server.
    read_timeout=600,
    # Maximum number of retry attempts that will be made on a single request.
    retries={"max_attempts": 5},
)

bedrock_runtime = boto3.client(
    service_name="bedrock-runtime",
    region_name=region,
    config=config,
)

In [3]:
def _invoke_bedrock_with_retries(system_message, prompt, model_id, image_path=None,
                                 max_retries=5, backoff_base=2, max_backoff=3):
    """Call ``bedrock_claude_`` and retry with capped exponential backoff when throttled.

    Args:
        system_message: System prompt forwarded to ``bedrock_claude_``.
        prompt: User prompt forwarded to ``bedrock_claude_``.
        model_id: Bedrock model identifier forwarded to ``bedrock_claude_``.
        image_path: Optional image path (or list of paths) forwarded to ``bedrock_claude_``.
        max_retries: Number of retries allowed after a throttled attempt.
        backoff_base: Base of the exponential delay (``backoff_base ** retries``).
        max_backoff: Upper bound, in seconds, on any single sleep.

    Returns:
        The ``(response, input_token, output_token)`` tuple produced by ``bedrock_claude_``.

    Raises:
        ClientError: Re-raised immediately for any error other than
            ``ThrottlingException``, or once ``max_retries`` retries have been used.
    """
    retries = 0
    while True:
        try:
            response, input_token, output_token = bedrock_claude_(system_message, prompt, model_id, image_path)
            return response, input_token, output_token
        except ClientError as e:
            # Use .get so an unexpected error payload re-raises the ClientError
            # instead of masking it with a KeyError.
            error_code = e.response.get('Error', {}).get('Code')
            if error_code != 'ThrottlingException' or retries >= max_retries:
                # Not a throttle, or retry budget exhausted: propagate unchanged.
                raise
            # Throttled: exponential backoff with jitter, capped at max_backoff seconds.
            sleep_time = min(max_backoff, backoff_base ** retries + random.uniform(0, 1))
            time.sleep(sleep_time)
            retries += 1

def bedrock_streemer(response):
    """Echo a streamed Bedrock response to stdout and collect its text and token counts.

    Args:
        response: Mapping whose ``'body'`` entry is an iterable of stream events.
            Each event may carry a ``'chunk'`` whose ``'bytes'`` decode to a JSON object.

    Returns:
        A tuple ``(answer, input_tokens, output_tokens)``. ``answer`` is the
        concatenation of every ``delta.text`` fragment seen. The token counts
        come from the ``amazon-bedrock-invocationMetrics`` chunk and are ``0``
        when the stream is missing/empty or never delivers that chunk.
    """
    text_parts = []
    # Initialise the counters up front: without this, a missing body or a
    # stream without a metrics chunk makes the return raise UnboundLocalError.
    input_tokens = 0
    output_tokens = 0
    stream = response.get('body')
    if stream:
        for event in stream:
            chunk = event.get('chunk')
            if not chunk:
                continue
            chunk_obj = json.loads(chunk.get('bytes').decode())
            if "delta" in chunk_obj:
                delta = chunk_obj['delta']
                if "text" in delta:
                    text = delta['text']
                    # Print fragments as they arrive so the notebook shows live output.
                    print(text, end="")
                    text_parts.append(str(text))
            if "amazon-bedrock-invocationMetrics" in chunk_obj:
                metrics = chunk_obj['amazon-bedrock-invocationMetrics']
                input_tokens = metrics['inputTokenCount']
                output_tokens = metrics['outputTokenCount']
                print(f"\nInput Tokens: {input_tokens}\nOutput Tokens: {output_tokens}")
    return "".join(text_parts), input_tokens, output_tokens

def bedrock_claude_(system_message, prompt, model_id, image_path=None, max_tokens=2500, temperature=0.5):
    """Send a prompt (and optional images) to a Claude model on Bedrock and stream the reply.

    Args:
        system_message: Text placed in the request's ``system`` field.
        prompt: User text appended as the last content item of the user message.
        model_id: Bedrock model identifier passed as ``modelId``.
        image_path: Optional path or list of paths. Paths matching
            ``s3://bucket/key`` are fetched from S3; any other path is opened
            as a local file. Each image is preceded by a text item holding its
            file name.
        max_tokens: Value of the request's ``max_tokens`` field.
        temperature: Value of the request's ``temperature`` field.

    Returns:
        The ``(answer, input_tokens, output_tokens)`` tuple from ``bedrock_streemer``.
    """
    content = []
    if image_path:
        if not isinstance(image_path, list):
            image_path = [image_path]
        # Created lazily, once, and only if an s3:// path is actually encountered.
        s3 = None
        for img in image_path:
            image_name = os.path.basename(img)
            # Lower-case before mapping so ".JPG" also becomes "jpeg";
            # otherwise the media type would be sent as "image/jpg".
            image_format = os.path.splitext(image_name)[1].lower().replace('.', '')
            if "jpg" in image_format:
                image_format = "jpeg"
            match = re.match("s3://(.+?)/(.+)", img)
            if match:
                if s3 is None:
                    s3 = boto3.client('s3')
                obj = s3.get_object(Bucket=match.group(1), Key=match.group(2))
                binary_data = obj['Body'].read()
            else:
                with open(img, "rb") as image_file:
                    binary_data = image_file.read()
            base64_string = base64.b64encode(binary_data).decode('utf-8')
            content.extend([{"type":"text","text":image_name},{
              "type": "image",
              "source": {
                "type": "base64",
                "media_type": f"image/{image_format}",
                "data": base64_string
              }
            }])

    content.append({
        "type": "text",
        "text": prompt
    })
    # Kept under its own name so `prompt` is not rebound to a dict and then a JSON string.
    request_body = json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "temperature": temperature,
        "system": system_message,
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ]
    })
    response = bedrock_runtime.invoke_model_with_response_stream(body=request_body, modelId=model_id, accept="application/json", contentType="application/json")
    answer, input_tokens, output_tokens = bedrock_streemer(response)
    return answer, input_tokens, output_tokens

## TEXTRACT TO EXTRACT PDF
**`(optional)`**
If you want to summarize or extract information from a PDF, you can use Amazon Textract to extract the PDF's text and pass it as context to the LLM call.

In [None]:
# Placeholders: set `file` to a local path or an s3:// URI, and `output_bucket` to a bucket name.
file = "LOCAL OR S# FILE PATH"
output_bucket = "BUCKET NAME"
extractor = Textractor(region_name="us-east-1")

# Arguments common to both the S3 and the local-file case.
analysis_kwargs = dict(
    file_source=file,
    features=[TextractFeatures.LAYOUT, TextractFeatures.FORMS, TextractFeatures.TABLES],
    save_image=False,
    s3_output_path=f"s3://{output_bucket}/textract-output/",
)
# Only a source that is not already an s3:// URI gets an upload location.
if not file.startswith("s3://"):
    analysis_kwargs["s3_upload_path"] = f"s3://{output_bucket}/uploaded_document/"

document = extractor.start_document_analysis(**analysis_kwargs)

In [109]:
from textractor.data.text_linearization_config import TextLinearizationConfig
# Linearization settings for turning the analyzed document into text.
# NOTE(review): presumably the hide_* flags drop those layout elements and the
# prefix/suffix wrap each table — confirm against the textractor documentation.
configs = TextLinearizationConfig(
    table_prefix="<table>",
    table_suffix="</table>",
    hide_header_layout=False,
    hide_figure_layout=True,
    hide_footer_layout=True,
    hide_page_num_layout=True,
)

extracted_content = document.get_text(config=configs)
print(extracted_content)

## GENERATE

Anthropic Claude can process images; you can pass a list of images as a parameter to the API call.

In [4]:
# Other model ids listed by the author:
# "anthropic.claude-v2","anthropic.claude-3-haiku-20240307-v1:0","anthropic.claude-3-sonnet-20240229-v1:0"
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
system_prompt = "You are a an AI assistant."
prompt = "What is the average time of sleep per day recommended?"

# Local or s3 path to images. Images must be in jpeg, png or webp format
# (maximum of 20 images per api call).
image_path = []

# Images are forwarded only when the model id contains "claude-3".
# `summary` receives the (response, input tokens, output tokens) tuple.
summary = _invoke_bedrock_with_retries(
    system_prompt,
    prompt,
    model_id,
    image_path if "claude-3" in model_id else None,
)

The recommended amount of sleep per day varies by age group, but for most healthy adults, the general recommendation is:

7-9 hours per night

More specifically, the sleep recommendations from the National Sleep Foundation are:

- Newborns (0-3 months): 14-17 hours
- Infants (4-11 months): 12-15 hours  
- Toddlers (1-2 years): 11-14 hours
- Preschoolers (3-5 years): 10-13 hours
- School-age children (6-13 years): 9-11 hours
- Teens (14-17 years): 8-10 hours
- Adults (18-64 years): 7-9 hours
- Older Adults (65+ years): 7-8 hours

Getting sufficient quality sleep on a regular basis is important for physical and mental health. Individual sleep needs can vary somewhat based on factors like activity levels and genetic differences. But 7-9 hours is the general target for most adults to be well-rested.
Input Tokens: 25
Output Tokens: 247
