This notebook was tested on an `ml.t3.medium` instance using the SageMaker `Data Science 3` image in a Studio notebook.

<img src="chatbot.png" width="800"/>

This sample notebook implements a general chatbot.
Key functionalities include:
1. Saving of Conversation History in DynamoDB
2. Handling document upload for the supported document formats (PDF, JPG, CSV, EXCEL, PNG, TXT, JSON) by passing the document's local or S3 path.
3. Implementing various prompt templates stored locally (they can also be stored in S3)

Install required packages

In [None]:
!pip install boto3 -U langchain -U
!pip install anthropic
!pip install s3fs -U
!pip install pandas -U
!pip install --force-reinstall amazon-textract-textractor==1.7.1

In [None]:
import boto3
from anthropic import Anthropic
from botocore.config import Config
import shutil
import os
import pandas as pd
import json
import io
import re
import numpy as np
import openpyxl
from openpyxl.cell import Cell
from openpyxl.worksheet.cell_range import CellRange
import uuid
from textractor import Textractor
from textractor.visualizers.entitylist import EntityList
from textractor.data.constants import TextractFeatures
from langchain.llms.bedrock import Bedrock
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from textractor.data.text_linearization_config import TextLinearizationConfig

#### Initialize Bedrock Runtime

In [4]:
# Bedrock runtime client used to invoke the LLM.
import boto3

# Botocore client settings: 600 s read timeout and retries capped via max_attempts=5.
config = Config(read_timeout=600, retries={"max_attempts": 5})
bedrock_runtime = boto3.client(
    service_name="bedrock-runtime",
    region_name="us-east-1",
    config=config,
)

#### Create DynamoDB Table
A DynamoDB Table is created with a user ID as partition Key and Session ID as sort key. 
This enables saving multiple chat session history under the same user id.\
Provide a bucket name that would be used to cache Amazon Textract results for document OCR.

In [21]:
# DynamoDB table name and the partition-key value (user id) used by this notebook.
DYNAMODB_TABLE = "SessionChatHistory"
DYNAMODB_USER = "test-user"
# A fresh random session id (sort-key value) is generated each time this cell runs.
SESSIONID = str(uuid.uuid4())

# AWS handles: DynamoDB resource, low-level DynamoDB client, and S3 client.
DYNAMODB = boto3.resource("dynamodb")
dynamo = boto3.client("dynamodb")
S3 = boto3.client("s3")

# Placeholder: replace with the bucket used to cache Amazon Textract results.
BUCKET = "ENTER S3 BUCKET NAME"

# In-memory chat history list.
chat_hist = []

In [None]:
import boto3

# Create the chat-history table keyed by (UserId, SessionId). If the table
# already exists, DynamoDB raises ResourceInUseException, which is reported
# and otherwise ignored so the cell can be re-run safely.
try:
    table = DYNAMODB.create_table(
        TableName=DYNAMODB_TABLE,
        KeySchema=[
            {
                'AttributeName': 'UserId',  # Partition key
                'KeyType': 'HASH'
            },
            {
                'AttributeName': 'SessionId',   # Sort key
                'KeyType': 'RANGE'
            }
        ],
        AttributeDefinitions=[
            {
                'AttributeName': 'UserId',
                'AttributeType': 'S'   # String data type
            },
            {
                'AttributeName': 'SessionId',
                'AttributeType': 'S'
            },
        ],
        BillingMode='PAY_PER_REQUEST'  # On-demand billing
    )

    print("Table status:", table.table_status)

    # Wait until the table exists. Use the same constant as create_table so
    # that renaming DYNAMODB_TABLE does not leave the waiter polling another table.
    table.meta.client.get_waiter("table_exists").wait(TableName=DYNAMODB_TABLE)
    print(table.item_count)
except dynamo.exceptions.ResourceInUseException as e:
    print(e.response['Error']['Message'])

#### Utility Functions

In [23]:
def strip_newline(cell):
    """
    Convert a cell value to text and trim surrounding whitespace.

    Leading and trailing whitespace (including newlines) is removed;
    whitespace inside the text is left untouched.
    Parameters:
    cell: The cell value (any type; it is passed through ``str``).
    Returns:
    str: The stringified cell value without leading/trailing whitespace.
    """
    as_text = str(cell)
    trimmed = as_text.strip()
    return trimmed

def table_parser_utills(file):
    """
    Convert the active worksheet of an Excel workbook to a "|"-delimited CSV string.

    Merged cell ranges are unmerged and the value of the range's start cell is
    copied into every cell of that range, so each row is fully populated.

    Args:
        file (str): Local path or ``s3://bucket/key`` URI of the workbook.
    Returns:
        str: The worksheet as CSV text using "|" as separator, without the
        DataFrame index. Every cell is stringified and stripped via
        ``strip_newline``.
    Raises:
        ValueError: If ``file`` contains "s3://" but is not of the form
        ``s3://bucket/key``.
    """
    if "s3://" in file:
        # Validate the URI before touching S3 so a malformed path fails clearly
        # instead of surfacing later as an unbound variable.
        match = re.match("s3://(.+?)/(.+)", file)
        if not match:
            raise ValueError(f"Malformed S3 URI (expected s3://bucket/key): {file!r}")
        bucket_name = match.group(1)
        key = match.group(2)
        s3 = boto3.client('s3')
        obj = s3.get_object(Bucket=bucket_name, Key=key)
        # Read the Excel file from S3 into an in-memory buffer
        workbook_source = io.BytesIO(obj['Body'].read())
    else:
        workbook_source = file

    # Load workbook, get active worksheet
    wb = openpyxl.load_workbook(workbook_source)
    worksheet = wb.active

    # Unmerge cells, duplicate merged values to individual cells.
    # Snapshot the ranges first because unmerging mutates the collection.
    all_merged_cell_ranges: list[CellRange] = list(
            worksheet.merged_cells.ranges
        )
    for merged_cell_range in all_merged_cell_ranges:
        merged_cell: Cell = merged_cell_range.start_cell
        worksheet.unmerge_cells(range_string=merged_cell_range.coord)
        for row_index, col_index in merged_cell_range.cells:
            cell: Cell = worksheet.cell(row=row_index, column=col_index)
            cell.value = merged_cell.value

    # Build a frame from the raw cell values (no header row is inferred) and
    # normalise every cell to stripped text before serialising.
    df = pd.DataFrame(worksheet.values)
    df = df.map(strip_newline)
    return df.to_csv(sep="|", index=False)

In [24]:
def get_s3_keys(prefix):
    """
    List object names under ``prefix`` in the ``BUCKET`` bucket.

    Parameters:
        prefix (str): Key prefix to list.

    Returns:
        list[str]: Object keys with the leading ``prefix`` removed. An empty
        list is returned when nothing matches.
    """
    s3 = boto3.client('s3')
    # Paginate so that prefixes holding more keys than a single
    # list_objects_v2 response returns are listed completely.
    paginator = s3.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=BUCKET, Prefix=prefix):
        # Pages without matches carry no "Contents" entry.
        for obj in page.get('Contents', []):
            keys.append(obj['Key'][len(prefix):])
    return keys

In [25]:
def exract_pdf_text_aws(file):
    """
    Extract text from PDF/image files using Amazon Textract service.
    Supports PDFs/Images stored locally or in S3.

    Extracted text is cached in ``BUCKET`` under
    ``extracted_output/<file base name>.txt``; when that object already exists
    it is returned instead of calling Textract again.

    Parameters:
        file (str): Path or S3 URI of the PDF/image file

    Returns:
        text (str): Extracted text of the document

    """
    file_base_name = os.path.basename(file)
    cache_name = f"{file_base_name}.txt"
    cache_key = f"extracted_output/{cache_name}"

    # Look for the exact cache object: a substring match could hit a different
    # document's cache entry and then fail when fetching this document's key.
    if cache_name in get_s3_keys("extracted_output/"):
        response = S3.get_object(Bucket=BUCKET, Key=cache_key)
        # Decode so a cache hit returns str, same as a fresh extraction.
        return response['Body'].read().decode("utf-8")

    dir_name, ext = os.path.splitext(file)
    extractor = Textractor(region_name="us-east-1")
    # Asynchronous call, you will experience some wait time. Try caching results for better experience
    if "s3://" in file and "pdf" in ext:
        print("Asynchronous call, you may experience some wait time.")
        document = extractor.start_document_analysis(
            file_source=file,
            features=[TextractFeatures.LAYOUT, TextractFeatures.TABLES],
            save_image=False,
            s3_output_path=f"s3://{BUCKET}/textract_output/"
        )
    # Local PDF: additionally passes an S3 upload path to the asynchronous call
    elif "s3://" not in file and "pdf" in ext:
        print("Asynchronous call, you may experience some wait time.")
        document = extractor.start_document_analysis(
            file_source=file,
            features=[TextractFeatures.LAYOUT, TextractFeatures.TABLES],
            save_image=False,
            s3_upload_path=f"s3://{BUCKET}",
            s3_output_path=f"s3://{BUCKET}/textract_output/"
        )
    # Synchronous call
    else:
        document = extractor.analyze_document(
            file_source=file,
            features=[TextractFeatures.LAYOUT, TextractFeatures.TABLES],
            save_image=False,
        )

    config = TextLinearizationConfig(
        hide_figure_layout=True,
        hide_header_layout=False,
        table_prefix="<table>",
        table_suffix="</table>",
    )
    # Linearize once and reuse the result for both the cache upload and the return value
    text = document.get_text(config=config)
    # Upload extracted content to s3
    S3.put_object(Body=text.encode("utf-8"), Bucket=BUCKET, Key=cache_key)
    return text
    

In [26]:
def _read_s3_object(uri):
    """Return the raw bytes of the object at an ``s3://bucket/key`` URI."""
    match = re.match("s3://(.+?)/(.+)", uri)
    if not match:
        raise ValueError(f"Malformed S3 URI (expected s3://bucket/key): {uri!r}")
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=match.group(1), Key=match.group(2))
    return obj['Body'].read()


def handle_doc_upload_or_s3(file):
    """
    Handle parsing of documents from local file system or S3.

    Supports PDF, PNG, JPG/JPEG, CSV, XLSX, JSON and Text files. The file
    extension is matched case-insensitively.

    Parameters:
        file (str): Path or S3 URI of document to parse

    Returns:
        content: Parsed contents of the file in appropriate format:
            - PDF/PNG/JPG/JPEG: result of ``exract_pdf_text_aws``
            - CSV: pandas DataFrame from ``pd.read_csv``
            - XLSX: result of ``table_parser_utills``
            - JSON: the decoded JSON value
            - TXT: the file's text as ``str``

    Raises:
        ValueError: If the extension is not supported, or an S3 JSON/TXT path
        is not of the form ``s3://bucket/key``.
    """
    _, ext = os.path.splitext(file)
    ext = ext.lower()  # treat ".PDF" like ".pdf"
    is_s3 = "s3://" in file

    if ext in (".pdf", ".png", ".jpg", ".jpeg"):
        content = exract_pdf_text_aws(file)
    elif "csv" in ext:
        content = pd.read_csv(file)
    # NOTE(review): ".xlx" is kept from the original list; it may have been
    # intended as ".xls" — confirm.
    elif ext in (".xlsx", ".xlx"):
        content = table_parser_utills(file)
    elif "json" in ext:
        if is_s3:
            content = json.loads(_read_s3_object(file))
        else:
            with open(file) as json_file:
                content = json.load(json_file)
    elif "txt" in ext:
        if is_s3:
            # Decode so S3 and local text files both yield str
            content = _read_s3_object(file).decode("utf-8")
        else:
            with open(file, "r") as txt_file:
                content = txt_file.read()
    else:
        # Fail with a clear message instead of an unbound-variable error
        raise ValueError(f"Unsupported file extension {ext!r} for file: {file!r}")
    return content

In [27]:
def put_db(messages):
    """
    Persist one chat turn in the DynamoDB chat-history table.

    The item for (DYNAMODB_USER, SESSIONID) is read; if it exists, ``messages``
    is appended to its stored "messages" list, otherwise a new single-element
    list is started. The full item is then written back with ``put_item``.

    Parameters:
        messages (dict): The chat turn to store.
    """
    session_key = {"UserId": DYNAMODB_USER, "SessionId": SESSIONID}
    lookup = DYNAMODB.Table(DYNAMODB_TABLE).get_item(Key=session_key)

    if "Item" in lookup:
        all_messages = lookup["Item"]["messages"] + [messages]
    else:
        all_messages = [messages]

    DYNAMODB.Table(DYNAMODB_TABLE).put_item(
        Item={
            "UserId": DYNAMODB_USER,    # user id
            "SessionId": SESSIONID,     # user session id
            "messages": all_messages,   # list of chat-turn dictionaries
        }
    )

#### Chat Function

This function calls the Anthropic Claude Bedrock api. 

In [34]:
def conversation_bedroc_chat_(question, upload_doc=None):
    """
    Answer a user question with Claude on Bedrock, using prior chat turns as context.

    The latest 10 stored turns are loaded from DynamoDB when ``DYNAMODB_TABLE``
    is set, otherwise from the module-level ``chat_hist`` list. They are
    inserted into a prompt template read from ``prompt/doc_chat.txt`` (when a
    document is supplied) or ``prompt/chat.txt``. The response is streamed to
    stdout, and the new turn is saved to the same history store.

    Parameters:
        question (str): The user query.
        upload_doc (str, optional): Local path or S3 URI of a document whose
            parsed content is added to the prompt.

    Returns:
        The model response returned by ``llm.invoke``.
    """
    # Load past turns. ``chat_hist`` is only read and appended to (never
    # rebound) so that the module-level list is the one being used.
    if DYNAMODB_TABLE:
        chat_histories = DYNAMODB.Table(DYNAMODB_TABLE).get_item(
            Key={"UserId": DYNAMODB_USER, "SessionId": SESSIONID}
        )
        past_turns = chat_histories["Item"]["messages"] if "Item" in chat_histories else []
    else:
        past_turns = chat_hist

    # Use only the latest 10 conversation turns. Fields are joined in an
    # explicit user -> assistant order rather than relying on dict key order.
    # NOTE: the "assiatant" key spelling is kept so that turns already stored
    # under that key remain readable.
    current_chat = "".join(
        turn.get("user", "") + turn.get("assiatant", "")
        for turn in past_turns[-10:]
    )

    values = {
        "prompt": question,
        "current_chat": current_chat,
    }
    if upload_doc:
        # Prompt template for when a user uploads a doc
        values["doc"] = handle_doc_upload_or_s3(upload_doc)
        template_path = "prompt/doc_chat.txt"
    else:
        # Chat template for open ended query
        template_path = "prompt/chat.txt"
    with open(template_path, "r") as f:
        chat_template = f.read()
    prompt = f"\n\nHuman: {chat_template.format(**values)}\n\nAssistant:"

    inference_modifier = {'max_tokens_to_sample': 1500,
                          "temperature": 0.5,
                          "stop_sequences": ["Human:"]
                         }
    llm = Bedrock(model_id='anthropic.claude-v2',  # Change to a different claude model id
                  client=bedrock_runtime, model_kwargs=inference_modifier,
                  streaming=True,  # Toggle this to turn streaming on or off
                  callbacks=[StreamingStdOutCallbackHandler()])

    response = llm.invoke(prompt)
    chat_history = {"user": f"{question}",
                    "assiatant": f"\n\nAssistant: {response}\n\nHuman: "}
    # Store conversation memory in the DynamoDB table
    if DYNAMODB_TABLE:
        put_db(chat_history)
    # Use local memory for storage
    else:
        chat_hist.append(chat_history)
    return response

#### Query the chatbot with your questions.
The function also accepts a document path, either local or in S3. Once a document path is passed, a different prompt template is used.
However, only the chat history (question and response) is stored in the DynamoDB table.


In [38]:
question="Hello"
res=conversation_bedroc_chat_(question)

 Hello there! I'm happy to converse with you and provide helpful information to the best of my abilities. As an AI assistant created by Anthropic to be helpful, harmless, and honest, I will do my best to give high quality responses in a friendly markdown format. Please feel free to ask me anything!