In [None]:
!pip install --force-reinstall amazon-textract-textractor==1.7.1
!pip install inflect
!pip install langchain

Installing the required libraries

In [None]:
import os
from PIL import Image
import pandas as pd
import re
import json
from textractor import Textractor
from textractor.visualizers.entitylist import EntityList
from textractor.data.constants import TextractFeatures
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
import io
import inflect
import boto3
from botocore.config import Config
from botocore.config import Config
from langchain.llms.bedrock import Bedrock
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# botocore settings for the Bedrock runtime client: a long read timeout and up to
# 5 retry attempts.
# Deliberately NOT named `config`: that name is assigned a TextLinearizationConfig later
# in this notebook, so sharing it would hand the wrong object to boto3 whenever this
# cell is re-run after the linearization cell.
bedrock_client_config = Config(
    read_timeout=600,
    retries=dict(max_attempts=5),
)
bedrock_runtime = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-1',
    config=bedrock_client_config,
)

Utility functions for text generation and embedding generation using a few models from Amazon SageMaker JumpStart and Amazon Bedrock. These models were selected arbitrarily; you can substitute any model of your choice available on Bedrock, SageMaker JumpStart or HuggingFace.\
**Replace the placeholder SageMaker endpoint names for the various models below with your own.**

In [None]:
def _get_emb_(passage, model):
    if "titan" in model:
        response = bedrock_runtime.invoke_model(body=json.dumps({"inputText":passage}),
                                    modelId="amazon.titan-embed-text-v1", 
                                    accept="application/json", 
                                    contentType="application/json")

        response_body = json.loads(response.get('body').read())
        embedding=response_body['embedding']
    elif "all-mini-lm" in model:
        payload = {'text_inputs': [passage]}
        payload = json.dumps(payload).encode('utf-8')

        response = SAGEMAKER.invoke_endpoint(EndpointName="ALL MINI LM v6 SAGEMAKER ENDPOINT", 
                                                    ContentType='application/json',  
                                                    Body=payload)

        model_predictions = json.loads(response['Body'].read())
        embedding = model_predictions['embedding'][0]
    return embedding


from langchain.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint
# Default generation parameters passed to the SageMaker text-generation endpoint.
parameters = dict(
    max_new_tokens=100,
    return_full_text=False,
    temperature=0.1,
)

class ContentHandler(LLMContentHandler):
    """Converts prompts to JSON request bodies and JSON responses back to text."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
        """Encode the prompt and its generation parameters as a UTF-8 JSON body."""
        request = {"inputs": prompt, "parameters": model_kwargs}
        return json.dumps(request).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Return the "generated_text" of the first result in the JSON response.

        `output` is consumed through `.read()`, i.e. it is treated as a readable stream.
        """
        raw_body = output.read()
        results = json.loads(raw_body.decode("utf-8"))
        return results[0]["generated_text"]


# LangChain LLM backed by a SageMaker endpoint, using the JSON content handler and the
# default generation parameters. Replace the endpoint_name placeholder with your own.
content_handler = ContentHandler()
mixtral_instruct = SagemakerEndpoint(
    content_handler=content_handler,
    model_kwargs=parameters,
    region_name="us-east-1",
    endpoint_name="MIXTRAL SAGEMAKER ENDPOINT",
)

## Document Extraction
We use Amazon's 10-K report as a sample document. Upload the file to S3, replacing the **BUCKET NAME** placeholders with the name of your S3 bucket.\
We then call the Textract `start document analysis` function to process the document. 

In [None]:
# Upload the sample PDF to S3; replace BUCKET-NAME with the name of your own bucket.
!aws s3 cp amzn-20221231.pdf s3://BUCKET-NAME

In [None]:
# Textractor client for the us-east-1 region; used below to start the document analysis.
extractor = Textractor(region_name="us-east-1")

In [None]:
# S3 location of the uploaded PDF; replace BUCKET with the name of your bucket.
file = "s3://BUCKET/amzn-20221231.pdf"
file_name = os.path.basename(file)
doc_id = file_name  # the bare file name is also stored as `doc_id` when indexing

# Start the Textract analysis with layout and table extraction enabled. The request
# token is the file name up to its first dot.
analysis_features = [TextractFeatures.LAYOUT, TextractFeatures.TABLES]
request_token = doc_id.split('.')[0]
document = extractor.start_document_analysis(
    file_source=file,
    features=analysis_features,
    client_request_token=request_token,
    save_image=False,
)

Using the Textractor linearization function, we can enrich the extracted content with tags and hide page sections such as headers, footers and images that may not be useful for data indexing.\
We tag tables, lists and title sections to help identify and chunk these sections effectively.

In [None]:
from textractor.data.text_linearization_config import TextLinearizationConfig

# Layout elements left out of the linearized text.
hidden_layouts = dict(
    hide_figure_layout=True,
    hide_header_layout=True,
    hide_footer_layout=True,
    hide_page_num_layout=True,
)
# Tags wrapped around titles, tables and lists so later cells can locate them.
section_tags = dict(
    title_prefix="<title> ",
    title_suffix="</title> ",
    table_prefix="<tables><table>",
    table_suffix="</table>",
    list_layout_prefix="<list>",
    list_layout_suffix="</list>",
)
config = TextLinearizationConfig(**hidden_layouts, **section_tags)

# Preview the linearized text of one page (index 59).
print(document.pages[59].get_text(config=config))

The next two cells iterate through the document pages and replace the tables (in plain text) with their CSV representations obtained from the Textract Tables feature.\
This makes the tables easier to wrangle when chunking. We use pipe ("|") delimited CSV as opposed to comma (',') delimited CSV to avoid having to handle commas within cell contents.

In [None]:
def split_list_items_(items):
    """Split text into lines while keeping each <list>...</list> block as one item.

    Text outside list tags is split on newlines. Text between a "<list>" and its
    "</list>" is stripped piece by piece and returned as a single string that still
    carries both tags.
    """
    pieces = []
    pending = ""       # list block being assembled
    in_list = False
    # The capture group keeps the tags themselves in the split result.
    for token in re.split("(<list>|</list>)", items):
        if token == "<list>":
            in_list = True
            pending = token
            continue
        if token == "</list>":
            in_list = False
            pieces.append(pending + token)
            pending = ""
            continue
        if in_list:
            pending += token.strip()
        else:
            pieces += token.split('\n')
    return pieces

In [None]:
# Go through each page, replace every free-text table with its pipe-delimited CSV
# rendering (demarcated by <table> xml tags), and split the remaining page text into
# lines and whole <list>...</list> items.
csv_seperator="|"
document_holder={}   # page index -> ordered lines / list items / <table> blocks
table_page={}        # page index -> CSV strings of the tables found on that page
count=0              # running index into document.tables across all pages
# Compiled once instead of once per table; matches the linearized body of one table.
pattern = re.compile(r'<table>(.*?)(</table>)', re.DOTALL)
for ids,page in enumerate(document.pages):
    # Linearize the page once and reuse the text for both the check and the split.
    page_text=page.get_text(config=config)
    table_count=len([word for word in page_text.split() if "<tables><table>" in word])
    # Tables are paired with document.tables purely by position (via `count`), so
    # every detected table has to appear in the linearized text.
    assert table_count==len(page.tables), (
        f"page {ids}: found {table_count} table tags in the text "
        f"but {len(page.tables)} detected tables"
    )
    content=page_text.split("<tables>")
    document_holder[ids]=[]
    for idx,item in enumerate(content):
        if "<table>" in item:
            df0=document.tables[count].to_pandas(use_columns=False).to_csv(header=False, index=None,sep=csv_seperator)
            row_count=len([x for x in df0.split("\n") if x])
            if row_count>1:
                # A first row with any non-blank cell is used as the header row
                # (header=0 below); an all-blank first row makes the second row the header.
                if not all(value.strip() == '' for value in df0.split("\n")[0].split(csv_seperator)):
                    row_count=1
            df=pd.read_csv(io.StringIO(df0), sep=csv_seperator,
                           header=0 if row_count==1 else 1, keep_default_na=False)
            # Blank out the placeholder names pandas gives to unnamed columns.
            df.rename(columns=lambda x: '' if x.startswith('Unnamed:') else x, inplace=True)
            table=df.to_csv(index=None, sep=csv_seperator)
            table_page.setdefault(ids, []).append(table)
            # Everything after the closing </table> is ordinary page text.
            table_match = re.search(pattern, item)
            remaining_content = item[table_match.end():] if table_match else item
            content[idx]=f"<table>{table}</table>"
            count+=1
            if "<list>" in remaining_content:
                output=split_list_items_(remaining_content)
            else:
                output=remaining_content.split('\n')
            document_holder[ids].extend([content[idx]]+[x.strip() for x in output if x.strip()])
        else:
            if "<list>" in item:
                output=split_list_items_(item)
            else:
                output=item.split("\n")
            document_holder[ids].extend([x.strip() for x in output if x.strip()])



In [None]:
# Show the processed lines of one page (index 59) to inspect the table/list handling.
print("\n".join(document_holder[59]))

## Document Chunking
This cell iterates through the document per page and chunks the document in the following manner:
- It looks for the different xml tags to identify the different page sections. 
    - If a table xml tag is found, it checks whether there is a sentence before that table (the heuristic employed here is that the sentence before a table is usually the table header) and uses it as the table header. The table is split by rows until the desired chunk size is reached, and the
    table column names and table header are appended to each chunk.
    - If a list is found, the list is split by items until the desired chunk size is reached. The same heuristic as above is employed, and the list header is appended to every list chunk.
    - If a title tag is found, the title is appended to all chunks in that page.
- Adds an overlap from previous page to keep the connection of information.
- we chunk by number of words

In [None]:
import re
import pandas as pd
from io import StringIO

def _render_table_rows(rows):
    """Join the cells of each row with '|' and the rows with newlines."""
    return "\n".join("|".join(str(x) for x in r) for r in rows)


def _as_header(header):
    """Return the stripped header text, ending with exactly one trailing ':' added if missing."""
    header = header.strip()
    return header if header.endswith(':') else header + ':'


max_words = 200         # word budget per chunk
chunks = {}             # page index -> list of chunks (each chunk is a list of strings)
entire_list_chunk=[]    # every chunk emitted so far, in order; source of the page overlap
table_header_dict={}    # page index -> header found before each table ("" when none)
list_header_dict={}
overlap=50              # max words carried over from the previous chunk at a page start
list_pattern = re.compile(r'<list>(.*?)</list>', re.DOTALL)

for page, lines in document_holder.items():
    page_chunks = []
    current_chunk = []
    num_words = 0
    start_page=0
    TITLE=None
    last_line_index = len(lines)-1
    # Enumerate so positional checks use the real position of the line being processed.
    # Looking the line up again with lines.index(line) returns the FIRST equal line,
    # which selects the wrong header and the wrong "last line on the page" decision
    # whenever a page contains duplicate lines.
    for line_index, line in enumerate(lines):
        if line.strip()!= '':
            if "<title>" in line:
                TITLE=line.split("<title>")[-1].split("</title>")[0]
                line=TITLE
                TITLE=TITLE.upper().strip()
                first_page_portion=True
            # At the start of a page, seed the chunk with whole items taken from the end
            # of the previously emitted chunk, up to `overlap` words.
            if len(current_chunk)<2 and entire_list_chunk and start_page==0:
                cumulative_word_count = 0
                for item in reversed(entire_list_chunk[-1]):
                    words = item.split()
                    if cumulative_word_count + len(words) <= overlap:
                        cumulative_word_count += len(words)
                        current_chunk.insert(0, item)
                    else:
                        break

                num_words=cumulative_word_count
                start_page=1

            next_num_words = num_words + len(re.findall(r'\w+', line))

            # ---- plain text line ----
            if  "<table>" not in line and "<list>" not in line:

                if next_num_words > max_words:
                    # Close the running chunk. The title is prepended to every chunk
                    # except the first one closed after the title was seen.
                    if TITLE :
                        if first_page_portion:
                            first_page_portion=False
                        else:
                            current_chunk.insert(0, TITLE.strip())

                    page_chunks.append(current_chunk)
                    entire_list_chunk.append(current_chunk)
                    current_chunk = []
                    num_words = 0

                current_chunk.append(line)
                num_words += len(re.findall(r'\w+', line))

            # ---- table ----
            # Goal is to segment out table items and chunk intelligently.
            # We chunk the table by rows and for each chunk of the table we append the
            # table column headers and the table header, if any. This way we preserve the
            # table information across chunks, which helps semantic search return all the
            # chunks relating to a table in the top-k results.
            if "<table>" in line:
                # The line right before the table is used as its header unless that line
                # is itself a table or a list.
                if line_index!=0 and "<table>" not in lines[line_index-1] and "<list>" not in lines[line_index-1]:
                    header=lines[line_index-1].replace("<title>","").replace("</title>","")
                else:
                    header=""
                table_header_dict.setdefault(page, []).append(header)
                table = line.split("<table>")[-1].split("</table>")[0] # get table from demarcators
                df=pd.read_csv(io.StringIO(table), sep=csv_seperator, keep_default_na=False)
                df.rename(columns=lambda x: '' if x.startswith('Unnamed:') else x, inplace=True)

                table_chunks = []
                curr_chunk = [df.columns.to_list()]   # column names lead every table chunk
                words=len(re.findall(r'\w+', str(current_chunk)+" "+str(curr_chunk)))
                # NOTE(review): itertuples() rows carry the DataFrame index as their first
                # field, so it is included in both the word count and the rendered rows;
                # confirm this is intended.
                for row in df.itertuples():
                    curr_chunk.append(row)
                    words+=len(re.findall(r'\w+', str(row)))
                    if words > max_words:
                        tab_chunk=_render_table_rows(curr_chunk)
                        table_chunks.append(tab_chunk)
                        words = len(re.findall(r'\w+', str(curr_chunk[0])))

                        if header:
                            # Drop the header from the running text so it is not repeated.
                            if current_chunk and current_chunk[-1].strip().lower()==header.strip().lower():
                                current_chunk.pop(-1)

                            if TITLE and TITLE.lower().strip() != header.lower().strip():
                                if first_page_portion:
                                    first_page_portion=False
                                else:
                                    current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([_as_header(header), tab_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append([header]+table_chunks)
                        else:
                            if TITLE:
                                if first_page_portion:
                                    first_page_portion=False
                                else:
                                    current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([tab_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append(table_chunks)

                        num_words=0
                        current_chunk=[]
                        curr_chunk = [curr_chunk[0]]   # restart with only the column names

                # Rows left over after the loop (more than just the column-name row).
                if len(curr_chunk) > 1:
                    tab_chunk=_render_table_rows(curr_chunk)
                    table_chunks.append(tab_chunk)
                    if line_index == last_line_index:
                        # Table ends the page: close the chunk now.
                        if header:
                            if current_chunk and current_chunk[-1].strip().lower()==header.strip().lower():
                                current_chunk.pop(-1)
                            if TITLE and TITLE.lower().strip() != header.lower().strip():
                                if first_page_portion:
                                    first_page_portion=False
                                else:
                                    current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([_as_header(header), tab_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append([header]+table_chunks)
                        else:
                            if TITLE:
                                if first_page_portion:
                                    first_page_portion=False
                                else:
                                    current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([tab_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append(table_chunks)
                        num_words=0
                        current_chunk=[]
                    else:
                        # More lines follow: keep the remainder in the running chunk.
                        if header:
                            if current_chunk and current_chunk[-1].strip().lower()==header.strip().lower():
                                current_chunk.pop(-1)
                            current_chunk.extend([_as_header(header), tab_chunk])
                        else:
                            current_chunk.extend([tab_chunk])
                        num_words=words

            # ---- list ----
            # Goal is to segment out list items and chunk intelligently.
            # We chunk each list by its items and append the list header to every list
            # chunk to preserve the information of the list across chunks, so a question
            # about a list can retrieve all of its chunks in the top-k results.
            if "<list>" in line:
                # Get list header which is usually line before list in document
                if line_index!=0 and "<table>" not in lines[line_index-1] and "<list>" not in lines[line_index-1]:
                    header=lines[line_index-1].replace("<title>","").replace("</title>","")
                else:
                    header=""
                list_match = re.search(list_pattern, line)
                list_ = list_match.group(1)
                list_lines=list_.split("\n")

                curr_chunk = []
                words=len(re.findall(r'\w+', str(current_chunk)))
                for item in list_lines:
                    curr_chunk.append(item)
                    words+=len(re.findall(r'\w+', item))

                    if words >= max_words:
                        words = 0

                        list_chunk="\n".join(curr_chunk)
                        if header:
                            if current_chunk and current_chunk[-1].strip().lower()==header.strip().lower():
                                current_chunk.pop(-1)
                            if TITLE and TITLE.lower().strip() != header.lower().strip():
                                if first_page_portion:
                                    first_page_portion=False
                                else:
                                    current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([_as_header(header), list_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append([header]+[list_chunk])
                        else:
                            if TITLE:
                                if first_page_portion:
                                    first_page_portion=False
                                else:
                                    current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([list_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append([list_chunk])

                        num_words=0
                        current_chunk=[]
                        curr_chunk = []

                # List items left over after the loop.
                if curr_chunk:
                    list_chunk="\n".join(curr_chunk)
                    if line_index == last_line_index:
                        # List ends the page: close the chunk now.
                        if header:
                            if current_chunk and current_chunk[-1].strip().lower()==header.strip().lower():
                                current_chunk.pop(-1)
                            if TITLE and TITLE.lower().strip() != header.lower().strip():
                                if first_page_portion:
                                    first_page_portion=False
                                else:
                                    current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([_as_header(header), list_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append([header]+[list_chunk])
                        else:
                            if TITLE:
                                if first_page_portion:
                                    first_page_portion=False
                                else:
                                    current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([list_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append([list_chunk])
                        num_words=0
                        current_chunk=[]
                    else:
                        # More lines follow: keep the remainder in the running chunk.
                        if header:
                            if current_chunk and current_chunk[-1].strip().lower()==header.strip().lower():
                                current_chunk.pop(-1)
                            current_chunk.extend([_as_header(header), list_chunk])
                        else:
                            current_chunk.extend([list_chunk])
                        num_words=words

    # Whatever is still open when the page ends becomes the page's final chunk.
    if current_chunk:
        if TITLE:
            if first_page_portion:
                first_page_portion=False
            else:
                current_chunk.insert(0, TITLE.strip())
        page_chunks.append(current_chunk)
        entire_list_chunk.append(current_chunk)
        current_chunk=[]
    chunks[page] = page_chunks

In [None]:
# Print the chunks produced for one page (index 59), numbered from 1.
page_to_show = 59
for number, page_chunk in enumerate(chunks[page_to_show], start=1):
    print(f'Chunk {number}:')
    for entry in page_chunk:
        print(entry)
    print('\n')

This cell creates a domain in Amazon OpenSearch Service (Provisioned Capacity) and indexes the document chunks. The index mapping includes metadata fields for document names, page, and tables.\
The table field will be populated with the full tables in each page for each chunk in that page. This will serve as additional information to augment each page chunk when retrieved.\
We use an embedding model to create the embeddings and index them. The two options provided by this implementation are `all-mini-lmv6` and `titan`; if using the former, deploy the SageMaker endpoint and provide its name in the utility function in cell 2 above.\
**Note** Some chunks may exceed the chunking threshold set in the previous cells, because tables are chunked by row and a row may push a chunk over the threshold. If you run into a `token limit exceed` error while creating embeddings (this occurred with the Cohere embedding model on Bedrock), use an embedding model with a larger sequence length. \
Replace the **domain_endpoint** variable with the one you created in your account.

In [None]:
# pip install requests-aws4auth
# pip install opensearch-py
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

# This is an Amazon OpenSearch domain that uses IAM for authentication.

# Embedding model: "all-mini-lm" or "titan". You can also use any other model in
# Bedrock, SageMaker JumpStart or HuggingFace.
model="all-mini-lm"

# Replace with the endpoint of your own OpenSearch Service domain.
domain_endpoint = "OPENSEARCH SEVICE DOMAIN NAME"
service = 'es'

# Sign requests with the credentials of the current boto3 session.
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    "us-east-1",
    service,
    session_token=credentials.token,
)
os_ = OpenSearch(
    hosts=[dict(host=domain_endpoint, port=443)],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    timeout=120,
    connection_class=RequestsHttpConnection,
)

# Sample Opensearch domain index mapping
# The vector dimension depends on the embedding model selected above; change it to
# match the output size of any other embedding model you use.
if "all-mini-lm" in model:
    embedding_dimension = 384
elif "titan" in model:
    embedding_dimension = 1536
else:
    embedding_dimension = None

# Sample OpenSearch index mapping: a k-NN vector field for the embedding plus the
# passage text and its doc / page / table metadata.
knn_method = {
    "name": "hnsw",
    "space_type": "innerproduct",
    "engine": "nmslib",
    "parameters": {"ef_construction": 256, "m": 48},
}
mapping = {
    'settings': {
        'index': {'knn': True, "knn.algo_param.ef_search": 100},
    },
    'mappings': {
        'properties': {
            'embedding': {
                'type': 'knn_vector',
                'dimension': embedding_dimension,
                "method": knn_method,
            },
            'passage_id': {'type': 'keyword'},
            'passage': {'type': 'text'},
            'doc_id': {'type': 'keyword'},
            'table': {'type': 'text'},
        },
    },
}

domain_index = "financials"

# Create the index only when it is missing, then confirm that it now exists.
if os_.indices.exists(index=domain_index):
    print(f'{domain_index} Index already exists!')
else:
    os_.indices.create(index=domain_index, body=mapping)
    if os_.indices.exists(index=domain_index):
        print(f"Index {domain_index} created successfully.")
    else:
        print(f"Failed to create index '{domain_index}'.")

# `logging` and `RequestError` are used by the error handler below but were never
# imported, so a failed request surfaced as a NameError instead of being logged.
import logging

import boto3
from opensearchpy.exceptions import RequestError

# Runtime client used by _get_emb_ for the "all-mini-lm" SageMaker endpoint.
SAGEMAKER=boto3.client('sagemaker-runtime')

i = 1   # 1 + number of documents accepted by the index so far
for page, chunkks in chunks.items():
    chunk_id = page   # chunks are identified by the page they came from

    # Build the table context once per page (it is identical for every chunk of the
    # page): each table's header, when present, followed by the table itself.
    table=[]
    if page in table_page:
        for ids,item in enumerate(table_page[page]):
            header=table_header_dict[page][ids]
            if header.strip():
                header=f"<table_header>{header}</table_header>"
            tsv=f"<table>{item}</table>"
            table.append(header)
            table.append(tsv)
    # Always a string, "" for pages without tables, so the `table` field has one type.
    table="\n".join(table)

    for chunk in chunkks:
        passage_chunk="\n".join(chunk)
        embedding=_get_emb_(passage_chunk, model)
        documentt = {
            'doc_id':doc_id,
            'passage_id': chunk_id,
            'passage': passage_chunk,
            'embedding': embedding,
            'table':table
        }
        try:
            response = os_.index(index=domain_index, body=documentt)
            i += 1

            if response["result"] == "created":
                print(f"Document indexed successfully with ID: {response['_id']}")
            else:
                print("Failed to index document.")
        except RequestError as e:
            logging.error(f"Error indexing document to index '{domain_index}': {e}")
        

# RAG

## Bedrock Anthropic LLM Inference

The following cells perform RAG on the newly created index. We run a sample question to test the implementation above

In [None]:
# Embed the question with the same model used for indexing, then fetch the 3 nearest
# chunks with a k-NN query.
question="GHG reporting metrics and targets for 2021 compared to 2022?"
embedding=_get_emb_(question, model)
top_k = 3
knn_clause = {"embedding": {"vector": embedding, "k": top_k}}
query = {'size': top_k, 'query': {"knn": knn_clause}}
response = os_.search(index=domain_index, body=query)
response

In [None]:
# Collect the fields of each hit into parallel lists.
res=response['hits']['hits']
score, passage, page_no, doc_name, tables = [], [], [], [], []
for hit in res:
    source = hit['_source']
    score.append(str(hit['_score']))
    passage.append(source['passage'])
    page_no.append(source['passage_id'])
    doc_name.append(source['doc_id'])
    tables.append(source['table'])

p = inflect.engine()
## Concatenate passages and tables
# Each passage, and the tables of its page, is wrapped in a tag numbered with an
# ordinal produced by inflect.
passages=""
tab=""
for position, (text, table_text) in enumerate(zip(passage, tables), start=1):
    label = p.ordinal(position)
    passages+=f"<{label}_passage>\n{text}\n</{label}_passage>\n"
    tab+=f"<{label}_passage_table>\n{table_text}\n</{label}_passage_table>\n"

In [None]:
# RAG prompt: the retrieved passages go inside <document>, the tables of their pages
# inside <additional_information>, followed by the answering instructions and the question.
prompt_template=f"""You are a helpful, obedient and truthful financial assistance.

<document>
{passages}
</document>          

<additional_information>
{tab}
</additional_information>

<instructions>
When providing your response based on the document:
1. Understand the question to know what is being asked of you.
2. Review the entire document provided and check if it contains relevant information to answer the question. Only pay attention to passages with relevant information.
3. If the document is sufficient to answer the question, provide a comprehensive answer ENTIRELY based on the document provided. DO NOT make up answers not present in the document.
4. If the answer is not available in the document, say so.
</instructions>

Question: {question}"""
# Wrap the template in Human/Assistant turns and pre-fill the start of the answer.
prompt=f"\n\nHuman:{prompt_template}\n\nAssistant: Based on the document,"

In [None]:
# Claude v2 on Bedrock, answering with up to 300 tokens at low temperature.
txt_model='anthropic.claude-v2'
inference_modifier = dict(max_tokens_to_sample=300, temperature=0.1)
stdout_streamer = StreamingStdOutCallbackHandler()
llm = Bedrock(
    model_id=txt_model,
    client=bedrock_runtime,
    model_kwargs=inference_modifier,
    streaming=True,  # Toggle this to turn streaming on or off
    callbacks=[stdout_streamer],
)

response = llm(prompt)

## SageMaker JumpStart Mixtral 8x7b Inference

In [None]:
# RAG prompt for the Mixtral endpoint: the retrieved passages and the tables of their
# pages are embedded between #### markers, followed by the instructions and the question.
prompt=f"""<s><<SYS>>[INST]
You are a helpful, obedient and truthful assistant. You will only provide answers entirely based on the document provided below.

Here is a document:
####
{passages}
####

Here is additional information:
####
{tab}
####

When providing your response based on the provided document:
1. Understand the question to know what is being asked of you.
2. Review the entire document provided and check if it contains relevant information to answer the question. Only pay attention to sections with relevant information.
3. If the document is sufficient to answer the question, provide a comprehensive answer ENTIRELY based on the document provided. DO NOT make up answers not present in the document.
4. If the answer is not available in the document, say so.<</SYS>>

Question: {question}[/INST]
Answer: According to the document provided,"""
# Override the endpoint's generation parameters for this call (300 new tokens).
parameters = { "max_new_tokens": 300,       
        "return_full_text": False,
        "temperature": 0.1}
mixtral_instruct.model_kwargs = parameters
print(mixtral_instruct(prompt))

# SUMMARIZATION

## Bedrock Anthropic Claude

The following cell does a full document summarization taking advantage of the tagged section to further guide the LLM response. This does not use RAG

In [None]:
# Flatten the processed pages into a single text for summarization.
# Pages are joined with a newline: plain concatenation fused the last line of one page
# with the first line of the next. Pages without any content are skipped.
doc="\n".join("\n".join(v) for v in document_holder.values() if v)

In [None]:
# Summarization prompt: the whole flattened document is embedded inside <document>,
# followed by the list of topics the summary should cover.
prompt_template=f"""You are a financial analyst, great at writing summaries of company financial performance.

Here is a document:
<document>
{doc}
</document>

Before responding:
1. Read the document several times to ensure you understand everything.
2. Take notes to identify the relevant information.

Provide a comprehensive summary of the document.
Your summary should include the following if available in the document:
1. Revenue and Profitability
2. Cash Flow
3. Debt and Solvency
4. Market and Industry Trends
5. Future Outlook
6. Risks and Contingencies

Base your entire summary on the information provided in the document."""
# Wrap the template in Human/Assistant turns.
prompt=f"\n\nHuman:{prompt_template}\n\nAssistant:"

In [None]:
# Claude v2 on Bedrock, summarizing with up to 1000 tokens at low temperature.
txt_model='anthropic.claude-v2'
inference_modifier = dict(max_tokens_to_sample=1000, temperature=0.1)
stdout_streamer = StreamingStdOutCallbackHandler()
llm = Bedrock(
    model_id=txt_model,
    client=bedrock_runtime,
    model_kwargs=inference_modifier,
    streaming=True,  # Toggle this to turn streaming on or off
    callbacks=[stdout_streamer],
)

response = llm(prompt)

## SageMaker JumpStart Mixtral 8x7b 

In [None]:
# Map-step prompt: summarizes one piece of the document. `{text}` is left as a
# placeholder; it is declared as the input variable of the PromptTemplate built later.
map_custom_prompt="""<s><<SYS>>[INST]
You are a financial analyst, great at writing summaries of company financial performance.

Here is a document:
######
`{text}`
######

Before responding:
1. Read the document several times to ensure you understand everything.
2. Take notes to identify the relevant information.<</SYS>>

Provide a comprehensive summary of the document.
Your summary should include the following if available in the document:
1. Revenue and Profitability
2. Cash Flow
3. Debt and Solvency
4. Market and Industry Trends
5. Future Outlook
6. Risks and Contingencies
Base your entire summary on the information provided in the document.[/INST]"""

In [None]:
# Combine-step prompt: merges the partial summaries into one final summary. `{text}` is
# left as a placeholder; it is declared as the input variable of the PromptTemplate built later.
combine_custom_prompt="""<s><<SYS>>[INST]
You are a financial analyst, great at writing summaries of company financial performance.

Here are multiple summaries of various parts of a documnet:
######
`{text}`
######
<</SYS>>
Provide a coherent final summary that connects your individual summaries perfectly.
Your final summary should follow the same outline:
1. Revenue and Profitability
2. Cash Flow
3. Debt and Solvency
4. Market and Industry Trends
5. Future Outlook
6. Risks and Contingencies
[/INST]"""

In [None]:
# Split the full document text into pieces of at most 5000 characters (measured with
# len) that overlap by 100 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=100,
    length_function=len,
)

texts = text_splitter.create_documents([doc])
print(len(texts))

# Allow up to 500 new tokens per summary produced by the endpoint.
parameters = dict(max_new_tokens=500)
mixtral_instruct.model_kwargs = parameters

# Both prompts take a single input variable, `text`.
map_prompt_template = PromptTemplate(
    template=map_custom_prompt,
    input_variables=['text'],
)
combine_prompt_template = PromptTemplate(
    template=combine_custom_prompt,
    input_variables=['text'],
)

In [None]:

# Map-reduce summarization: each piece is summarized with the map prompt, then the
# partial summaries are merged with the combine prompt.
summarize_options = dict(
    chain_type="map_reduce",
    verbose=True,
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
)
chain = load_summarize_chain(mixtral_instruct, **summarize_options)
chain.run(texts)