In [None]:
pip install --force-reinstall amazon-textract-textractor==1.7.1

In [439]:
import io
import json
import logging
import os
import re

import boto3
import pandas as pd
from botocore.config import Config
from PIL import Image
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.visualizers.entitylist import EntityList

# Bedrock runtime client used to invoke the LLM.
from botocore.config import Config
import boto3

config = Config(
    # Maximum time (secs) the client waits for data to be received from the server.
    read_timeout=600,
    # Maximum number of retry attempts that will be made on a single request.
    retries={"max_attempts": 5},
)
bedrock_runtime = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-1',
    config=config,
)

In [None]:
def bedrock_streemer(response,model):
    """Print and accumulate the text of a Bedrock streaming response.

    Args:
        response: Mapping returned by ``invoke_model_with_response_stream``; its
            ``'body'`` entry is iterated as a stream of events, each optionally
            carrying a ``'chunk'`` whose ``'bytes'`` hold a JSON document.
        model: Model identifier. The substring it contains ('claude', 'titan',
            'cohere' or 'llama2', case-insensitive) selects which JSON field holds
            the generated text.

    Returns:
        The concatenation of all streamed text pieces ("" when there is no body).

    Raises:
        ValueError: If a chunk arrives but ``model`` matches none of the supported
            families (previously this surfaced as an ``UnboundLocalError`` or,
            worse, silently re-used the previous chunk's text).
    """
    model_name = model.lower()
    extractors = (
        ('claude', lambda obj: obj['completion']),
        ('titan', lambda obj: obj["outputText"]),
        ('cohere', lambda obj: obj["generations"][0]['text']),
        ('llama2', lambda obj: obj["generation"]),
    )
    extract_text = None
    for family, extractor in extractors:
        if family in model_name:
            # A later match overrides an earlier one, mirroring the original if-sequence.
            extract_text = extractor

    answer = ""
    stream = response.get('body')
    if not stream:
        return answer

    for event in stream:
        chunk = event.get('chunk')
        if not chunk:
            continue
        if extract_text is None:
            raise ValueError(f"Unsupported model for streaming: {model!r}")
        text = extract_text(json.loads(chunk.get('bytes').decode()))
        answer += text
        print(text, end="")
    return answer

In [None]:
class LineIterator:
    """
    Iterate over newline-delimited lines carried by a stream of PayloadPart events.

    Each event is expected to look like ``{'PayloadPart': {'Bytes': b'...'}}``.
    A single JSON line is usually delivered in one event, but this is not guaranteed;
    a line may be split across events, for example:
    ```
    {'PayloadPart': {'Bytes': b'{"outputs": '}}
    {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
    ```

    The iterator therefore appends every payload to an in-memory buffer and yields one
    line (without its trailing '\n') at a time, remembering the read position so that
    bytes already returned are never exposed again. Requires the ``io`` module to be
    imported at module level.
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line and line[-1] == ord('\n'):
                self.read_pos += len(line)
                return line[:-1]
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # The stream ended without a final newline: hand back the trailing
                    # partial line exactly once. (The original `continue` here spun
                    # forever because no further bytes could ever arrive.)
                    self.read_pos += len(line)
                    return line
                raise
            if 'PayloadPart' not in chunk:
                # str() is required: the event is a mapping, not a string.
                print('Unknown event type:' + str(chunk))
                continue
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk['PayloadPart']['Bytes'])
            
def jumpstart_streemer(output):
    """Print and accumulate the tokens of a streaming endpoint response.

    Reads ``output['Body']`` line by line through ``LineIterator``, parses the JSON
    object found in each non-empty line (starting at its first ``{``) and appends
    ``data['token']['text']`` to the answer unless it equals the stop token
    ``<|endoftext|>``.

    Returns:
        The concatenated token text.
    """
    stop_token = '<|endoftext|>'
    start_json = b'{'
    answer = ""
    for line in LineIterator(output['Body']):
        if line == b'' or start_json not in line:
            continue
        payload_text = line[line.find(start_json):].decode('utf-8')
        data = json.loads(payload_text)
        if data['token']['text'] == stop_token:
            continue
        answer += data['token']['text']
        print(data['token']['text'], end="")
    return answer

In [None]:
# Copy the sample PDF from the working directory to S3 via the AWS CLI.
# NOTE(review): BUCKET-NAME looks like a placeholder to be replaced; the later
# analysis cell reads from s3://BUCKET/... — confirm both point at the same bucket.
!aws s3 cp amzn-20221231.pdf s3://BUCKET-NAME

In [440]:
# Textractor client configured for the us-east-1 region; used below to start the document analysis.
extractor = Textractor(region_name="us-east-1")

In [443]:
# S3 location of the PDF; its base name doubles as the document id stored with every indexed chunk.
file = "s3://BUCKET/amzn-20221231.pdf"
file_name = os.path.basename(file)
doc_id = file_name

# Run Textract with the LAYOUT and TABLES features. The request token is the file
# name up to its first '.'.
document = extractor.start_document_analysis(
    file_source=file,
    features=[TextractFeatures.LAYOUT, TextractFeatures.TABLES],
    client_request_token=doc_id.split('.')[0],
    save_image=False,
)

In [522]:
from textractor.data.text_linearization_config import TextLinearizationConfig

config = TextLinearizationConfig(
    # Layout elements left out of the linearized text.
    hide_figure_layout=True,
    hide_header_layout=True,
    hide_footer_layout=True,
    hide_page_num_layout=True,
    # Markers wrapped around titles, tables and lists; the cells below split on
    # "<tables>" and look for "<table>", "<title>" and "<list>" to find each element.
    title_prefix="<title> ",
    title_suffix="</title> ",
    table_prefix="<tables><table>",
    table_suffix="</table>",
    list_layout_prefix="<list>",
    list_layout_suffix="</list>",
)

# Preview the linearized text of the page at index 84.
print(document.pages[84].get_text(config=config))







Intangible Assets 

Acquired identifiable intangible assets are valued primarily by using discounted cash flows. These assets are included within "Other assets" on our consolidated balance sheets and consist of the following (in millions): 



<tables><table>	December 31, 						
	2021 			2022 			
	Acquired Intangibles, Gross (1) 	Accumulated Amortization (1) 	Acquired Intangibles, Net 	Acquired Intangibles, Gross (1) 	Accumulated Amortization (1) 	Acquired Intangibles, Net 	Weighted Average Life Remaining 
Finite-lived intangible assets (2): 							
Marketing-related 	$ 2,286 	$ (548) 	$ 1,738 	$ 2,407 	$ (601) 	$ 1,806 	18.6 
Contract-based 	2,327 	(565) 	1,762 	3,661 	(813) 	2,848 	12.8 
Technology- and content-based 	976 	(610) 	366 	883 	(643) 	240 	3.2 
Customer-related 	197 	(103) 	94 	184 	(128) 	56 	2.2 
Total finite-lived intangible assets 	$ 5,786 	$ (1,826) 	$ 3,960 	$ 7,135 	$ (2,185) 	$ 4,950 	14.4 
IPR&D and other (3) 	$ 1,147 		$ 1,147 	$ 1,147 		$ 1,147 	
Total ac

In [513]:
def split_list_items_(items):
    """Split text into lines while keeping each ``<list>…</list>`` span as one item.

    Text outside list tags is split on newlines (pieces are not stripped). Text
    between ``<list>`` and ``</list>`` is stripped and re-joined with its tags into a
    single entry. A ``<list>`` that is never closed contributes nothing to the result.

    Args:
        items: Linearized page text that may contain ``<list>``/``</list>`` tags.

    Returns:
        List of strings in document order.
    """
    output = []
    inside_list = False
    buffer = []  # pieces of the list currently being assembled

    for token in re.split("(<list>|</list>)", items):
        if token == "<list>":
            inside_list = True
            buffer = [token]
        elif token == "</list>":
            inside_list = False
            buffer.append(token)
            output.append("".join(buffer))
            buffer = []
        elif inside_list:
            buffer.append(token.strip())
        else:
            output.extend(token.split('\n'))
    return output

In [514]:
# Walk every page, and replace each free-text table with a CSV-formatted table
# demarcated by <table> xml tags. Non-tabular text is split into one item per line,
# except that a <list>…</list> span is kept together as a single item.
#
# Results:
#   document_holder: page index -> ordered list of items (text lines, "<table>…</table>", "<list>…</list>")
#   table_page:      page index -> list of CSV strings, one per table found on the page
# Requires `io` to be imported at module level.
csv_seperator="|"
document_holder={}
table_page={}
count=0  # running index into document.tables across all pages
pattern = re.compile(r'<table>(.*?)(</table>)', re.DOTALL)  ## locates the linearized table text inside a segment


def _split_non_table_text(text):
    """Return the non-empty, stripped items of `text`, keeping each <list> span whole."""
    if "<list>" in text:
        pieces = split_list_items_(text)
    else:
        pieces = text.split("\n")
    return [x.strip() for x in pieces if x.strip()]


for ids,page in enumerate(document.pages):
    # Linearize the page once and reuse the text for both the sanity check and the split.
    page_text = page.get_text(config=config)

    # Number of tables in the linearized text, found via the table prefix set earlier.
    table_count = len([word for word in page_text.split() if "<tables><table>" in word])
    # The linearized text must agree with the tables extracted by the Textract TABLES feature.
    assert table_count==len(page.tables), f"page {ids}: {table_count} table markers but {len(page.tables)} tables"

    content = page_text.split("<tables>")
    document_holder[ids]=[]
    for idx,item in enumerate(content):
        if "<table>" not in item:
            document_holder[ids].extend(_split_non_table_text(item))
            continue

        raw_csv = document.tables[count].to_pandas(use_columns=False).to_csv(header=False, index=None, sep=csv_seperator)
        csv_rows = raw_csv.split("\n")
        # With a single row the headers are at row 0; otherwise they may be at row 0 or row 1.
        row_count = len([x for x in csv_rows if x])
        if row_count>1:
            # Textract sometimes emits an empty first row with the true headers at index 1.
            first_row_is_blank = all(value.strip() == '' for value in csv_rows[0].split(csv_seperator))
            if not first_row_is_blank:
                row_count=1  # true column headers sit at row index 0
        df=pd.read_csv(io.StringIO(raw_csv), sep=csv_seperator,
                       header=0 if row_count==1 else 1, keep_default_na=False)
        # Replace the pandas "Unnamed:" placeholder for empty column names with an empty string.
        df.rename(columns=lambda x: '' if x.startswith('Unnamed:') else x, inplace=True)
        table=df.to_csv(index=None, sep=csv_seperator)
        table_page.setdefault(ids, []).append(table)

        # Everything after the closing </table> tag is ordinary text that follows the table.
        table_match = re.search(pattern, item)
        remaining_content = item[table_match.end():] if table_match else item
        content[idx]=f"<table>{table}</table>"  ## xml tags differentiate the table from other text
        count+=1
        document_holder[ids].extend([content[idx]]+_split_non_table_text(remaining_content))



In [515]:
print("\n".join(document_holder[84]))

Intangible Assets
Acquired identifiable intangible assets are valued primarily by using discounted cash flows. These assets are included within "Other assets" on our consolidated balance sheets and consist of the following (in millions):
<table>|December 31, ||||||
|2021 |||2022 |||
|Acquired Intangibles, Gross (1) |Accumulated Amortization (1) |Acquired Intangibles, Net |Acquired Intangibles, Gross (1) |Accumulated Amortization (1) |Acquired Intangibles, Net |Weighted Average Life Remaining 
Finite-lived intangible assets (2): |||||||
Marketing-related |$ 2,286 |$ (548) |$ 1,738 |$ 2,407 |$ (601) |$ 1,806 |18.6 
Contract-based |2,327 |(565) |1,762 |3,661 |(813) |2,848 |12.8 
Technology- and content-based |976 |(610) |366 |883 |(643) |240 |3.2 
Customer-related |197 |(103) |94 |184 |(128) |56 |2.2 
Total finite-lived intangible assets |$ 5,786 |$ (1,826) |$ 3,960 |$ 7,135 |$ (2,185) |$ 4,950 |14.4 
IPR&D and other (3) |$ 1,147 ||$ 1,147 |$ 1,147 ||$ 1,147 |
Total acquired intangibles |

In [516]:
import re
import pandas as pd
from io import StringIO

# Build word-bounded chunks per page from `document_holder`.
#
#   chunks:            page index -> list of chunks; each chunk is a list of strings
#   entire_list_chunk: every chunk emitted so far, used to carry overlap across chunks/pages
#   table_header_dict: page index -> header line (or "") for each table on the page
#
# Plain text is accumulated until `max_words` would be exceeded. Tables are chunked by
# rows and lists by items; every table chunk repeats the column names so that each
# chunk is self-describing. The page TITLE (upper-cased) is prepended to emitted chunks.
max_words = 200
chunks = {}
entire_list_chunk=[]
table_header_dict={} ## hold table headers to be used for chunk indexing
list_header_dict={}
overlap=50
list_pattern = re.compile(r'<list>(.*?)</list>', re.DOTALL)   ## Grab all list contents within the list xml tags


def _rows_to_csv(rows):
    """Join the column-name row and the data rows into '|'-separated lines."""
    return "\n".join("|".join(str(x) for x in r) for r in rows)


for page, lines in document_holder.items():
    page_chunks = []
    current_chunk = []
    num_words = 0
    start_page=0
    TITLE=None
    # Initialised per page so the flag is always defined; previously a titled page that
    # overflowed `max_words` before any overlap had been added raised a NameError.
    first_page_portion=False
    # enumerate() supplies the true position of each line; lines.index(line) returned the
    # first occurrence and so picked the wrong header / "last item" for duplicated lines.
    for line_index, line in enumerate(lines):
        if line.strip()!= '':
            # Comment this code block out if you dont want to include topic chunking
            if "<title>" in line:
                TITLE=line.split("<title>")[-1].split("</title>")[0]
                line=TITLE
                TITLE=TITLE.upper()

            if len(current_chunk)<2 and entire_list_chunk and start_page==0:
                cumulative_word_count = 0
                items_within_threshold = 0

                # Iterate through the previous chunk from the bottom up and carry over up to `overlap` words
                for item in reversed(entire_list_chunk[-1]):
                    words = item.split()
                    if cumulative_word_count + len(words) <= overlap:
                        cumulative_word_count += len(words)
                        items_within_threshold += 1
                        current_chunk.insert(0, item)  # Insert at the beginning to maintain order
                    else:
                        break
                first_page_portion=True
                num_words=cumulative_word_count
                start_page=1

            next_num_words = num_words + len(re.findall(r'\w+', line))

            if  "<table>" not in line and "<list>" not in line:

                if next_num_words > max_words:
                    if TITLE :
                        # The first chunk after an overlap is emitted without the title.
                        if first_page_portion:
                            first_page_portion=False
                        else:
                            current_chunk.insert(0, TITLE.strip())

                    page_chunks.append(current_chunk)
                    entire_list_chunk.append(current_chunk)
                    current_chunk = []
                    num_words = 0

                current_chunk.append(line)
                num_words += len(re.findall(r'\w+', line))

            # Goal is to segment out table items and chunk intelligently.
            # We chunk the table by rows and for each chunk of the table we append the table column headers
            # and table headers if any. This way we preserve the table information across each chunk.
            # This will help improve semantic search where all the chunks relating to a table would be in the
            # top k=n response giving the LLM complete information on the table.

            if "<table>" in line:
                # Table header is usually the line before the table in the document. There is none when the
                # table is the first item on the page or directly follows another table or list.
                if line_index!=0 and "<table>" not in lines[line_index-1] and "<list>" not in lines[line_index-1]:
                    header=lines[line_index-1]
                else:
                    header=""

                if page in table_header_dict:
                    table_header_dict[page].append(header)
                else:
                    table_header_dict[page]=[header]

                table = line.split("<table>")[-1].split("</table>")[0] # get table from demarcators
                df=pd.read_csv(StringIO(table), sep=csv_seperator, keep_default_na=False)
                df.rename(columns=lambda x: '' if x.startswith('Unnamed:') else x, inplace=True)

                table_chunks = []
                curr_chunk = [df.columns.to_list()] #start current chunk with table column names
                words=len(re.findall(r'\w+', str(current_chunk)+" "+str(curr_chunk)))
                # Iterate through the rows in the table.
                # NOTE(review): itertuples() includes the row index as the first field, so each data
                # row carries one more column than the column-name row — confirm this is intended.
                for row in df.itertuples():
                    curr_chunk.append(row)
                    words+=len(re.findall(r'\w+', str(row)))

                    if words > max_words:
                        tab_chunk=_rows_to_csv(curr_chunk) #join chunk lines together to form a csv
                        table_chunks.append(tab_chunk)
                        words = len(re.findall(r'\w+', str(curr_chunk[0]))) # set word count to word length of column header names

                        if header: #If header attach header to table
                            if current_chunk and current_chunk[-1]==header: #check if header is in the chunk and remove to avoid duplicacy of header in chunk
                                current_chunk.pop(-1)

                            # NOTE(review): strip() removes the newline before the check, so a newline is
                            # appended on every pass through this branch — confirm this is intended.
                            header=header+"\n" if not header.strip().endswith('\n') else header  # add new line char to separate from table
                            if TITLE:
                                current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([header]+[tab_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append([header]+table_chunks)
                        else:
                            if TITLE:
                                current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([tab_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append(table_chunks)

                        num_words=0
                        current_chunk=[]
                        curr_chunk = [curr_chunk[0]]

                if curr_chunk != [df.columns.to_list()] and line_index == len(lines)-1: #if table chunk still remaining and table is last item in page append as last chunk
                    tab_chunk=_rows_to_csv(curr_chunk)
                    table_chunks.append(tab_chunk)
                    if header:
                        if current_chunk and current_chunk[-1]==header: #check if header is in the chunk and remove to avoid duplicacy of header in chunk
                            current_chunk.pop(-1)
                        header=header+"\n" if not header.strip().endswith('\n') else header
                        if TITLE:
                            current_chunk.insert(0, TITLE.strip())
                        current_chunk.extend([header]+[tab_chunk])
                        page_chunks.append(current_chunk)
                        entire_list_chunk.append([header]+table_chunks)
                    else:
                        if TITLE:
                            current_chunk.insert(0, TITLE.strip())
                        current_chunk.extend([tab_chunk])
                        page_chunks.append(current_chunk)
                        entire_list_chunk.append(table_chunks)
                    num_words=0
                    current_chunk=[]
                elif curr_chunk != [df.columns.to_list()] and line_index != len(lines)-1: #if table is not last item in page and max word threshold is not reached, carry over to next loop
                    tab_chunk=_rows_to_csv(curr_chunk)
                    table_chunks.append(tab_chunk)
                    if header:
                        if current_chunk and current_chunk[-1]==header: #check if header is in the chunk and remove to avoid duplicacy of header in chunk
                            current_chunk.pop(-1)
                        header=header+"\n" if not header.strip().endswith('\n') else header
                        current_chunk.extend([header]+[tab_chunk])
                    else:
                        current_chunk.extend([tab_chunk])
                    num_words=words

            # Goal is to segment out list items and chunk intelligently.
            # We chunk each list by items in the list and
            # for each list chunk we append the list header to the chunk to preserve the information of the list across chunks.
            # This would boost retrieval process where question pertaining to a list will have all list chunks within
            # the topK=n responses.

            if "<list>" in line:
                # List header is usually the line before the list in the document; there is none when the
                # previous item on the page is a table or another list.
                if line_index!=0 and "<table>" not in lines[line_index-1] and "<list>" not in lines[line_index-1]:
                    header=lines[line_index-1]
                else:
                    header=""
                list_match = re.search(list_pattern, line)
                list_ = list_match.group(1)
                list_lines=list_.split("\n")

                curr_chunk = []
                words=len(re.findall(r'\w+', str(current_chunk)))  #start word count from any existing chunk
                # Iterate through the items in the list
                for item in list_lines:
                    curr_chunk.append(item)
                    words+=len(re.findall(r'\w+', item))

                    if words >= max_words:
                        words = 0 # restart word count to zero

                        list_chunk="\n".join(curr_chunk)
                        if header: # If header attach header to list
                            if current_chunk and current_chunk[-1]==header: #check if header is in the chunk and remove to avoid duplicacy of header in chunk
                                current_chunk.pop(-1)
                            header=header+"\n" if not header.strip().endswith('\n') else header
                            if TITLE:
                                current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([header]+[list_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append([header]+[list_chunk])
                        else:
                            if TITLE:
                                current_chunk.insert(0, TITLE.strip())
                            current_chunk.extend([list_chunk])
                            page_chunks.append(current_chunk)
                            entire_list_chunk.append([list_chunk])

                        num_words=0
                        current_chunk=[]
                        curr_chunk = []

                if curr_chunk  and line_index == len(lines)-1: #if list chunk still remaining and list is last item in page append as last chunk
                    list_chunk="\n".join(curr_chunk)
                    if header:
                        if current_chunk and current_chunk[-1]==header: #check if header is in the chunk and remove to avoid duplicacy of header in chunk
                            current_chunk.pop(-1)
                        header=header+"\n" if not header.strip().endswith('\n') else header
                        if TITLE:
                            current_chunk.insert(0, TITLE.strip())
                        current_chunk.extend([header]+[list_chunk])
                        page_chunks.append(current_chunk)
                        entire_list_chunk.append([header]+[list_chunk])
                    else:
                        if TITLE:
                            current_chunk.insert(0, TITLE.strip())
                        current_chunk.extend([list_chunk])
                        page_chunks.append(current_chunk)
                        entire_list_chunk.append([list_chunk])
                    num_words=0
                    current_chunk=[]
                elif curr_chunk and line_index != len(lines)-1: #if list is not last item in page and max word threshold is not reached, carry over to next loop
                    list_chunk="\n".join(curr_chunk)
                    if header:
                        if current_chunk and current_chunk[-1]==header: #check if header is in the chunk and remove to avoid duplicacy of header in chunk
                            current_chunk.pop(-1)
                        header=header+"\n" if not header.strip().endswith('\n') else header
                        current_chunk.extend([header]+[list_chunk])
                    else:
                        current_chunk.extend([list_chunk])
                    num_words=words

    # Flush whatever is left at the end of the page as its final chunk.
    if current_chunk:
        if TITLE:
            current_chunk.insert(0, TITLE.strip())
        page_chunks.append(current_chunk)
        entire_list_chunk.append(current_chunk)
        current_chunk=[]
    chunks[page] = page_chunks

In [520]:
# Print every chunk built for the page at index 84, one chunk item per line,
# with a numbered heading before each chunk and a blank gap after it.
for chunk_number, chunk_items in enumerate(chunks[84], start=1):
    print(f'Chunk {chunk_number}:')
    for chunk_item in chunk_items:
        print(chunk_item)
    print('\n')

Chunk 1:
NOTE 5 - ACQUISITIONS, GOODWILL, AND ACQUIRED INTANGIBLE ASSETS
(1) Primarily includes changes in foreign exchange rates.
Intangible Assets
Acquired identifiable intangible assets are valued primarily by using discounted cash flows. These assets are included within "Other assets" on our consolidated balance sheets and consist of the following (in millions):

|December 31, ||||||
0||2021 |||2022 |||
1||Acquired Intangibles, Gross (1) |Accumulated Amortization (1) |Acquired Intangibles, Net |Acquired Intangibles, Gross (1) |Accumulated Amortization (1) |Acquired Intangibles, Net |Weighted Average Life Remaining 
2|Finite-lived intangible assets (2): |||||||
3|Marketing-related |$ 2,286 |$ (548) |$ 1,738 |$ 2,407 |$ (601) |$ 1,806 |18.6 
4|Contract-based |2,327 |(565) |1,762 |3,661 |(813) |2,848 |12.8 
5|Technology- and content-based |976 |(610) |366 |883 |(643) |240 |3.2 
6|Customer-related |197 |(103) |94 |184 |(128) |56 |2.2 


Chunk 2:
Acquired identifiable intangible assets 

In [None]:
# pip install requests-aws4auth
# pip install opensearch-py
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

# Amazon OpenSearch domain that uses IAM for authentication: requests are signed
# with the credentials of the default boto3 session.
domain_endpoint = "AMAZON OPENSEARCH PROVISIONED DOMAIN ENDPOINT"
service = 'es'
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    "us-east-1",
    service,
    session_token=credentials.token,
)
os_ = OpenSearch(
    hosts=[{'host': domain_endpoint, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    timeout=120,
    connection_class=RequestsHttpConnection,
)

# Index definition: k-NN enabled, one vector field plus the metadata stored with each chunk.
mapping = {
    'settings': {
        'index': {
            'knn': True,
            "knn.algo_param.ef_search": 64,
        },
    },
    'mappings': {
        'properties': {
            # Chunk embedding; the dimension must match the embedding model's output size.
            'embedding': {
                'type': 'knn_vector',
                'dimension': 384,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "lucene",
                    "parameters": {
                        "ef_construction": 72,
                        "m": 72,
                    },
                },
            },
            'passage_id': {'type': 'keyword'},
            'passage': {'type': 'text'},
            'doc_id': {'type': 'keyword'},
            'table': {'type': 'text'},
        },
    },
}
# Create the index on first run and report the outcome; an existing index is left untouched.
domain_index = "experiment9"

if os_.indices.exists(index=domain_index):
    print(f'{domain_index} Index already exists!')
else:
    os_.indices.create(index=domain_index, body=mapping)
    # Verify that the index has been created
    created = os_.indices.exists(index=domain_index)
    if created:
        print(f"Index {domain_index} created successfully.")
    else:
        print(f"Failed to create index '{domain_index}'.")

# Embed every chunk with the SageMaker embedding endpoint and index it into OpenSearch
# together with the document name, page number and the tables found on that page.
# `logging` must be imported at module level.
import boto3
from opensearchpy import RequestError  # previously referenced in `except` without being imported

i = 1  # 1 + number of index requests that returned without raising
SAGEMAKER=boto3.client('sagemaker-runtime')
for page, chunkks in chunks.items(): # Iterate through dict with chunk page# and content
    chunk_id = page # every chunk of a page shares the page number as its passage id

    # The table text depends only on the page, so build it once instead of once per chunk.
    # Pages without tables get an empty string (the field is mapped as text), not a list.
    table_parts=[]
    for ids,item in enumerate(table_page.get(page, [])):
        header=table_header_dict[page][ids]
        if header.strip():
            header=f"<table_header>{header}</table_header>"
        tsv=f"<table>{item}</table>"
        table_parts.append(header)
        table_parts.append(tsv)
    table="\n".join(table_parts)

    for chunk in chunkks:
        passage_chunk="\n".join(chunk)
        payload = json.dumps({'text_inputs': [passage_chunk]}).encode('utf-8')

        response = SAGEMAKER.invoke_endpoint(EndpointName="jumpstart-dft-hf-textembedding-all-minilm-l6-v2-2x",
                                             ContentType='application/json',
                                             Body=payload)

        model_predictions = json.loads(response['Body'].read())
        embedding = model_predictions['embedding'][0]

        documentt = {
            'doc_id':doc_id, #doc name
            'passage_id': chunk_id, #page number
            'passage': passage_chunk,
            'embedding': embedding,
            'table':table #table
        }
        try:
            response = os_.index(index=domain_index, body=documentt)
            i += 1
            # Check the response to see if the indexing was successful
            if response["result"] == "created":
                print(f"Document indexed successfully with ID: {response['_id']}")
            else:
                print("Failed to index document.")
        except RequestError as e:
            logging.error(f"Error indexing document to index '{domain_index}': {e}")
        

In [None]:
question = "operating income loss in 2022 for North america"

# Embed the question with the embedding endpoint.
response = SAGEMAKER.invoke_endpoint(
    EndpointName="ALL-MINI-LM2 SageMaker Model Endpoint",
    ContentType='application/json',
    Body=json.dumps({'text_inputs': [question]}),
)
model_predictions = json.loads(response['Body'].read())
embedding = model_predictions['embedding'][0]

# k-NN query returning the 3 nearest chunks.
knn_clause = {"embedding": {"vector": embedding, "k": 3}}
query = {
    'size': 3,
    'query': {"knn": knn_clause},
}
response = os_.search(index=domain_index, body=query)
response

In [465]:
def _ordinal(n):
    """Return `n` followed by its English ordinal suffix, e.g. 1 -> '1st', 2 -> '2nd', 11 -> '11th'."""
    if 10 <= n % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"


# Unpack the search hits into parallel lists.
res=response['hits']['hits']
score = [str(x['_score']) for x in res]  #retrieval score
passage = [x['_source']['passage'] for x in res] #retrieved passages
page_no = [x['_source']['passage_id'] for x in res] #doc page number of chunks
doc_name = [x['_source']['doc_id'] for x in res] # doc names
tables=[x['_source']['table'] for x in res] # tables in the corresponding chunk doc pages

## Concatenate passages and tables, each wrapped in rank-specific tags (<1st_passage>, <2nd_passage>, ...).
## The original called `p.ordinal(...)` on a name that is never defined in this notebook;
## the local helper above produces the same "1st"/"2nd"/"3rd" labels without that dependency.
passages=""
tab=""
for  ids,text in enumerate(passage):
    rank=_ordinal(ids+1)
    passages+=f"<{rank}_passage>\n{text}\n</{rank}_passage>\n"
    tab+=f"<{rank}_passage_table>\n{tables[ids]}\n</{rank}_passage_table>\n"

## Bedrock Anthropic LLM Inference

In [471]:
# Prompt for the Anthropic model: the retrieved passages go inside <document>, the
# tables of the corresponding pages inside <additional_information>, followed by the
# answering instructions and the user question.
prompt_template=f"""You are a helpful, obedient and truthful financial assistance.

<document>
{passages}
</document>          

<additional_information>
{tab}
</additional_information>

<instructions>
When providing your response based on the document:
1. Understand the question to know what is being asked of you.
2. Review the entire document provided and check if it contains relevant information to answer the question. Only pay attention to passages with relevant information.
3. If the document is sufficient to answer the question, provide a comprehensive answer ENTIRELY based on the document provided. DO NOT make up answers not present in the document.
4. If the answer is not available in the document, say so.
</instructions>

Question: {question}"""
# Wrap the template in Human/Assistant turns and pre-fill the start of the assistant's answer.
prompt=f"\n\nHuman:{prompt_template}\n\nAssistant: Based on the document,"

In [None]:
model = 'anthropic.claude-v2'

# JSON request body: the prompt plus sampling settings.
prompts = json.dumps({
    "prompt": prompt,
    "max_tokens_to_sample": 300,
    "temperature": 0.1,
})

# Stream the completion and print it as it arrives; `output` holds the full text.
response = bedrock_runtime.invoke_model_with_response_stream(
    body=prompts,
    modelId=model,
    accept="application/json",
    contentType="application/json",
)
output = bedrock_streemer(response, model)

## Mixtral 8x7b Inference

In [479]:
# Prompt for the Mixtral endpoint: same retrieved passages and tables as above, delimited
# with #### markers, followed by the answering rules, the question and a pre-filled answer opening.
# NOTE(review): the <<SYS>> … <</SYS>> markers look like Llama-2 chat syntax — confirm
# they are what the deployed Mixtral endpoint expects.
prompt=f"""<s><<SYS>>[INST]
You are a helpful, obedient and truthful assistant. You will only provide answers entirely based on the document provided below.

Here is a document:
####
{passages}
####

Here is additional information:
####
{tab}
####

When providing your response based on the provided document:
1. Understand the question to know what is being asked of you.
2. Review the entire document provided and check if it contains relevant information to answer the question. Only pay attention to sections with relevant information.
3. If the document is sufficient to answer the question, provide a comprehensive answer ENTIRELY based on the document provided. DO NOT make up answers not present in the document.
4. If the answer is not available in the document, say so.<</SYS>>

Question: {question}[/INST]
Answer: According to the document provided,"""

In [None]:
# Streaming request for the Mixtral endpoint: generate up to 300 new tokens and
# return only the generated continuation, not the prompt.
generation_parameters = {
    "max_new_tokens": 300,
    "return_full_text": False,
}
payload = {
    "inputs": prompt,
    "parameters": generation_parameters,
    "stream": True,
}
output = SAGEMAKER.invoke_endpoint_with_response_stream(
    Body=json.dumps(payload),
    EndpointName="MIXTRAL ENDPOINT SAGEAMKER",
    ContentType="application/json",
)
answer = jumpstart_streemer(output)
