In [None]:
# Notebook shell magics: install/upgrade numpy and numexpr into the local
# ./python directory (pip --target). The following cells append ./python to
# sys.path so these copies can be imported.
!pip install --upgrade numpy --target ./python
!pip install --upgrade numexpr --target ./python

In [None]:
import sys
sys.path.append(r"./python")

import os
import json
from model import *

# Adjust the index (and language) values below to match your own setup.
index =  "finance_annual_report_demo_1031"
embedding_endpoint_name = "cohere.embed-multilingual-v3"


def detect_embedding_type(endpoint_name):
    """Return 'bedrock' for Titan/Cohere endpoint names, else 'sagemaker'.

    A substring test is used on purpose: ``str.find`` returns -1 (truthy)
    when the substring is absent and 0 (falsy) when it is found at the start
    of the string, so using its result as a boolean gives the wrong answer.
    """
    if 'titan' in endpoint_name or 'cohere' in endpoint_name:
        return 'bedrock'
    return 'sagemaker'


embedding_type = detect_embedding_type(embedding_endpoint_name)
# Embeddings object for the endpoint chosen above; the ingestion loop later
# calls embeddings.embed_documents(...) on it. init_embeddings_bedrock is not
# defined in this notebook -- presumably it comes from `from model import *`.
# NOTE(review): the Bedrock initializer is used regardless of embedding_type.
embeddings = init_embeddings_bedrock(embedding_endpoint_name)

In [None]:
import sys
sys.path.append(r"./python")

from tqdm import tqdm
import fitz
from PIL import Image
import numpy as np
import base64
from opensearch_multimodel_dataload import add_multimodel_documents
import re
import io
import time

# Model id passed to init_model_bedrock; exactly one assignment is active and
# the commented-out lines are alternative model ids.
# model_name = "anthropic.claude-3-5-sonnet-20240620-v1:0"
model_name = "anthropic.claude-3-sonnet-20240229-v1:0"
# model_name = "anthropic.claude-3-haiku-20240307-v1:0"

# model_name = "us.meta.llama3-2-90b-instruct-v1:0"

# init_model_bedrock is not defined in this cell -- presumably it comes from
# `from model import *` in an earlier cell. The returned object is later
# called as llm(prompt) after its model_kwargs attribute is set.
llm = init_model_bedrock(model_name)

# Character threshold used by the ingestion loop: longer summaries/paragraphs
# are split, and each indexed sentence is truncated to this length.
text_max_length = 300
# Not referenced in the visible part of this notebook.
llm_max_size = 1000

def is_json(myjson):
    """Return True when *myjson* parses as a JSON document, otherwise False.

    Args:
        myjson: Candidate JSON text (``str``, ``bytes`` or ``bytearray``).

    Returns:
        bool: ``True`` if ``json.loads`` accepts the value. ``False`` for
        malformed JSON and for values that are not text at all (for example
        ``None``), which ``json.loads`` rejects with ``TypeError``.
    """
    try:
        json.loads(myjson)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError and undecodable bytes;
        # TypeError covers non-text input such as None or a dict.
        return False
    return True

# Instruction text sent to the model together with each page image.
# Instruction 4 matters to the parsing code: the response is split on the
# literal '<summarize>' tag to separate the page content from its summary, so
# the tag name here must stay in sync with that code.
prompt = """
You are a document manager at an financial company and your task is to extract useful information from document images.
<instructions>
1.don't make up content.keep all the content in the documents
2.No preface, just output the document content directly.
3.Output the document in markdown format, and keep the rows and columns aligned for the table.
4.summarize page content to facilitate searching, output the summarize content in<summarize></summarize> tag after page content
</instructions>
"""


# Directory scanned for the source documents to index.
files_path = '../docs/finance_annual_report/'

# Inclusive range of zero-based page indices processed in every document.
first_page_index = 90
last_page_index = 130
# Pause before every model call; presumably this keeps the notebook under a
# service rate limit -- TODO confirm before lowering.
seconds_between_pages = 60


def table_row_to_text(header_cells, row_cells):
    """Render one markdown table row as comma-separated 'header:value' pairs.

    Args:
        header_cells: Cells of the table's first row (split on '|').
        row_cells: Cells of the data row (split on '|').

    Returns:
        str: One entry per header cell: 'header:value' when both are
        non-empty, the bare value when the header is empty, and the bare
        header when the value is empty or the row has fewer cells.
    """
    parts = []
    for column, header_cell in enumerate(header_cells):
        name = str(header_cell).strip()
        value = str(row_cells[column]).strip() if column < len(row_cells) else ''
        if name and value:
            parts.append(name + ':' + value)
        elif value:
            parts.append(value)
        else:
            parts.append(name)
    return ','.join(parts)


def collect_search_chunks(summary, page_text):
    """Return the unique, non-empty text chunks to index for one page.

    Args:
        summary: Model-written summary of the current page (may be empty).
        page_text: Previous page content plus current page content.

    Returns:
        list[str]: Stripped chunks in first-seen order. Texts longer than
        ``text_max_length`` are split (summary on newlines, paragraphs on
        '.'); markdown table rows are additionally flattened with their
        header via ``table_row_to_text``.
    """
    candidates = []
    if len(summary) > text_max_length:
        candidates.extend(summary.split('\n'))
    else:
        candidates.append(summary)

    header = []
    for paragraph in page_text.split('\n'):
        if len(paragraph) > text_max_length:
            candidates.extend(paragraph.split('.'))
        else:
            candidates.append(paragraph)

        if '|' in paragraph:
            if not header:
                # The first row of a run of '|' lines is taken as the header.
                header = paragraph.split('|')
            elif paragraph.replace('-', '').replace(':', '').replace('|', '').strip():
                # Rows made only of '-', ':', '|' and whitespace are markdown
                # separator rows and carry no data.
                candidates.append(table_row_to_text(header, paragraph.split('|')))
        elif header:
            # A line without '|' ends the current table.
            header = []

    # Strip first, then drop empties and duplicates: empty strings are useless
    # to embed and duplicates would be indexed twice.
    stripped = (candidate.strip() for candidate in candidates)
    return list(dict.fromkeys(chunk for chunk in stripped if chunk))


files = os.listdir(files_path)
for file in files:
    file_path = os.path.join(files_path, file)
    if not os.path.isfile(file_path):
        # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints).
        continue
    print(file_path)

    # The context manager closes the document even if a page fails.
    with fitz.open(file_path) as doc:
        previous_page_content = ''
        page_indices = range(
            max(first_page_index, 0),
            min(last_page_index, doc.page_count - 1) + 1,
        )

        # The page index has its own name so that no inner loop can overwrite
        # it before it is written into the page metadata.
        for page_index in tqdm(page_indices):
            time.sleep(seconds_between_pages)
            print('i:', page_index)
            page = doc.load_page(page_index)
            pix = page.get_pixmap(dpi=150)

            # Pixmap.tobytes() defaults to PNG; request JPEG explicitly so the
            # bytes match the declared image_type (needs PyMuPDF >= 1.22).
            imgb64 = base64.b64encode(pix.tobytes("jpeg")).decode("utf-8")
            llm.model_kwargs = {'image': imgb64, 'image_type': 'jpeg', 'max_tokens': 4096}
            response = llm(prompt)

            response_list = response.split('<summarize>')
            # NOTE(review): [:-4] drops the last four characters before the
            # summary tag; the reason is not evident here -- confirm it does
            # not trim real page content.
            current_page = response_list[0].replace('Table of Contents', '').replace('```', '')[:-4].strip()
            if len(response_list) > 1:
                current_summarize = response_list[1].replace('</summarize>', '').replace('```', '').strip()
            else:
                # The model omitted the <summarize> tag; index the page
                # content without a summary instead of raising IndexError.
                current_summarize = ''

            # Pairing each page with its predecessor keeps content that spans
            # a page break together in the stored text.
            two_page_content = previous_page_content + ' ' + current_page
            previous_page_content = current_page

            texts = []
            metadatas = []
            images = []

            for text in collect_search_chunks(current_summarize, two_page_content):
                print('text:', text)
                print('--------------')
                # Every chunk stores the full two-page text; only the chunk
                # itself (metadata['sentence']) is embedded.
                texts.append(two_page_content)
                metadatas.append({
                    'sentence': text[:text_max_length],
                    'sources': file.split('/')[-1],
                    'page': (str(page_index - 1) + ' to ' + str(page_index)
                             if page_index > 0 else str(page_index)),
                })

            if texts:
                sentences = [metadata['sentence'] for metadata in metadatas]
                if embedding_type == 'bedrock':
                    text_embeddings = embeddings.embed_documents(sentences)
                else:
                    text_embeddings = embeddings.embed_documents(sentences, chunk_size=10)

                print('texts len:', len(texts))
                print('metadatas len:', len(metadatas))
                print('embeddings len:', len(text_embeddings))
                print('images len:', len(images))
                print('begin to save in vectore store')

                add_multimodel_documents(
                    index,
                    texts=texts,
                    embeddings=text_embeddings,
                    metadatas=metadatas,
                    images=images
                )
                print('finish save in vectore store:', index)

