In [None]:
!pip install boto3==1.34.131

In [None]:
import pdfplumber

# Location of the PDF to process; replace the placeholder with a real path.
document = "<Your PDF data path goes here>"

# `pages` is indexed from zero, so index 1 selects the second page of the PDF.
PAGE_INDEX = 1

# pdf_obj is left open here: `page` is used again by the text-extraction cell.
pdf_obj = pdfplumber.open(document)
page = pdf_obj.pages[PAGE_INDEX]

In [None]:
# Pull the text of the selected page (with layout=True) and echo it so the
# parser output can be inspected before it is sent to the model.
PDF_TEXT = page.extract_text(layout=True)
print(PDF_TEXT)

In [None]:
# PNG file that is sent to the model together with the extracted text.
# NOTE(review): nothing in this notebook creates this file — presumably it is a
# rendering of the same PDF page produced beforehand; confirm.
image_path = './output_image.png'

# Load the raw image bytes; the request payload embeds them directly.
with open(image_path, mode="rb") as image_file:
    IMAGE = image_file.read()

In [None]:
# System prompt describing the overall task; adjacent literals are joined into
# a single string by implicit concatenation.
system_text = (
    "You will be given an image of a PDF page and the text extracted from that PDF using a parser. "
    "Your task is to extract all contents from the image, analyze them, and create a structured markdown output "
    "while comparing the extracted text with the provided PDF text. "
    "Maintain the original order and language of the content."
)


In [None]:
import os

# Values interpolated into the first part of the user prompt.
# Previously the template used doubled braces inside an f-string, which emit
# the literal text "{FILENAME}", "{PAGE_NUMBER}" and "{PDF_TEXT}"; since the
# string was never formatted afterwards, the model never saw the extracted text.
FILENAME = os.path.basename(document)  # file name only, without the directory part
PAGE_NUMBER = page.page_number         # page number as reported by pdfplumber

# Leading prompt section: document metadata plus the parser-extracted text.
# It is sent before the page image in the user message.
input_text_pre = f"""
First, you will be provided with the text extracted from the PDF:

<filename>
{FILENAME}
</filename>

<page_number>
{PAGE_NUMBER}
</page_number>

<pdf_text>
{PDF_TEXT}
</pdf_text>

Next, you will be given an image.
"""

# Trailing prompt section: step-by-step extraction instructions that follow the
# page image in the user message. It has no placeholders, so a plain string
# literal is used.
input_text_post = """
Follow these steps to complete the task:

1. Analyze the image content and identify the types of content present (graphs, tables, text).

2. For each type of content in the image:
   a. If it's a graph/diagram/figure:
      - Provide a short description of the graph/diagram/figure
      - Do not provide a fake link
   b. If it's a table:
      - Extract all cells correctly
      - Compare the extracted cell values with the corresponding section in the PDF text
      - Use the PDF text as the correct version if there are discrepancies
      - Format the table in markdown
      - Ensure each cell value is aligned with its column
   c. If it's text and not cell values of (b):
      - Extract the text
      - Compare the extracted text with the corresponding section in the PDF text
      - Use the PDF text as the correct version if there are discrepancies
      - Ensure there are no duplicates in both (b) and (c)

3. Maintain the original order of content as it appears in the image.

4. Use the **ORIGINAL MAIN LANGUAGE** in the PDF page, **DONOT** do translation.

5. Format the final content in clean, structured markdown.

6. Ensure that all content is included and accurately represented.

7. **DONOT** add any prefix before <result>.

Present your final result within <result> tags. The content inside these tags should be the clean, structured markdown representation of the PDF page content, suitable for use as RAG input.

Return <filename> and <page_number> info in a JSON format.

Remember to include all relevant information from both the PDF text and the image content, prioritizing accuracy and maintaining the original order and language of the content.
"""

In [None]:
import boto3

# Client for the Bedrock runtime service, used to invoke the model.
bedrock_client = boto3.client(service_name='bedrock-runtime')

# Model selection and sampling settings.
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
temperature = 0.5
top_k = 3

inference_config = {"temperature": temperature}
# top_k is passed through the additional model request fields, not inferenceConfig.
additional_model_fields = {"top_k": top_k}
system_prompts = [{"text": system_text}]

# A single user turn: leading prompt text, the page image, then the detailed steps.
image_block = {"image": {"format": 'png', "source": {"bytes": IMAGE}}}
message = {
    "role": "user",
    "content": [
        {"text": input_text_pre},
        image_block,
        {"text": input_text_post},
    ],
}
messages = [message]

# Send the conversation to the model and keep the raw response for the next cell.
response = bedrock_client.converse(
    system=system_prompts,
    messages=messages,
    modelId=model_id,
    additionalModelRequestFields=additional_model_fields,
    inferenceConfig=inference_config,
)

In [None]:
# Show the text of the first content block in the model's reply.
reply_blocks = response['output']['message']['content']
print(reply_blocks[0]['text'])