In [None]:
!pip install azure
!pip install base64
!pip install openai
!pip install re
!pip install fitz
!pip install typing
!pip install PyMuPDF
!pip install pillow

In [None]:
# Load key/value pairs from a local .env file into the process environment.
# The cells below read their Azure endpoints and keys with os.getenv, so this
# cell has to run before them.
from dotenv import load_dotenv
load_dotenv()

In [None]:
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
import os

# --- Extract the plain text of the page with Azure Document Intelligence ---

# Endpoint and key come from the environment (populated from .env above).
azure_doc_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")
if not azure_doc_intelligence_endpoint or not doc_intelligence_key:
    # Fail early with a readable message instead of an error from inside the SDK.
    raise RuntimeError(
        "AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and AZURE_DOCUMENT_INTELLIGENCE_KEY "
        "must be set (for example in the .env file)."
    )

client = DocumentAnalysisClient(azure_doc_intelligence_endpoint, AzureKeyCredential(doc_intelligence_key))

input_filepath = ""  # insert path of page_1.pdf here
if not input_filepath:
    raise ValueError("Set input_filepath to the path of page_1.pdf before running this cell.")

# "prebuilt-read" is the OCR/text-extraction model; result() blocks until the
# long-running analysis has finished.
with open(input_filepath, "rb") as f:
    poller = client.begin_analyze_document("prebuilt-read", document=f)
    result = poller.result()

# One OCR line per text line, in page order; every line is newline-terminated.
full_text = "".join(
    line.content + "\n"
    for page in result.pages
    for line in page.lines
)

print(full_text)

In [None]:
import base64
import fitz

# --- Render the first PDF page to a PNG and base64-encode it for the vision model ---

input_filepath = ""  # insert path of page_1.pdf here
if not input_filepath:
    raise ValueError("Set input_filepath to the path of page_1.pdf before running this cell.")

# The context manager closes the PDF once the page has been rasterised; the
# pixmap owns its own pixel buffer and stays valid afterwards.
with fitz.open(input_filepath) as doc:
    page = doc[0]
    pix = page.get_pixmap(dpi=150)

# The PNG is kept on disk because the figure-cropping cell reads "page1.png" again.
pix.save("page1.png")

with open("page1.png", "rb") as image_file:
    base64_image = base64.b64encode(image_file.read()).decode("utf-8")

In [None]:
from openai import AzureOpenAI

# --- Ask the vision-capable chat model to describe every figure on the page ---

# All connection settings come from the environment.
deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT")
client = AzureOpenAI(
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_deployment=deployment_name,
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_KEY"),
)

# Instruction text: one "[Description:... Location:...]" entry per figure, where
# the location is given as the text that follows (or precedes) the figure.
prompt = """You are an intelligent document analyst.
Given this document page, identify all images, figures or tables.
For each figure:
1. Provide a detailed description of what the figure or image shows.
2. For the location just provide what the next few lines of text say, be precise (at least 2 sentences)!. If you cant find sentences afterwards (if the figure is at the end of the page) provide the previous sentences.

Just return the structured list for each image/figure/table based on what you found ALWAYS use this structure:
[Description:...
Location:...]
."""

# The rendered page travels inline as a base64 data URL next to the instructions.
page_image_url = "data:image/png;base64," + base64_image
user_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": page_image_url}},
    ],
}

response = client.chat.completions.create(
    model=deployment_name,
    messages=[user_message],
    max_tokens=1000,
)

# Figure descriptions and locations, consumed by the next cell.
result = response.choices[0].message.content

print(result)

In [None]:

# --- Merge the figure descriptions into the extracted page text ---
# Inputs: `result` (figure descriptions/locations from the previous cell) and
# `full_text` (OCR text of the page). Neither input is overwritten here, so this
# cell can be re-run without feeding its own output back in as the figure
# description.
prompt = f"""You are a document editor.

Given a full document and a block of sentences that appear shortly after a figure, your job is to find where in the document these sentences appear or match most closely, and insert the following figure description **just before** them.
Respond with the new version of the document that has the description inserted in the correct place.
Insert the figure description and add **Figure description** before it.
---

Figure Description and Location:
{result}

---

Document:
{full_text}
."""

response = client.chat.completions.create(
    model=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt}
            ]
        }
    ],
    # The model has to echo the whole page plus the inserted descriptions, so
    # the completion budget must exceed the length of the page text.
    max_tokens=4096
)

choice = response.choices[0]
if choice.finish_reason == "length":
    # The completion hit max_tokens: the returned document is cut off.
    print("WARNING: the model response was truncated; the merged document is incomplete.")

# Page text with the figure descriptions inserted at the matched positions.
document_with_figures = choice.message.content

print(document_with_figures)

In [None]:
#cut out figure from image
import requests
import time
from PIL import Image
from difflib import SequenceMatcher
import os

# --- Run the Azure Computer Vision Read (OCR) API on the rendered page image ---

AZURE_CV_ENDPOINT = os.getenv("AZURE_CV_ENDPOINT")
AZURE_CV_KEY = os.getenv("AZURE_CV_KEY")
IMAGE_PATH = "page1.png"
if not AZURE_CV_ENDPOINT or not AZURE_CV_KEY:
    raise RuntimeError("AZURE_CV_ENDPOINT and AZURE_CV_KEY must be set (for example in the .env file).")

# Text that follows the figure, as reported by the vision model.
# TODO: to make this generic, extract the location from the output of the
# figure-description cell instead of hard-coding it.
gpt_location = """
1) This paper provides an in-depth analysis of major object detectors in both categories single and two stage detectors. Furthermore, we take historic look to the evolution of these methods.
2) We present a detailed evaluation of the landmark backbone architectures and lightweight models. We could not find any paper which provides a broad overview of both these topics.
"""

with open(IMAGE_PATH, "rb") as f:
    img_data = f.read()

headers = {
    "Ocp-Apim-Subscription-Key": AZURE_CV_KEY,
    "Content-Type": "application/octet-stream"
}

# Works whether or not the configured endpoint ends with a slash.
ocr_url = AZURE_CV_ENDPOINT.rstrip("/") + "/vision/v3.2/read/analyze"
response = requests.post(ocr_url, headers=headers, data=img_data, timeout=30)
# Surface authentication/quota/endpoint errors here instead of as a KeyError below.
response.raise_for_status()
# The Read API is asynchronous: the POST only returns the URL to poll for results.
operation_url = response.headers["Operation-Location"]

OCR_POLL_INTERVAL_SECONDS = 1
OCR_TIMEOUT_SECONDS = 60
ocr_deadline = time.monotonic() + OCR_TIMEOUT_SECONDS

while True:
    poll_response = requests.get(
        operation_url,
        headers={"Ocp-Apim-Subscription-Key": AZURE_CV_KEY},
        timeout=30,
    )
    poll_response.raise_for_status()
    result = poll_response.json()
    ocr_status = result.get("status")
    if ocr_status == "succeeded":
        break
    if ocr_status == "failed":
        # Without this branch a failed operation would be polled forever.
        raise RuntimeError(f"Azure Read OCR operation failed: {result}")
    if time.monotonic() >= ocr_deadline:
        raise TimeoutError(
            f"Azure Read OCR did not finish within {OCR_TIMEOUT_SECONDS} seconds "
            f"(last status: {ocr_status!r})."
        )
    time.sleep(OCR_POLL_INTERVAL_SECONDS)

# Find best-matching OCR line: first flatten the per-page OCR results into a
# single list of line records (each carries "text" and "boundingBox").
lines = []
for read_result in result["analyzeResult"]["readResults"]:
    lines.extend(read_result["lines"])

def similarity(a, b):
    """Return the case-insensitive similarity of two strings.

    Both strings are lower-cased and compared with difflib.SequenceMatcher;
    the result is its ratio(), a float between 0.0 (nothing in common) and
    1.0 (identical apart from letter case).
    """
    left_text = a.lower()
    right_text = b.lower()
    matcher = SequenceMatcher(isjunk=None, a=left_text, b=right_text)
    return matcher.ratio()

# --- Crop the region directly above the best-matching OCR line ---

if not lines:
    # max() on an empty list would fail with an unhelpful message.
    raise ValueError(f"Azure OCR returned no text lines for {IMAGE_PATH}; cannot locate the figure.")

# The OCR line whose text is most similar to the location text marks the first
# text after the figure.
best_line = max(lines, key=lambda l: similarity(gpt_location, l["text"]))
coords = best_line["boundingBox"]  # flat list of alternating x, y corner coordinates

# Height (in pixels) of the strip above the matched line that is assumed to
# contain the figure.
FIGURE_CROP_HEIGHT = 700

x_vals = coords[::2]
y_vals = coords[1::2]
left = min(x_vals)
right = max(x_vals)
# Image y grows downwards: the smallest y of the text line is its upper edge,
# which becomes the lower edge of the crop box.
bottom = min(y_vals)
top = max(0, bottom - FIGURE_CROP_HEIGHT)

if bottom <= top or right <= left:
    # E.g. the matched line sits at the very top of the page: nothing to crop.
    raise ValueError(f"Empty crop box {(left, top, right, bottom)}; no room for a figure above the matched line.")

# Close the source image handle once the crop has been written.
with Image.open(IMAGE_PATH) as image:
    cropped = image.crop((left, top, right, bottom))
    cropped.save("figure_from_azure_ocr.png")
