In [43]:
import os
import mimetypes
from openai import AzureOpenAI
from PIL import Image
from io import BytesIO
import base64
from docx import Document
import fitz  # PyMuPDF

# Azure OpenAI client shared by the cells below.
# The API key is read from the AZURE_OPENAI_API_KEY environment variable so the
# secret is never stored in the notebook; a missing variable fails fast with a
# KeyError naming it.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and rotated.
llm = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-08-01-preview",
    azure_endpoint="https://hexavarsity-secureapi.azurewebsites.net/api/azureai",
    azure_deployment="gpt-4o"
)

In [35]:
def encode_image_to_base64(image: Image.Image) -> str:
    """Serialize a PIL image as a PNG ``data:`` URL.

    Args:
        image: The image to encode.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    # Pillow's PNG writer cannot store these colour modes and raises OSError
    # for them (CMYK is common for JPEGs embedded in PDFs), so convert to RGB
    # before saving. Other modes are passed through unchanged.
    if image.mode in ("CMYK", "YCbCr", "LAB", "HSV"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{img_str}"

In [49]:
def summarize_image_with_vision(image: Image.Image) -> str:
    """Ask the chat model to describe the contents of an image.

    The image is sent inline as a base64 PNG data URL in a single user
    message, with a system prompt asking for an explanation of the image.

    Args:
        image: The image to describe.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    encoded_image = encode_image_to_base64(image)
    response = llm.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a document analysis assistant. Explain what's in this image."},
            {"role": "user", "content": [{"type": "image_url", "image_url": {"url": encoded_image}}]}
        ],
        max_tokens=500
    )
    content = response.choices[0].message.content
    # The content field is optional in the API response; calling .strip() on
    # None would raise AttributeError, so fall back to an empty string.
    return (content or "").strip()

In [27]:
def extract_images_from_pdf(pdf_path: str):
    """Collect every embedded image referenced by the pages of a PDF.

    Images are gathered page by page; an image referenced on several pages
    appears once per reference.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of PIL images, in page order.
    """
    images = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first field of each entry is the image's xref
                base_image = doc.extract_image(xref)
                # The bytes are copied into an in-memory buffer, so the PIL
                # image stays usable after the document is closed.
                images.append(Image.open(BytesIO(base_image["image"])))
    return images

In [28]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The combined page text with leading and trailing whitespace removed.
    """
    # The context manager closes the document once the text has been read.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() for page in doc)
    return text.strip()

def extract_text_from_docx(docx_path: str) -> str:
    """Return the paragraph text of a .docx file, one paragraph per line.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The paragraph texts joined with newlines, with leading and trailing
        whitespace removed.
    """
    document = Document(docx_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    joined = "\n".join(paragraph_texts)
    return joined.strip()

In [29]:
def extract_images_from_docx(docx_path: str):
    """Collect the images embedded in the main part of a .docx file.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        A list of PIL images, one per embedded image relationship.
    """
    images = []
    doc = Document(docx_path)
    for rel in doc.part.rels.values():
        # Linked (external) targets have no embedded part; reading
        # target_part on them raises, so skip them.
        if rel.is_external:
            continue
        # Select by relationship type rather than by a substring of the target
        # path, so unrelated parts whose name contains "image" are ignored.
        if rel.reltype.endswith("/image"):
            images.append(Image.open(BytesIO(rel.target_part.blob)))
    return images


In [30]:
def is_image_file(file_path: str) -> bool:
    """Tell whether a path looks like an image, judging by its file extension.

    Args:
        file_path: Path or file name to classify; the file is not opened.

    Returns:
        True when the guessed MIME type is an ``image/*`` type, False
        otherwise, including when the type cannot be guessed.
    """
    mime = mimetypes.guess_type(file_path)[0]
    # Compare against None explicitly so the result is always a real bool
    # rather than None for unknown extensions.
    return mime is not None and mime.startswith("image/")

def process_file(file_path: str):
    """Summarize an image, PDF or DOCX file.

    Args:
        file_path: Path of the file to process.

    Returns:
        For an image file, the model's description as a string.
        For a PDF or DOCX file, a two-element list ``[texts, image_summaries]``:
        ``texts`` holds the extracted document text (empty list when the
        document has none) and ``image_summaries`` holds one description per
        embedded image.

    Raises:
        ValueError: If the file is neither an image, a PDF nor a DOCX.
    """
    if is_image_file(file_path):
        print("Detected image file.")
        # Close the underlying file handle once the image has been summarized.
        with Image.open(file_path) as img:
            return summarize_image_with_vision(img)

    images = []
    extracted_text = ""

    # Compare the extension case-insensitively so "Ticket.PDF" is accepted too.
    extension = os.path.splitext(file_path)[1].lower()

    if extension == ".pdf":
        print("Detected PDF file.")
        extracted_text = extract_text_from_pdf(file_path)
        images = extract_images_from_pdf(file_path)
    elif extension == ".docx":
        print("Detected DOCX file.")
        extracted_text = extract_text_from_docx(file_path)
        images = extract_images_from_docx(file_path)
    else:
        raise ValueError("Unsupported file type")

    # Index 0 collects document text, index 1 collects image descriptions.
    summaries = [[], []]

    if extracted_text:
        print("Extracted text...")
        summaries[0].append(extracted_text)

    for idx, image in enumerate(images):
        print(f"Summarizing image {idx+1}...")
        summary = summarize_image_with_vision(image)
        summaries[1].append(summary)

    return summaries


In [50]:
# Document to analyse; can also be an image or docx.
# Built from the current user's home directory instead of a hard-coded,
# user-specific absolute path, so the cell runs on other machines as well.
file_path = os.path.join(os.path.expanduser("~"), "Downloads", "Ticket.pdf")
result = process_file(file_path)

Detected PDF file.
Extracted text...
Summarizing image 1...
Summarizing image 2...
Summarizing image 3...
Summarizing image 4...
Summarizing image 5...
Summarizing image 6...
Summarizing image 7...
Summarizing image 8...
Summarizing image 9...


In [64]:
# result[1] holds the image descriptions; show the ninth one (index 8).
image_summaries = result[1]
print(image_summaries[8])

The image is an advertisement for a credit card by RBL Bank. The background features a red gradient design. On the left side, there is a depiction of a credit card presented at an angle, with the RBL Bank logo displayed on it alongside "RuPay SELECT" branding. 

The text in the center reads "Platform kis taraf ayega?" which is a Hindi phrase translating to "Which side will the platform come?" in English. Below the text, there is a button labeled "APPLY NOW" encouraging viewers to apply for the credit card.

On the right side, the advertisement highlights the benefits of the card, stating "Get 40x Benefits." Also in this section, there's the RBL Bank logo. The terms and conditions are noted with "T&C Apply" in small print at the bottom.
