In [3]:
import torch
from transformers import AutoProcessor, AutoModel
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
from pytesseract import Output
import os

def preprocess_pdf_for_vqa(pdf_path: str) :
    if not os.path.exists(pdf_path) :
        raise FileNotFoundError(f"PDF 파일을 찾을 수 없습니다 : {pdf_path}")
    images = convert_from_path(pdf_path)
    processed_pages = []

    for i, image in enumerate(images) :
        ocr_data = pytesseract.image_to_data(image, output_type=Output.DICT)
        words, boxes = [], []
        for j in range(len(ocr_data["text"])) :
            if int(ocr_data["conf"][j]) > 50 and ocr_data["text"][j].strip() != '' :
                words.append(ocr_data["text"][j])
                x, y, w, h = ocr_data["left"][j], ocr_data["top"][j], ocr_data["width"][j], ocr_data["height"][j]
                boxes.append([x, y, x + w, y + h])
        processed_pages.append({"image": image.convert("RGB"), "words": words, "boxes": boxes})
    return processed_pages

processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base")
model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")

pdf_file_path = "./data/GrayAnatomyPDF.pdf"
question = "what is the title of the book?"

try :
    preprocessed_pages = preprocess_pdf_for_vqa(pdf_file_path)
except FileNotFoundError as e :
    print(e)
    exit()

if preprocessed_pages :
    first_page = preprocessed_pages[0]

    encoding = processor(
        first_page["image"],
        question,
        words=first_page["words"],
        boxes=first_page["boxes"],
        return_tensors="pt"
    )

    with torch.no_grad() :
        outputs = model(**encoding)

    last_hidden_state = outputs.last_hidden_state

    print(last_hidden_state)

ValueError: You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True.