# 🍩 Donut Inference on FUNSD
Run OCR-free key-value extraction on FUNSD form images by prompting a pretrained Donut model (the DocVQA-finetuned checkpoint) with a question about the form.

## 📦 Install Required Libraries

In [None]:
# Install the notebook's dependencies with pip: Hugging Face transformers,
# accelerate, OpenCV and Pillow.
# NOTE(review): torch is imported in a later cell but is not installed here;
# presumably it is already present in the runtime -- verify.
!pip install transformers accelerate opencv-python Pillow

## 🧠 Load Donut Pretrained Model

In [None]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch

# The processor and the model are both loaded from the same
# DocVQA-finetuned Donut checkpoint.
model = VisionEncoderDecoderModel.from_pretrained(
    'naver-clova-ix/donut-base-finetuned-docvqa'
)
processor = DonutProcessor.from_pretrained(
    'naver-clova-ix/donut-base-finetuned-docvqa'
)

# Switch to inference mode; kept as the final expression of the cell so the
# notebook still displays the value it returns.
model.eval()

## 🖼️ Load and Preprocess FUNSD Image

In [None]:
from PIL import Image
import requests

# Point this at a FUNSD form image on your local disk.
image_path = 'data/images/0000971160.png'

# Open the file, then normalise it to 3-channel RGB.
image = Image.open(image_path)
image = image.convert("RGB")

# Run the Donut processor on the image and keep the resulting tensor
# ("pt" requests PyTorch tensors).
encoding = processor(images=image, return_tensors="pt")
pixel_values = encoding.pixel_values

## ❓ Ask a Question and Run Inference

In [None]:
# Build the decoder prompt for the DocVQA checkpoint. The documented format is
# <s_docvqa><s_question>{question}</s_question><s_answer>: the question tag is
# closed and <s_answer> is left open for the model to complete.
task_prompt = '<s_docvqa><s_question>What are the key fields and their values?</s_question><s_answer>'

# add_special_tokens=False: the prompt already carries its own task tokens.
# Without it the tokenizer presumably wraps the prompt in BOS/EOS, so the
# decoder prompt would end with EOS before generation starts.
inputs = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors='pt')

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=inputs.input_ids,
        # Bound generation by the decoder's own position limit rather than a
        # hard-coded 512, which may exceed what the checkpoint supports.
        max_length=model.decoder.config.max_position_embeddings,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        # Forbid the unknown token so the answer is built from real vocabulary.
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
    )

# Take the first (only) sequence of the batch as plain text.
# NOTE(review): the decoded text may still contain the echoed question ahead
# of the answer, depending on which prompt tokens count as special -- verify.
decoded = processor.batch_decode(outputs, skip_special_tokens=True)[0]
print("🔍 Answer:", decoded)

## 📊 Interpret Output

In [None]:
# The decoded answer is free text, so any structure has to be recovered by
# parsing. As an example, pull out every  "key": "value"  pair of
# double-quoted strings found in the text and print them one per line.
import re

KV_PATTERN = re.compile(r'"(.*?)"\s*:\s*"(.*?)"')

# Each match contributes a (key, value) tuple, in order of appearance.
kv_pairs = [match.groups() for match in KV_PATTERN.finditer(decoded)]

for key, value in kv_pairs:
    # Trim surrounding whitespace captured inside the quotes.
    print(f"{key.strip()} --> {value.strip()}")