In [6]:
from PIL import Image
import urllib.request
from io import BytesIO
import re

import torch

# Sample invoice scan from the katanaml/sparrow repository, used as the input document.
image_url = 'https://raw.githubusercontent.com/katanaml/sparrow/main/ml-data/docs/input/invoices/processed/images/invoice_10.jpg'

# Download the file into memory and open it with PIL. The timeout (in seconds)
# keeps this cell from hanging indefinitely when the host is unreachable.
with urllib.request.urlopen(image_url, timeout=30) as response:
    image = Image.open(BytesIO(response.read()))

# image.show()  # uncomment to preview the downloaded page

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook source: it ends up in version
# control and in shared copies. Read it from the HF_TOKEN environment variable
# instead; when the variable is unset or empty, token=None makes `login`
# prompt for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [3]:
from transformers import DonutProcessor, VisionEncoderDecoderModel

# The processor and the model are loaded from the same Hub checkpoint, so the
# repository id is written once and shared by both calls.
MODEL_ID = "katanaml-org/invoices-donut-model-v1"

processor = DonutProcessor.from_pretrained(MODEL_ID)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)

In [5]:
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the model's weights to the chosen device; as the last expression of the
# cell, the returned module is also displayed.
model.to(device)

VisionEncoderDecoderModel(
  (encoder): DonutSwinModel(
    (embeddings): DonutSwinEmbeddings(
      (patch_embeddings): DonutSwinPatchEmbeddings(
        (projection): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DonutSwinEncoder(
      (layers): ModuleList(
        (0): DonutSwinStage(
          (blocks): ModuleList(
            (0): DonutSwinLayer(
              (layernorm_before): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
              (attention): DonutSwinAttention(
                (self): DonutSwinSelfAttention(
                  (query): Linear(in_features=128, out_features=128, bias=True)
                  (key): Linear(in_features=128, out_features=128, bias=True)
                  (value): Linear(in_features=128, out_features=128, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                )

In [7]:
def process_document(image, task_prompt="<s_cord-v2>"):
    """Extract structured fields from a document image with the Donut model.

    Relies on the notebook-level ``processor``, ``model`` and ``device``
    objects created in the cells above.

    Args:
        image: Document page to parse, e.g. a ``PIL.Image.Image``.
        task_prompt: Task start token fed to the decoder as its first input.
            Defaults to ``"<s_cord-v2>"``, the prompt this notebook has
            always used.

    Returns:
        Whatever ``processor.token2json`` builds from the generated sequence
        (a nested dict of fields for the sample invoice).
    """
    # The encoder's patch embedding is Conv2d(3, ...), i.e. it takes 3-channel
    # input, so bring grayscale / RGBA / CMYK PIL images to RGB first. Inputs
    # without a ``mode`` attribute (arrays, tensors) are passed through as-is.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    tokenizer = processor.tokenizer

    # prepare encoder inputs
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # prepare decoder inputs
    decoder_input_ids = tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids

    # generate answer (single beam). `early_stopping` is deliberately not
    # passed: it only affects beam search, and recent transformers versions
    # warn when it is combined with num_beams=1.
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[tokenizer.unk_token_id]],  # forbid generating the unknown token
        return_dict_in_generate=True,
    )

    # postprocess: drop EOS/PAD markers, then the leading task start token
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(tokenizer.eos_token, "").replace(tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token

    return processor.token2json(sequence)

In [8]:
# Parse the downloaded invoice and print the extracted fields.
result = process_document(image=image)
print(result)

{'header': {'invoice_no': '49565075', 'invoice_date': '10/28/2019', 'seller': 'Kane-Morqan 968 Carr Mission Apt. 320', 'client': '1445 Haas Vladurt Suite 454 Michaelhaven, LA 3252', 'seller_tax_id': '964-95-3813', 'client_tax_id': '909-75-5482', 'iban': 'GB73WC135232646970614'}, 'items': [{'item_desc': 'Wine Saver', 'item_qty': '1,00', 'item_net_price': '8,00', 'item_net_worth': '8,00', 'item_vat': '10%', 'item_gross_worth': '8,80'}, {'item_desc': 'Lotto "Congratulations" Hand Painteed and Decorated Wine Glass NIB', 'item_qty': '1,00', 'item_net_price': '20,00', 'item_vat': '10%', 'item_gross_worth': '12,00'}], 'summary': {'total_net_worth': '$87,94', 'total_vat': '$8,79', 'total_gross_worth': '$96,73'}}
