<a href="https://colab.research.google.com/github/xbetik/DiusAIInvoiceExtractionApp/blob/master/DiusAI_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install PyTorch from the cu121 (CUDA 12.1) wheel index, then the Hugging Face
# stack plus Pillow and pdf2image for image/PDF handling.
!pip install torch --index-url https://download.pytorch.org/whl/cu121
!pip install "transformers>=4.44" "accelerate>=0.33" "huggingface_hub>=0.23" \
             "sentencepiece>=0.1.99" "protobuf>=3.20.2,<5" \
             "Pillow>=10.2" "pdf2image>=1.17" "safetensors>=0.4.2"

In [None]:
# System package providing the poppler command-line tools; pdf2image shells out
# to them to rasterize PDF pages.
!apt-get install poppler-utils

In [None]:
import os, json
from typing import Dict, Any
from PIL import Image
from transformers import pipeline

def normalize_amount(x: str) -> float | None:
    if not x: return None
    s = x.replace("\u00A0", " ").strip().replace(" ", "")
    if "," in s and "." in s:
        if s.rfind(",") > s.rfind("."):
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif "," in s:
        s = s.replace(",", ".")
    try:
        return float(s.strip("EUR€$£CZKPLNCHFHUF"))
    except:
        return None

def normalize_record(record: Dict[str, Any]) -> Dict[str, Any]:
    """Project a raw extraction record onto the fixed invoice schema.

    Copies the known fields, parses the three amount fields with
    ``normalize_amount``, defaults the currency to "EUR", and reports which
    mandatory fields (supplier name, customer name, invoice number, issue
    date, total amount) are missing.

    Args:
        record: Raw extraction result. ``supplier`` and ``customer`` are
            expected to be dicts; any other value is treated as empty.

    Returns:
        A new dict with the normalized fields plus ``missing_fields`` (list
        of dotted field names), ``status`` ("complete" or "incomplete") and
        ``confidence``. When the input carries no usable confidence, a
        heuristic of ``max(0.3, 0.95 - 0.05 * len(missing_fields))`` is used.
    """
    # Guard against non-dict values so the .get() calls below cannot fail.
    supplier = record.get("supplier")
    if not isinstance(supplier, dict):
        supplier = {}
    customer = record.get("customer")
    if not isinstance(customer, dict):
        customer = {}

    lines = record.get("lines")
    total_amount = normalize_amount(record.get("total_amount"))

    # Mandatory fields, in the order they are reported.
    required = {
        "supplier.name": supplier.get("name"),
        "customer.name": customer.get("name"),
        "invoice_number": record.get("invoice_number"),
        "issue_date": record.get("issue_date"),
    }
    missing = [field for field, value in required.items() if not value]
    if total_amount is None:
        missing.append("total_amount")

    # A falsy or unparsable confidence falls back to the heuristic instead
    # of raising.
    raw_confidence = record.get("confidence")
    try:
        confidence = float(raw_confidence) if raw_confidence else None
    except (TypeError, ValueError):
        confidence = None
    if confidence is None:
        confidence = max(0.3, 0.95 - 0.05 * len(missing))

    return {
        "document_id": record.get("document_id"),
        "source_type": record.get("source_type"),
        "supplier": {
            "name": supplier.get("name"),
            "vat_id": supplier.get("vat_id"),
            "ico": supplier.get("ico"),
            "address": supplier.get("address"),
            "iban": supplier.get("iban"),
        },
        "customer": {
            "name": customer.get("name"),
            "vat_id": customer.get("vat_id"),
            "ico": customer.get("ico"),
            "address": customer.get("address"),
        },
        "invoice_number": record.get("invoice_number"),
        "issue_date": record.get("issue_date"),
        "due_date": record.get("due_date"),
        "delivery_date": record.get("delivery_date"),
        "currency": record.get("currency") or "EUR",
        "total_amount": total_amount,
        "vat_amount": normalize_amount(record.get("vat_amount")),
        "subtotal_amount": normalize_amount(record.get("subtotal_amount")),
        "payment_method": record.get("payment_method"),
        "lines": lines if isinstance(lines, list) else [],
        "notes": record.get("notes"),
        "confidence": confidence,
        "missing_fields": missing,
        "status": "incomplete" if missing else "complete",
    }

In [None]:
# Build the document question-answering pipeline; the model and tokenizer are
# both loaded from the same checkpoint.
DONUT_CHECKPOINT = "naver-clova-ix/donut-base-finetuned-docvqa"

docvqa = pipeline(
    task="document-question-answering",
    model=DONUT_CHECKPOINT,
    tokenizer=DONUT_CHECKPOINT,
    device_map="auto",  # GPU if available
)

In [None]:
# NOTE(review): this cell is commented out, but the cells below read `image`
# and `invoice_path`. Uncomment it when running in Colab, or define those two
# names some other way before continuing.
# from google.colab import files
# uploaded = files.upload()

# invoice_path = list(uploaded.keys())[0]  # take first uploaded file
# print("Uploaded:", invoice_path)

# # If PDF, take the first page
# if invoice_path.lower().endswith(".pdf"):
#     from pdf2image import convert_from_path
#     pages = convert_from_path(invoice_path, dpi=200)
#     image = pages[0]
# else:
#     image = Image.open(invoice_path).convert("RGB")

In [None]:
def ask(question: str) -> str:
    """Ask the DocVQA pipeline one question about the current invoice image.

    Reads the notebook globals ``docvqa`` (the pipeline) and ``image`` (the
    page being analyzed).

    Args:
        question: Natural-language question about the document.

    Returns:
        The answer text of the first result, or an empty string when the
        pipeline returns nothing usable.
    """
    res = docvqa(image=image, question=question)
    # The result may be a list of candidate dicts or a single dict; an empty
    # list must not fall through to a dict-style lookup.
    if isinstance(res, list):
        res = res[0] if res else {}
    if not isinstance(res, dict):
        return ""
    # Coerce a missing or None answer to "" so the return type stays str.
    return res.get("answer") or ""

# Query the model once per field and assemble the raw record.
# invoice_number and issue_date are asked for explicitly because
# normalize_record treats them as mandatory; without them every document
# would be reported as "incomplete".
record = {
    "document_id": os.path.basename(invoice_path),
    "source_type": "pdf" if invoice_path.lower().endswith(".pdf") else "image",
    "supplier": {
        "name": ask("What is the supplier name?"),
        "address": ask("What is the supplier address?"),
    },
    "customer": {
        "name": ask("What is the customer name?"),
        "address": ask("What is the customer address?"),
    },
    "invoice_number": ask("What is the invoice number?"),
    "issue_date": ask("What is the invoice date?"),
    "vat_amount": ask("What is the VAT amount?"),
    "total_amount": ask("What is the total amount to pay?"),
    "lines": [],
    "notes": None,
    # 0.0 is falsy, so normalize_record derives a heuristic confidence.
    "confidence": 0.0,
}

normalized = normalize_record(record)

In [None]:
# Serialize once and reuse the same text for the file and the cell output.
payload = json.dumps(normalized, indent=2, ensure_ascii=False)
with open("invoice_ml.json", "w", encoding="utf-8") as f:
    f.write(payload)

print(payload)

# `files` is not imported anywhere else in the active cells, so import it
# here; outside Colab the module is unavailable and the download is skipped.
try:
    from google.colab import files
except ImportError:
    print("Saved invoice_ml.json (not running in Colab; skipping browser download).")
else:
    files.download("invoice_ml.json")