# 0 · Environment & base paths — run first 🔵

In [9]:
# ----------------- 0. Imports & constants -----------------
from pathlib import Path
import os, cv2, numpy as np, torch, re, dateparser, matplotlib.pyplot as plt
from tqdm import tqdm
from albumentations import (Compose, Rotate, RandomBrightnessContrast,
                            Perspective, MotionBlur, )
from albumentations.pytorch import ToTensorV2
from datasets import load_dataset
from transformers import (DonutProcessor, VisionEncoderDecoderModel,
                          Trainer, TrainingArguments)

# ----- curriculum folders (Phase-1: three annotator sets of ~64 invoices) -----
# All three sets live under one root. Override it with the DONUT_DATA_ROOT
# environment variable to run the notebook on another machine without editing
# this cell; the default reproduces the original absolute paths exactly.
DATA_ROOT = os.environ.get(
    "DONUT_DATA_ROOT",
    "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/Donuts",
).rstrip("/")
PHASE_NAMES = ("mhuy", "mcuong", "tnghia")   # curriculum order

# One dict per curriculum step: image folder + JSONL label file (plain str paths).
PHASES = [
    dict(img_dir=f"{DATA_ROOT}/data_{name}/converted_pngs",
         label_file=f"{DATA_ROOT}/data_{name}/{name}_donut_rename.jsonl")
    for name in PHASE_NAMES
]

BASE_MODEL = "naver-clova-ix/donut-base"   # Donut v1.0 base
MAX_LEN    = 600                           # ≤ 600 tokens in Phase-1
DEVICE     = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = DonutProcessor.from_pretrained(BASE_MODEL)
model     = VisionEncoderDecoderModel.from_pretrained(BASE_MODEL).to(DEVICE)

# When trained with `labels`, VisionEncoderDecoderModel builds the decoder inputs
# by shifting the labels right, which needs both ids below. Fill them in only if
# the checkpoint's config leaves them unset, so an existing value is never
# overridden.
if model.config.pad_token_id is None:
    model.config.pad_token_id = processor.tokenizer.pad_token_id
if model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = processor.tokenizer.bos_token_id


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    2560,
    1920
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "qkv_bias": true,
  "torch_dtype":

# 1 · Pre-processing helpers 🔵

In [10]:
# deskew • clahe • resize ------------------------------------------------------
def deskew(img, limit=5):
    """Estimate the skew of a scanned page and straighten it when the skew is small.

    The page is converted to grayscale, blurred, and Otsu-thresholded (inverted)
    so that dark content becomes foreground; the angle of the minimum-area
    rectangle around all foreground pixels is taken as the skew.

    Args:
        img: BGR image array as returned by ``cv2.imread``.
        limit: rotation is applied only when ``abs(angle) < limit`` (degrees).

    Returns:
        ``(image, angle)`` where ``angle`` is the correction in degrees within
        [-45, 45]. The image is the rotated copy when the correction was
        applied, otherwise the input unchanged. A page with no foreground
        pixels yields ``(img, 0.0)``.
    """
    gray    = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (9, 9), 0)
    ink     = cv2.threshold(blurred, 0, 255,
                            cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    coords  = np.column_stack(np.where(ink > 0))
    if coords.shape[0] == 0:
        # Blank page: minAreaRect cannot run on an empty point set.
        return img, 0.0

    angle = float(cv2.minAreaRect(coords.astype(np.float32))[-1])
    # minAreaRect reports the angle in [-90, 0) on older OpenCV builds and in
    # (0, 90] on newer ones. A rectangle's orientation is only defined modulo
    # 90°, so fold either range into [-45, 45] before negating.
    if angle < -45:
        angle += 90
    elif angle > 45:
        angle -= 90
    angle = 0.0 - angle          # subtraction (not unary minus) avoids a "-0.0" result

    if abs(angle) < limit:
        (h, w) = img.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
        img = cv2.warpAffine(img, M, (w, h),
                             flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)
    return img, angle

def clahe(img):
    """Apply CLAHE to the lightness channel of a BGR image.

    The image is converted to LAB, only the L channel is equalised
    (``cv2.createCLAHE(2.0, (8, 8))``), and the result is converted back to
    BGR with the A and B channels untouched.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2LAB))
    equaliser = cv2.createCLAHE(2.0, (8, 8))
    merged = cv2.merge((equaliser.apply(lightness), chan_a, chan_b))
    return cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)

def resize_long(img, target=1100):
    """Scale an image so that its longer side equals ``target`` pixels.

    Args:
        img: image array whose first two dimensions are (height, width).
        target: desired length of the longer side, in pixels.

    Returns:
        The input object itself when no scaling is needed, otherwise the
        resized image with the aspect ratio preserved.
    """
    h, w  = img.shape[:2]
    scale = target / max(h, w)
    if scale == 1:
        return img
    # round() instead of int(): float error in w*scale must not truncate the
    # long side to target-1. max(1, ...) keeps very thin images from
    # collapsing to a zero-pixel dimension, which cv2.resize rejects.
    new_size = (max(1, round(w * scale)), max(1, round(h * scale)))
    # INTER_AREA is the recommended filter for shrinking; for enlarging it
    # degrades to blocky output, so use bicubic interpolation there.
    interp = cv2.INTER_AREA if scale < 1 else cv2.INTER_CUBIC
    return cv2.resize(img, new_size, interpolation=interp)

def preprocess_one(path, show=False):
    """Deskew, contrast-equalise and resize one image, then save it next to the source.

    Args:
        path: image location, as a ``str`` or ``pathlib.Path``.
        show: when True, display the processed image with matplotlib.

    Returns:
        ``(out_path, angle)``: the ``Path`` of the written ``*.clean.jpg`` file
        and the angle reported by ``deskew``.

    Raises:
        FileNotFoundError: the image could not be read.
        OSError: the processed image could not be written.
    """
    path = Path(path)                      # accept plain strings as well as Path objects
    raw  = cv2.imread(str(path))
    if raw is None:
        # cv2.imread signals failure with None rather than an exception.
        raise FileNotFoundError(f"Could not read image: {path}")

    img, ang = deskew(raw)
    img      = clahe(img)
    img      = resize_long(img)
    if show:
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.title(f"After preprocess (rotate {ang:.2f}°)")
        plt.axis('off'); plt.show()

    out = path.with_suffix('.clean.jpg')
    # cv2.imwrite returns False on failure instead of raising.
    if not cv2.imwrite(str(out), img, [cv2.IMWRITE_JPEG_QUALITY, 95]):
        raise OSError(f"Could not write image: {out}")
    return out, ang


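# 2 · Optional offline augmentation helper 🔵

*Not implemented yet.* `build_dataset(..., augment=True)` calls a global `AUG` pipeline, which must be defined in this section before that flag is used.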



# 3 · Dataset builder (used by every phase) 🔵

In [11]:
def build_dataset(img_dir: str, label_file: str,
                  max_len: int = MAX_LEN, augment: bool = False):
    """Build a torch-formatted dataset of ``pixel_values`` / ``labels`` pairs.

    Args:
        img_dir: folder containing the images named by each record's ``file_name``.
        label_file: JSON-lines file; each record provides ``file_name`` and ``label``.
        max_len: label token sequences are truncated to this length.
        augment: when True, every image is passed through the global ``AUG``
            pipeline before being handed to the processor.

    Returns:
        The mapped ``train`` split with the original columns removed.

    Raises:
        NameError: ``augment=True`` but no global ``AUG`` pipeline is defined.
        FileNotFoundError: an image listed in ``label_file`` cannot be read.
    """
    if augment and "AUG" not in globals():
        # Fail before mapping starts instead of part-way through the first example.
        raise NameError("augment=True requires a global `AUG` augmentation pipeline")

    def proc_example(ex):
        # load → optional aug → pixel-values tensor
        img_path = f"{img_dir}/{ex['file_name']}"
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None for a missing or undecodable file.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # OpenCV decodes to BGR; convert so the processor sees RGB channel order.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if augment:
            # The permute implies AUG returns a channel-first tensor; go back to HWC.
            img = AUG(image=img)["image"].permute(1, 2, 0).numpy()
        px  = processor.image_processor(img, return_tensors="pt").pixel_values[0]

        # NOTE(review): with add_special_tokens=False no end-of-sequence token is
        # appended to the target — confirm the label strings already carry one.
        ids = processor.tokenizer(
            ex["label"], add_special_tokens=False,
            max_length=max_len, truncation=True
        ).input_ids
        return {"pixel_values": px, "labels": ids}

    ds = load_dataset("json", data_files=label_file)["train"]
    return ds.map(proc_example, remove_columns=ds.column_names).with_format("torch")


# 4 · Phase-1 curriculum training loop 🔵

In [None]:
from datetime import datetime
from transformers import logging as hf_logging
from transformers.trainer_utils import get_last_checkpoint
hf_logging.set_verbosity_error()          # keep console clean

# Curriculum: fine-tune the same `model` object on each phase in order, so each
# step starts from the weights left by the previous one.
for step_idx, phase in enumerate(PHASES, 1):
    print(f"\n🟩 Phase-1 · step {step_idx} | {phase['img_dir']} | "
          f"{datetime.now().strftime('%H:%M:%S')}")
    ds        = build_dataset(phase["img_dir"], phase["label_file"])
    out_dir   = f"./donut_phase1_step{step_idx}"

    args = TrainingArguments(
        output_dir                   = out_dir,
        per_device_train_batch_size  = 1,
        gradient_accumulation_steps  = 8,
        num_train_epochs             = 3,
        learning_rate                = 2e-5,
        # With ~65 images, 8-step accumulation and 3 epochs a phase is only ~25
        # optimizer steps; a fixed 200-step warmup would never reach the target
        # learning rate, so warm up over a fraction of the run instead.
        warmup_ratio                 = 0.1,
        # Mixed precision needs a GPU; fp16=True raises on a CPU-only machine.
        fp16                         = torch.cuda.is_available(),
        logging_steps                = 5,       # frequent enough to log within ~25 steps
        save_strategy                = "epoch", # gives the resume logic below something to find
        save_total_limit             = 2,
        report_to                    = "none",
    )
    # `resume_from_checkpoint` on TrainingArguments is not used by Trainer; the
    # checkpoint has to be passed to train(). get_last_checkpoint returns None
    # when the folder holds no `checkpoint-<step>` directory, which starts fresh.
    last_ckpt = get_last_checkpoint(out_dir) if os.path.isdir(out_dir) else None
    Trainer(model=model, args=args, train_dataset=ds).train(
        resume_from_checkpoint=last_ckpt)
    model.save_pretrained(f"{out_dir}/checkpoint_final")  # explicit save
    processor.save_pretrained(f"{out_dir}/checkpoint_final")  # keep tokenizer/image config with the weights



🟩 Phase-1 · step 1 | /mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/Donuts/data_mhuy/converted_pngs | 14:02:34


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

## After this loop finishes, your final Phase-1 weights are in:

./donut_phase1_step3/checkpoint_final


# 5 · Utility & inference helpers 🔵


In [None]:
# ───── regex/amount/date helpers ───────────────────────────────────────
# Matches a standalone run of exactly 10 digits, optionally followed by 3 more
# (13 in total), delimited by word boundaries. Used by fix_taxcode; presumably
# the tax-code ("MST") format — confirm against the invoice spec.
MST = re.compile(r'\b\d{10}(?:\d{3})?\b')

def normalize_amount(txt):
    """Reduce a printed amount to a plain number string.

    Commas and dots are both treated as separators and spaces are ignored; the
    first numeric run in the text is used. Separators followed by three digits
    are thousands separators and are dropped. A final group of one or two
    digits cannot be a thousands group, so it is treated as the decimal part:
    it is dropped when all zeros and otherwise kept after a single ``.``.

    Examples: ``"1.234.567"`` → ``"1234567"``, ``"1.234.567,00"`` →
    ``"1234567"``, ``"1.234.567,50"`` → ``"1234567.50"``.

    Args:
        txt: raw amount text; may be ``None`` or empty.

    Returns:
        The normalised number string, or ``txt`` unchanged when it is empty,
        ``None``, or contains no digits.
    """
    if not txt:
        return txt
    compact = txt.replace(',', '.').replace(' ', '')
    found = re.search(r'\d[\d.]*', compact)
    if not found:
        return txt
    number = found.group(0).rstrip('.')
    whole, sep, tail = number.rpartition('.')
    if sep and len(tail) <= 2:
        # Gluing a 1-2 digit decimal tail onto the integer part would inflate
        # the amount (e.g. ",00" would multiply it by 100).
        whole_digits = whole.replace('.', '')
        return whole_digits if not tail.strip('0') else f"{whole_digits}.{tail}"
    return number.replace('.', '')

def fix_taxcode(code):
    """Return the first ``MST``-pattern match found in ``code``.

    A falsy ``code`` (``None`` or empty) is searched as an empty string. When
    nothing matches, ``code`` is handed back exactly as received.
    """
    found = MST.search(code if code else '')
    if found is None:
        return code
    return found.group(0)

def parse_date(txt):
    """Convert a free-form date string to ``YYYY-MM-DD``.

    Parsing is delegated to ``dateparser`` with day-month-year ordering.

    Args:
        txt: raw date text; may be ``None`` or blank.

    Returns:
        The ISO-formatted date, or ``txt`` unchanged when it is not a
        non-blank string or cannot be parsed.
    """
    # dateparser only accepts strings; pass missing or blank fields through
    # untouched, the same way fix_taxcode tolerates None.
    if not isinstance(txt, str) or not txt.strip():
        return txt
    dt = dateparser.parse(txt, settings={'DATE_ORDER': 'DMY'})
    return dt.strftime('%Y-%m-%d') if dt else txt

# ───── single-invoice prediction ───────────────────────────────────────
@torch.no_grad()
def predict_one(img_path: str, max_len: int = 512):
    """Run the global ``model`` on one invoice image and return the decoded sequence.

    Args:
        img_path: path to the image file (``str`` or ``os.PathLike``). Any other
            object is assumed to be an already-decoded image and is handed to
            the image processor unchanged.
        max_len: ``max_length`` passed to ``model.generate``.

    Returns:
        The first decoded sequence, with special tokens kept.

    Raises:
        FileNotFoundError: ``img_path`` is a path that could not be read.
    """
    if isinstance(img_path, (str, os.PathLike)):
        # The image processor works on decoded images, not file paths, so load
        # the file first. OpenCV decodes to BGR; convert to RGB channel order.
        bgr = cv2.imread(os.fspath(img_path))
        if bgr is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    else:
        image = img_path

    model.eval()   # Trainer leaves the model in training mode; disable dropout for inference
    pv  = processor.image_processor(image,
                                    return_tensors='pt').pixel_values.to(DEVICE)
    out = model.generate(pv, max_length=max_len)
    return processor.batch_decode(out, skip_special_tokens=False)[0]

# demo (comment out until training finished)
# print(predict_one("sample_invoice.jpg")[:400])
