In [1]:
import json
from transformers import DonutProcessor

# Load the Donut processor from the "naver-clova-ix/donut-base" checkpoint.
# Its tokenizer is what token_length() uses to measure label lengths.
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
# Important fields to keep for phase 1, listed from highest to lowest priority:
# when a label is too long, fields are dropped starting from the END of this list.
FIELD_PRIORITY = ["COMPANY_NAME", "SELLER_TAX_CODE", "TOTAL_AMOUNT", "TOTAL_PAYMENT"]

# Optional mapping from annotation label to a shorter tag name.
# Labels missing from this map fall back to their lower-cased name in to_donut_label().
TAG_MAP = {
    "COMPANY_NAME": "company",
    "SELLER_TAX_CODE": "tax",
    "TOTAL_AMOUNT": "subtotal",
    "TOTAL_PAYMENT": "total"
}

def extract_fields(example, field_order):
    """Collect the text of the first annotated span for each wanted label.

    Args:
        example: mapping with a ``"text"`` string and a ``"label"`` iterable of
            ``(start, end, label)`` triples indexing into that text.
        field_order: container of label names to keep; any other label is ignored.

    Returns:
        dict mapping each kept label to its whitespace-stripped text slice.
        Only the first span seen for a given label is used.
    """
    source_text = example["text"]
    picked = {}
    for begin, stop, name in example["label"]:
        # Ignore labels we were not asked for, and repeats of ones already captured.
        if name not in field_order or name in picked:
            continue
        picked[name] = source_text[begin:stop].strip()
    return picked

def to_donut_label(fields, field_order):
    """Serialize extracted fields into a single tagged label string.

    Args:
        fields: dict mapping label name to its extracted value.
        field_order: label names in the order they should appear; names that
            are not present in ``fields`` are skipped.

    Returns:
        A string of the form ``<s_invoices><tag>value</tag>...<e_invoices>``,
        where ``tag`` comes from ``TAG_MAP`` or, failing that, the lower-cased
        label name.
    """
    # NOTE(review): Donut's own json2token/token2json helpers appear to use
    # "<s_key>value</s_key>" pairs; this "<tag>…</tag>" + "<e_invoices>" layout
    # differs from that — confirm the training/decoding side expects it.
    parts = ["<s_invoices>"]
    for name in field_order:
        if name not in fields:
            continue
        short = TAG_MAP.get(name, name.lower())
        parts.append(f"<{short}>{fields[name]}</{short}>")
    parts.append("<e_invoices>")
    return "".join(parts)

def token_length(label_str):
    """Return how many token ids the processor's tokenizer yields for ``label_str``.

    Special tokens are not added, so only the label's own tokens are counted.
    """
    encoding = processor.tokenizer(label_str, add_special_tokens=False)
    return len(encoding.input_ids)


In [3]:
def convert_to_donut_jsonl(input_file, output_file, max_token_len=600):
    """Convert an annotated JSONL file into Donut-style label records.

    Every non-blank input line is parsed as a JSON object and handed to
    ``extract_fields``. The fields named in ``FIELD_PRIORITY`` are rendered
    with ``to_donut_label``; while the rendered label tokenizes to more than
    ``max_token_len`` tokens, the lowest-priority field still present is
    dropped and the label is rebuilt. If no field fits, the label degrades to
    the bare wrapper (no fields) instead of silently keeping an over-long one.

    Args:
        input_file: path of the annotated JSONL file to read (UTF-8).
        output_file: path of the JSONL file to write (UTF-8, overwritten).
        max_token_len: maximum token count allowed for a label, as measured
            by ``token_length``.

    Returns:
        None. One ``{"file_name", "label"}`` JSON object per record is written
        to ``output_file`` and a summary line is printed.
    """
    with open(input_file, encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) instead of letting
        # json.loads fail on an empty string.
        data = [json.loads(line) for line in f if line.strip()]

    result = []
    for i, ex in enumerate(data):
        # Extraction does not depend on how many fields end up being kept,
        # so do it once per record rather than on every trimming pass.
        fields = extract_fields(ex, FIELD_PRIORITY)
        # Only fields actually present affect the label; trimming an absent
        # one would just re-tokenize an identical string.
        kept_fields = [key for key in FIELD_PRIORITY if key in fields]

        while True:
            label_str = to_donut_label(fields, kept_fields)
            # Stop when the label fits, or when nothing is left to drop.
            if not kept_fields or token_length(label_str) <= max_token_len:
                break
            kept_fields = kept_fields[:-1]  # drop the lowest-priority field

        result.append({
            "file_name": f"invoice_{i+1:05}.jpg",  # adjust the naming scheme if needed
            "label": label_str
        })

    with open(output_file, "w", encoding="utf-8") as f:
        for row in result:
            json.dump(row, f, ensure_ascii=False)
            f.write("\n")
    print(f"✅ Saved {len(result)} samples to {output_file}")


In [4]:
# The source file is the annotated JSONL export (one JSON object per line).
input_path = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/Donuts/data_mhuy/mhuy_labelled.jsonl"     # change if your path differs
output_path = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/Donuts/data_mhuy/mhuy_donut.jsonl"  # change if your path differs

# Run the conversion with the default token limit.
convert_to_donut_jsonl(input_path, output_path)


✅ Saved 65 samples to /mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/Donuts/data_mhuy/mhuy_donut.jsonl
