In [2]:
from pathlib import Path, PurePath
import json, re, glob
from PIL import Image

def scale(b, W, H):
    """Rescale the first four entries of *b* to a 0-1000 integer grid.

    Entries 0 and 2 are divided by ``W``; entries 1 and 3 are divided by
    ``H``.  Each quotient is multiplied by 1000 and passed through
    ``round``.  A new four-element list is returned.
    """
    divisors = (W, H, W, H)
    # Index explicitly so that exactly four entries are read from ``b``.
    return [round(b[i] / divisors[i] * 1000) for i in range(4)]

def load_anno(path_json):
    """Load an annotation JSON file and index its labelled spans.

    The file is read as UTF-8.  Its ``"label"`` entry is iterated as
    ``(start, end, label)`` triples.

    Returns:
        A ``(raw, spans)`` tuple: ``raw`` is the parsed JSON object and
        ``spans`` maps each ``(start, end)`` pair to its label.
    """
    with open(path_json, encoding="utf-8") as handle:
        raw = json.load(handle)

    spans = {}
    for start, end, tag in raw["label"]:
        spans[(start, end)] = tag
    return raw, spans

def convert_one(raw_json, ocr_json, image_path=None):
    """Merge one annotation file and one OCR file into a token-level record.

    The annotation is loaded with ``load_anno``.  The OCR file is parsed as
    JSON and iterated as items that each provide a ``"text"`` string and a
    ``"bbox"``.  Every whitespace-separated token of an item's text is
    searched for (case-insensitively) in the annotation's ``"text"``,
    starting from the end of the previously matched token.  A token whose
    match starts inside an annotated span is labelled ``B-<label>`` when it
    starts exactly at the span start and ``I-<label>`` otherwise; all other
    tokens, including ones that cannot be found, are labelled ``"O"``.  Each
    token receives its item's bounding box passed through ``scale``.

    Args:
        raw_json: Path of the annotation JSON file.
        ocr_json: Path of the OCR JSON file (read as UTF-8).
        image_path: Optional path of the page image.  When given, its pixel
            size replaces the fallback page size used for scaling, and the
            value is stored in the result under ``"image_path"``.

    Returns:
        A dict with keys ``id``, ``words``, ``bboxes`` and ``labels`` (three
        parallel lists), plus ``image_path`` when one was supplied.
    """
    raw, spans = load_anno(raw_json)
    # Decode explicitly as UTF-8 (as load_anno does) instead of relying on
    # the platform's default codec.
    ocr = json.loads(Path(ocr_json).read_text(encoding="utf-8"))
    # Fallback page size in pixels -- presumably A4 at 300 DPI; confirm.
    W, H = (2480, 3508)

    if image_path is not None:
        try:
            # The context manager closes the file handle once the size is read.
            with Image.open(image_path) as img:
                W, H = img.size
        except Exception as exc:
            # Best effort: keep the fallback size, but report the problem
            # instead of hiding it, and let KeyboardInterrupt/SystemExit
            # propagate.
            import warnings
            warnings.warn(
                f"could not read image size from {image_path!r} ({exc!r}); "
                f"using fallback size {W}x{H}"
            )

    text = raw["text"]
    words, bboxes, labels = [], [], []
    cursor = 0  # offset in ``text`` just past the last matched token
    for item in ocr:
        for tok in item["text"].split():
            # Search from ``cursor`` without copying the tail of the text;
            # match offsets are therefore already absolute.
            m = re.compile(re.escape(tok), flags=re.I).search(text, cursor)
            lab = "O"
            if m:
                s, e = m.start(), m.end()
                for (a, b), t in spans.items():
                    if a <= s < b:
                        lab = "B-" + t if s == a else "I-" + t
                        break
                cursor = e
            words.append(tok)
            bboxes.append(scale(item["bbox"], W, H))
            labels.append(lab)

    result = dict(id=raw["id"], words=words, bboxes=bboxes, labels=labels)
    if image_path is not None:
        result["image_path"] = image_path

    return result


# -------- batch convert ----------
# IN_DIR holds paired <name>_text.json (annotation) and <name>_ocr.json (OCR)
# files; one <id>_layoutlmv3.json file is written to OUT_DIR per pair.
IN_DIR   = Path("/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/COMBINECODE/samples")
OUT_DIR  = Path("/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/COMBINECODE/converted")
OUT_DIR.mkdir(exist_ok=True)

for anno_path in IN_DIR.glob("*_text.json"):
    # The glob guarantees the stem ends with "_text"; swap only that suffix
    # so a "_text" occurring earlier in the file name is left alone.
    ocr_path = anno_path.with_name(anno_path.stem[:-len("_text")] + "_ocr.json")
    if not ocr_path.exists():
        # No OCR counterpart for this annotation: nothing to convert.
        continue
    conv = convert_one(anno_path, ocr_path)
    out = OUT_DIR / f"{conv['id']}_layoutlmv3.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 explicitly rather than in the platform default codec.
    out.write_text(json.dumps(conv, ensure_ascii=False, indent=2), encoding="utf-8")
    print("✔", out.name)


✔ 457_layoutlmv3.json
