In [3]:
# ── cell / file: retag_again.py ─────────────────────────────────────────────
from pathlib import Path
import json, re

# ─────────────────── CONFIG ────────────────────────────────────────────────
# Input folder: JSON records produced by the previous step (id-fixed + bbox-norm).
SRC_DIR = Path(r"/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-RESIZE/nghia_data")   # input  (id-fixed + bbox-norm)
# Output folder: retagged records are written here under the same file names.
DST_DIR = Path(r"/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-RETAG/nghia_data")       # output (will be created)
# parents=True so a missing intermediate folder (e.g. BIO-TAGGING-RETAG) is
# created too instead of raising FileNotFoundError on a fresh checkout.
DST_DIR.mkdir(parents=True, exist_ok=True)

# Field-caption patterns.  Each one is anchored at both ends, so it only
# matches a token that starts with the caption text and ends with a colon.
HEADER_PATTERNS = [
    r"^Đơn vị bán hàng.*:$",        # "selling unit" caption
    r"^Mã số thuế.*:$",             # "tax code" caption
    r"^Ký hiệu.*:$",                # "symbol" caption
    r"^Số.*:$",                     # "number" caption (any token starting with "Số")
    r"^Tên đơn vị.*:$",             # "unit name" caption
    r"^Hình thức thanh toán.*:$",   # "payment method" caption
    r"^Cộng tiền hàng.*:$",         # "goods subtotal" caption
    r"^Tiền thuế GTGT.*:$",         # "VAT amount" caption
    r"^Tổng tiền thanh toán.*:$",   # "total payment" caption
]

# One case-insensitive alternation over all captions, compiled once up front.
_HEADER_ALTERNATION = "|".join(HEADER_PATTERNS)
HEADER_RE = re.compile(_HEADER_ALTERNATION, flags=re.IGNORECASE)

def split_token(token: str) -> list:
    """Break *token* into its whitespace-separated pieces.

    No separator is passed to ``str.split``, so runs of whitespace count as
    one delimiter and leading/trailing whitespace yields no empty pieces.
    Punctuation is never split off; it stays attached to its word.
    """
    pieces = token.split()
    return pieces

def expand_label(original: str, n: int) -> list:
    """
    Build one BIO label per piece for a token that was split into ``n`` pieces.

    * ``n <= 0`` (the token was empty or whitespace-only): returns ``[]`` so
      the label list stays the same length as the (empty) list of pieces.
    * ``original == "O"`` or ``n == 1``: the original label is repeated
      unchanged.
    * otherwise: the first piece becomes ``B-TAG`` and the remaining pieces
      ``I-TAG``, whether the original was ``B-TAG`` or ``I-TAG``.
    """
    if n <= 0:
        # Without this guard a non-"O" label would still yield ["B-TAG"],
        # i.e. one label for zero pieces, desynchronising words and labels.
        return []
    if original == "O" or n == 1:
        return [original] * n
    # Drop only the leading "B-"/"I-" prefix; a tag that itself contains a
    # hyphen is kept whole.
    tag = original.split("-", 1)[-1]
    return ["B-" + tag] + ["I-" + tag] * (n - 1)

# --------------------------------------------------------------------------
def _retag_record(rec: dict) -> dict:
    """Split every token of *rec* on whitespace and rebuild its parallel lists.

    Reads ``rec["words"]``, ``rec["bboxes"]`` and ``rec["labels"]`` in
    lockstep and overwrites them with the expanded versions:

    * each piece of a split token reuses the original token's bbox;
    * a token matching ``HEADER_RE`` (after stripping) gets ``"O"`` on every
      piece;
    * any other token gets its labels from ``expand_label``;
    * a token that splits into zero pieces (empty / whitespace-only) is
      dropped, so the three lists always stay the same length.

    Returns the same (mutated) ``rec``.
    """
    new_words, new_bboxes, new_labels = [], [], []

    for word, bbox, label in zip(rec["words"], rec["bboxes"], rec["labels"]):
        pieces = split_token(word)
        if not pieces:
            # Nothing to emit; skipping keeps words/bboxes/labels aligned.
            continue

        if HEADER_RE.match(word.strip()):
            # Field captions are never part of an entity.
            piece_labels = ["O"] * len(pieces)
        else:
            piece_labels = expand_label(label, len(pieces))

        new_words.extend(pieces)
        new_bboxes.extend([bbox] * len(pieces))
        new_labels.extend(piece_labels)

    rec["words"] = new_words
    rec["bboxes"] = new_bboxes
    rec["labels"] = new_labels
    return rec


for path in SRC_DIR.glob("*.json"):
    rec = _retag_record(json.loads(path.read_text(encoding="utf-8")))

    # ensure_ascii=False keeps the Vietnamese text readable in the output file.
    (DST_DIR / path.name).write_text(json.dumps(rec, ensure_ascii=False, indent=2),
                                     encoding="utf-8")
    print("✓ retagged:", path.name)

print(f"Done.  Files written to {DST_DIR}")


✓ retagged: 1C23THS_00000681.json
✓ retagged: 1C23THS_00000686.json
✓ retagged: 1C23THS_00000699.json
✓ retagged: 1C23THS_00000843.json
✓ retagged: 1C23THS_00000949.json
✓ retagged: 1C23THS_00001000.json
✓ retagged: 1C23THS_00001132.json
✓ retagged: 1C23THS_00001280.json
✓ retagged: 1C23THS_00001421.json
✓ retagged: 1C23THS_00001544.json
✓ retagged: 1C23THS_00001563.json
✓ retagged: 1C23THS_00001880.json
✓ retagged: 1C23THS_00001989.json
✓ retagged: 1C23THS_00002192.json
✓ retagged: 1C23THS_00002219.json
✓ retagged: 1C23THS_00002311.json
✓ retagged: 1C23THS_00002410.json
✓ retagged: 1C23THS_00002564.json
✓ retagged: 1C23THS_00002834.json
✓ retagged: 1C23THS_00002856.json
✓ retagged: 1C23THS_00002902.json
✓ retagged: 1C23THS_00002914.json
✓ retagged: 1C24THS_00000197.json
✓ retagged: 1C24THS_00000215.json
✓ retagged: 1C24THS_00000393.json
✓ retagged: 1C24THS_00000627.json
✓ retagged: 1C24THS_00000772.json
✓ retagged: 1C24THS_00000808.json
✓ retagged: 1C24THS_00000831.json
✓ retagged: 1C