In [1]:
# ── cell / file: split_tokens_and_fix_labels.py ────────────────────────────
from pathlib import Path
import json, re

# ─────────────────── CONFIG ────────────────────────────────────────────────
# Input directory scanned for "*.json" records, and the directory the
# re-tokenised copies are written to.
SRC_DIR = Path(r"/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-RESIZE/qhuy_data")     # <── output of script #1
DST_DIR = Path(r"/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-RETAG/qhuy_data")           # <── will be created

# Field-caption phrases (each with an optional trailing colon).  A word that
# matches one of these in full is forced to the "O" label by the processing
# loop.  Every alternative carries its own ^...$ anchors, so joining them
# with "|" keeps each pattern a whole-string match.
HEADER_PATTERNS = [
    r"^Đơn vị bán hàng:?$",
    r"^Mã số thuế:?$",
    r"^Ký hiệu:?$",
    r"^Số:?$",
    r"^Tên đơn vị:?$",
    r"^Hình thức thanh toán:?$",
    r"^Cộng tiền hàng:?$",
    r"^Tiền thuế GTGT:?$",
    r"^Tổng tiền thanh toán:?$",
]
HEADER_RE = re.compile("|".join(HEADER_PATTERNS), re.IGNORECASE)

# parents=True: the intermediate output folder may not exist yet either;
# without it mkdir() raises FileNotFoundError on a fresh checkout.
DST_DIR.mkdir(parents=True, exist_ok=True)

def split_token(word):
    """Break *word* into its whitespace-separated pieces.

    Punctuation is not separated: it stays attached to whichever piece it
    touches.  An empty or whitespace-only *word* yields an empty list.
    """
    pieces = word.split(sep=None)
    return pieces

def propagate_label(label, n):
    """Expand one BIO label so it covers the *n* pieces of a split token.

    Args:
        label: the label of the original (unsplit) token, e.g. "O",
            "B-TAG" or "I-TAG".
        n: number of pieces the token was split into.

    Returns:
        A list of exactly ``max(n, 0)`` labels:
        * "O" is repeated for every piece.
        * "B-TAG" becomes "B-TAG" followed by "I-TAG" for the rest.
        * "I-TAG" stays "I-TAG" for every piece, so a continuation token
          never introduces a spurious entity start.
        * a label without a B-/I- prefix is treated as the start of an
          entity ("B-" + label, then "I-" + label).
    """
    # Zero pieces must yield zero labels, otherwise labels and words drift
    # out of alignment.
    if n <= 0:
        return []
    if label == "O" or n == 1:
        return [label] * n
    tag = label.split("-", 1)[-1]
    # Only the first piece can differ: keep an inside-tag as inside.
    first = label if label.startswith("I-") else "B-" + tag
    return [first] + ["I-" + tag] * (n - 1)

# Re-tokenise every record in SRC_DIR and write the result, under the same
# file name, to DST_DIR.  sorted() keeps the processing order (and the log)
# deterministic; glob() itself makes no ordering promise.
for p in sorted(SRC_DIR.glob("*.json")):
    data = json.loads(p.read_text(encoding="utf-8"))
    words, bboxes, labels = data["words"], data["bboxes"], data["labels"]

    # zip() would silently truncate to the shortest list and we would write
    # a corrupted record; report the file and leave it out instead.
    if not (len(words) == len(bboxes) == len(labels)):
        print("✗ skipped (words/bboxes/labels length mismatch):", p.name)
        continue

    new_words, new_bboxes, new_labels = [], [], []

    for word, bbox, label in zip(words, bboxes, labels):
        pieces = split_token(word)
        if not pieces:
            # Empty / whitespace-only token: it contributes no word and no
            # bbox, so it must not contribute a label either.
            continue

        # 1) If header phrase → label = "O"
        if HEADER_RE.match(word):
            label = "O"

        # 2) Split + propagate BIO; every piece reuses the parent's bbox
        new_words.extend(pieces)
        new_bboxes.extend([bbox] * len(pieces))
        new_labels.extend(propagate_label(label, len(pieces)))

    # 3) Write updated record (all other keys of the record are kept as-is)
    data["words"]  = new_words
    data["bboxes"] = new_bboxes
    data["labels"] = new_labels

    out = DST_DIR / p.name
    out.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    print("✓ tokenised:", p.name)

print(f"Done.  Files written to {DST_DIR}")


✓ tokenised: 01GTKT0_0003263.json
✓ tokenised: 01GTKT0_0003273.json
✓ tokenised: 01GTKT0_0003722.json
✓ tokenised: 1C22THS_00000212.json
✓ tokenised: 1C22THS_00000402.json
✓ tokenised: 1C22THS_00000432.json
✓ tokenised: 1C22THS_00000677.json
✓ tokenised: 1C22THS_00000905.json
✓ tokenised: 1C22THS_00001048.json
✓ tokenised: 1C22THS_00001068.json
✓ tokenised: 1C22THS_00001070.json
✓ tokenised: 1C22THS_00001080.json
✓ tokenised: 1C22THS_00001123.json
✓ tokenised: 1C22THS_00001272.json
✓ tokenised: 1C22THS_00001506.json
✓ tokenised: 1C22THS_00001643.json
✓ tokenised: 1C22THS_00001977.json
✓ tokenised: 1C22THS_00002027.json
✓ tokenised: 1C22THS_00002067.json
✓ tokenised: 1C22THS_00002197.json
✓ tokenised: 1C22THS_00002287.json
✓ tokenised: 1C22THS_00002328.json
✓ tokenised: 1C22THS_00002367.json
✓ tokenised: 1C22THS_00002478.json
✓ tokenised: 1C22THS_00002526.json
✓ tokenised: 1C22TLT_00000613.json
✓ tokenised: 1C22TMS_00100367.json
✓ tokenised: 1C22TMS_00100369.json
✓ tokenised: 1C22TYY_00