In [6]:
# ── cell / file: split_tokens_and_fix_labels.py ────────────────────────────
from pathlib import Path
import json, re

# ─────────────────── CONFIG ────────────────────────────────────────────────
# Input directory: the per-invoice JSON records written by script #1.
SRC_DIR = Path(
    r"/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-RESIZE/mcuong_data"
)
# Output directory: re-tokenised records are written here under the same file names.
DST_DIR = Path(
    r"/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-RETAG/mcuong_data"
)

# Field captions printed on the invoice.  Each pattern is anchored on both
# sides, so only a token that consists of the caption alone (with an optional
# trailing colon) matches; a caption followed by its value does not.
HEADER_PATTERNS = [
    r"^Đơn vị bán hàng:?$",
    r"^Mã số thuế:?$",
    r"^Ký hiệu:?$",
    r"^Số:?$",
    r"^Tên đơn vị:?$",
    r"^Hình thức thanh toán:?$",
    r"^Cộng tiền hàng:?$",
    r"^Tiền thuế GTGT:?$",
    r"^Tổng tiền thanh toán:?$",
]

# Single case-insensitive alternation over all captions, compiled once.
_header_alternation = "|".join(HEADER_PATTERNS)
HEADER_RE = re.compile(_header_alternation, flags=re.IGNORECASE)

# Create the output directory together with any missing ancestors: DST_DIR is
# nested under a folder that may not exist on a first run, and mkdir() without
# parents=True would raise FileNotFoundError in that case.
DST_DIR.mkdir(parents=True, exist_ok=True)

def split_token(word):
    """Break ``word`` into its whitespace-separated pieces.

    Punctuation is not separated: it stays attached to whichever piece it
    touches.  An empty or whitespace-only ``word`` yields an empty list.
    """
    pieces = word.split()
    return pieces

def propagate_label(label, n):
    """Expand one BIO label so that it covers the ``n`` pieces of a split token.

    Args:
        label: The label of the original token ("O", "B-<tag>", "I-<tag>",
            or a bare "<tag>" without prefix).
        n: Number of pieces the token was split into.

    Returns:
        A list of exactly ``n`` labels (empty when ``n <= 0``):
        * "O" is repeated for every piece.
        * A single piece keeps ``label`` unchanged.
        * "B-<tag>" (or a bare "<tag>") becomes "B-<tag>", "I-<tag>", ...
        * "I-<tag>" becomes "I-<tag>", "I-<tag>", ... so that a token which
          continues an entity does not start a new one after splitting.
    """
    # No pieces -> no labels; returning a label here would leave the caller
    # with more labels than words.
    if n <= 0:
        return []
    if label == "O" or n == 1:
        return [label] * n
    tag = label.split("-", 1)[-1]
    # Only the first piece can carry the original prefix; a continuation
    # token ("I-") must remain a continuation, otherwise the enclosing entity
    # would be cut in two by a spurious "B-".
    first = label if label.startswith("I-") else "B-" + tag
    return [first] + ["I-" + tag] * (n - 1)

# Re-tokenise every JSON record in SRC_DIR and write the result, under the
# same file name, to DST_DIR.  Each record is read through its "words",
# "bboxes" and "labels" keys, which are consumed in lockstep.
for p in SRC_DIR.glob("*.json"):
    data = json.loads(p.read_text(encoding="utf-8"))

    words, bboxes, labels = data["words"], data["bboxes"], data["labels"]
    if not (len(words) == len(bboxes) == len(labels)):
        # zip() would silently drop the unmatched tail and write a record
        # that looks valid but has lost tokens; report the file instead.
        print("✗ skipped (words/bboxes/labels differ in length):", p.name)
        continue

    new_words, new_bboxes, new_labels = [], [], []

    for word, bbox, label in zip(words, bboxes, labels):
        pieces = split_token(word)
        if not pieces:
            # Empty / whitespace-only token: there is no word to emit, so no
            # bbox or label may be emitted either, or every following label
            # would be shifted against its word.
            continue

        if HEADER_RE.match(word):
            # 1) Header phrase → every piece is labelled "O"
            piece_labels = ["O"] * len(pieces)
        else:
            # 2) Normal entity token: propagate the BIO label over the pieces
            piece_labels = propagate_label(label, len(pieces))

        new_words.extend(pieces)
        # Every piece inherits the bounding box of the token it came from.
        new_bboxes.extend([bbox] * len(pieces))
        new_labels.extend(piece_labels)

    # 3) Write updated record (all other keys of the record are kept as read)
    data["words"]  = new_words
    data["bboxes"] = new_bboxes
    data["labels"] = new_labels

    out = DST_DIR / p.name
    out.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    print("✓ tokenised:", p.name)

print(f"Done.  Files written to {DST_DIR}")


✓ tokenised: 1C24TDT_00002847.json
✓ tokenised: 1C24TDT_00002853.json
✓ tokenised: 1C24TDT_00003708.json
✓ tokenised: 1C24THS_00001675.json
✓ tokenised: 1C24THS_00001799.json
✓ tokenised: 1C24THS_00001857.json
✓ tokenised: 1C24THS_00001916.json
✓ tokenised: 1C24THS_00001981.json
✓ tokenised: 1C24THS_00002013.json
✓ tokenised: 1C24THS_00002170.json
✓ tokenised: 1C24THS_00002196.json
✓ tokenised: 1C24THT_00005608.json
✓ tokenised: 1C24THT_00005623.json
✓ tokenised: 1C24TLT_00003719.json
✓ tokenised: 1C24TLT_00003732.json
✓ tokenised: 1C24TLT_00004099.json
✓ tokenised: 1C24TLT_00004105.json
✓ tokenised: 1C24TLT_00004521.json
✓ tokenised: 1C24TLT_00004527.json
✓ tokenised: 1C24TLT_00005255.json
✓ tokenised: 1C24TLT_00005280.json
✓ tokenised: 1C24TLT_00005742.json
✓ tokenised: 1C24TLT_00005755.json
✓ tokenised: 1C24TLT_00006047.json
✓ tokenised: 1C24TLT_00006060.json
✓ tokenised: 1C24TPG_00003832.json
✓ tokenised: 1C24TPG_00003842.json
✓ tokenised: 1C24TPG_00003846.json
✓ tokenised: 1C24TPG