In [1]:
"""
patch_buyer_label_token.py
──────────────────────────
Move the B-BUYER_NAME tag from any token that *begins with* “Tên”
to the very next token (the actual company name) and mark the label
token `O`.

Run this once on the JSONL that came out of your last step; it will
touch only the remaining faulty lines.
"""

import json
import pathlib
import unicodedata

from tqdm.auto import tqdm

SRC = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-SingleFILE/mcuong_data/mcuong_data_refined_v2.jsonl"      # ← file you just created
DST = "mcuong_data_refined_v3.jsonl"      # → fully BIO-valid

def fix_buyer_name(words, labels):
    """Move a misplaced ``B-BUYER_NAME`` tag off a "Tên…" field-label token.

    Whenever a token tagged ``B-BUYER_NAME`` begins with "Tên", its tag is
    reset to ``O`` and the span start is moved to the following token.
    ``labels`` is modified in place; ``words`` is only read.

    Args:
        words: Sequence of token strings.
        labels: List of BIO tags parallel to ``words``.

    Returns:
        None. The fix is applied to ``labels`` in place.
    """
    # Compare in NFC so that a token stored in decomposed form
    # ("e" + combining circumflex) still matches the precomposed prefix.
    prefix = unicodedata.normalize("NFC", "Tên")
    # Labels that may safely become the new span start: an untagged token,
    # or a token that already belongs to the buyer-name span.
    promotable = ("O", "I-BUYER_NAME", "B-BUYER_NAME")
    # Only positions that have both a word and a label can be inspected;
    # this avoids an IndexError when the two sequences differ in length.
    limit = min(len(words), len(labels))

    i = 0
    while i < limit:
        word = words[i]
        is_label_token = (
            labels[i] == "B-BUYER_NAME"
            and isinstance(word, str)
            and unicodedata.normalize("NFC", word).startswith(prefix)
        )
        if not is_label_token:
            i += 1
            continue

        # Clear the tag on the field-label token itself.
        labels[i] = "O"
        nxt = i + 1
        # Promote the next token, but never overwrite the tag of a different
        # entity: doing so would drop that entity's start and orphan its
        # I- tokens.
        if nxt < limit and labels[nxt] in promotable:
            labels[nxt] = "B-BUYER_NAME"
        # Skip the token just examined so a freshly promoted tag is not
        # processed again; later I-BUYER_NAME tokens are left untouched.
        i += 2

# Stream SRC record by record, repair the labels, and write the result to DST.
# The source is opened first so that a missing or unreadable SRC raises before
# DST is created or truncated.
with pathlib.Path(SRC).open(encoding="utf-8") as in_f, \
     pathlib.Path(DST).open("w", encoding="utf-8") as out_f:
    for line in tqdm(in_f, desc="Final buyer-name fix"):
        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue
        rec = json.loads(line)
        # Labels are repaired in place inside the record.
        fix_buyer_name(rec["words"], rec["labels"])
        # ensure_ascii=False keeps the Vietnamese text readable in the output.
        json.dump(rec, out_f, ensure_ascii=False)
        out_f.write("\n")

print("✅  Buyer-name labels fixed – saved to", DST)


Final buyer-name fix: 0it [00:00, ?it/s]

✅  Buyer-name labels fixed – saved to mcuong_data_refined_v3.jsonl
