# In [2]:
"""
patch_buyer_name_and_payment.py
───────────────────────────────
• Move the B-BUYER_NAME tag from the field-label token
  “Tên đơn vị …” to the first following token that is not
  one of the label words.
• If the span PAYMENT_METHOD is completely missing, tag the
  token that contains ‘/’ or equals ‘Tiền mặt’ (or lowercase
  variant) as B-PAYMENT_METHOD.
"""

import json, pathlib, re
from tqdm.auto import tqdm

# Input JSONL: the output of the previous refinement pass.
SRC = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-SingleFILE/mcuong_data/mcuong_data_refined.jsonl"
# Output JSONL: written relative to the current working directory.
DST = "mcuong_data_refined_v2.jsonl"

# Tokens of the buyer-name field label; these must not be tagged BUYER_NAME.
LABEL_WORDS = {
    "Tên",
    "đơn",
    "vị",
    "(Company's",
    "name):",
}
# A slash anywhere in a token marks a payment-method candidate (TM/CK, etc.).
PAY_RE = re.compile(r"/")

def fix_buyer_name(words, labels, label_words=None):
    """Strip leading field-label tokens from every BUYER_NAME span, in place.

    A span starts at a ``B-BUYER_NAME`` label and extends over the following
    run of labels that begin with ``B-BUYER_NAME`` or ``I-BUYER_NAME`` (so
    directly adjacent spans are merged into one). Within a span:

    * tokens before the first non-label word are re-tagged ``"O"``;
    * the first non-label word becomes ``B-BUYER_NAME``;
    * every later token of the span becomes ``I-BUYER_NAME`` -- including a
      token that happens to equal a label word, so the span stays contiguous
      and no ``I-`` tag is left orphaned behind an ``"O"``;
    * a span made up only of label words is dropped (all ``"O"``).

    Args:
        words: Tokens of one record, aligned index-by-index with ``labels``.
        labels: BIO tags for ``words``; modified in place.
        label_words: Optional iterable of tokens that belong to the field
            label. Defaults to the module-level ``LABEL_WORDS``.

    Returns:
        None.
    """
    # Function-scope import keeps this block self-contained.
    import unicodedata

    if label_words is None:
        label_words = LABEL_WORDS

    def nfc(token):
        # Compare in NFC so precomposed and decomposed diacritics are equal;
        # non-string tokens are compared as they are.
        return unicodedata.normalize("NFC", token) if isinstance(token, str) else token

    known_label_words = {nfc(token) for token in label_words}
    span_tags = ("B-BUYER_NAME", "I-BUYER_NAME")

    pos = 0
    total = len(labels)
    while pos < total:
        if labels[pos] != "B-BUYER_NAME":
            pos += 1
            continue

        # Find the end (exclusive) of the span that starts at ``pos``.
        end = pos
        while end < total and labels[end].startswith(span_tags):
            end += 1

        # Index of the first token that is real content; ``end`` when the
        # whole span consists of label words.
        first_real = next(
            (k for k in range(pos, end) if nfc(words[k]) not in known_label_words),
            end,
        )

        for k in range(pos, first_real):
            labels[k] = "O"
        if first_real < end:
            labels[first_real] = "B-BUYER_NAME"
            for k in range(first_real + 1, end):
                labels[k] = "I-BUYER_NAME"

        pos = end

def ensure_payment_method(words, labels, pay_re=None):
    """Tag a payment-method token when the record has no B-PAYMENT_METHOD.

    Does nothing if ``"B-PAYMENT_METHOD"`` already occurs in ``labels``.
    Otherwise the first token that is currently labelled ``"O"`` and matches
    one of the rules below is tagged, in place:

    * the token matches ``pay_re`` (by default any token containing ``/``);
    * the token itself starts with "tiền mặt" (phrase kept as one token);
    * the token is "tiền" and the next token starts with "mặt" -- the next
      token is then tagged ``I-PAYMENT_METHOD`` (if it is ``"O"``) so that the
      span covers the whole phrase.

    A bare "tiền" without "mặt" is deliberately not accepted: a plain prefix
    test would tag any unrelated word that merely starts with "tiền".
    Text comparison is case-insensitive and done on NFC-normalized text.

    Args:
        words: Tokens of one record, aligned index-by-index with ``labels``.
        labels: BIO tags for ``words``; modified in place.
        pay_re: Optional compiled pattern used for the first rule. Defaults
            to the module-level ``PAY_RE``.

    Returns:
        None.
    """
    # Function-scope import keeps this block self-contained.
    import unicodedata

    if "B-PAYMENT_METHOD" in labels:
        return  # already present
    if pay_re is None:
        pay_re = PAY_RE

    def fold(text):
        # Case-insensitive, composition-insensitive comparison key.
        return unicodedata.normalize("NFC", text).lower()

    cash_first = fold("tiền")
    cash_second = fold("mặt")
    cash_phrase = cash_first + " " + cash_second

    # Only indices present in both sequences are considered.
    count = min(len(words), len(labels))
    for idx in range(count):
        if labels[idx] != "O":
            continue
        token = words[idx]
        if pay_re.search(token):
            labels[idx] = "B-PAYMENT_METHOD"
            return
        folded = fold(token)
        if folded.startswith(cash_phrase):
            labels[idx] = "B-PAYMENT_METHOD"
            return
        if (
            folded == cash_first
            and idx + 1 < count
            and fold(words[idx + 1]).startswith(cash_second)
        ):
            labels[idx] = "B-PAYMENT_METHOD"
            # Never overwrite a tag that another entity already owns.
            if labels[idx + 1] == "O":
                labels[idx + 1] = "I-PAYMENT_METHOD"
            return

# Stream SRC record by record, patch the BIO labels and write the result to DST.
# The input is opened first so that a missing or unreadable SRC raises before
# DST is created or truncated.
with pathlib.Path(SRC).open(encoding="utf-8") as in_f, \
     pathlib.Path(DST).open("w", encoding="utf-8") as out_f:
    for line in tqdm(in_f, desc="Final BIO patch"):
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        rec = json.loads(line)
        # Both helpers modify rec["labels"] in place.
        fix_buyer_name(rec["words"], rec["labels"])
        ensure_payment_method(rec["words"], rec["labels"])
        json.dump(rec, out_f, ensure_ascii=False)
        out_f.write("\n")

print("✅  Buyer-name and payment-method tags fixed →", DST)


# Captured cell output:
# Final BIO patch: 0it [00:00, ?it/s]
#
# ✅  Buyer-name and payment-method tags fixed → mcuong_data_refined_v2.jsonl
