In [2]:
"""
clean_field_label_leak.py
─────────────────────────
Repairs JSONL invoices where field-label words are inside BIO spans.

Algorithm
1.  For each span type below, collect the contiguous tokens that have the
    tag and decide which tokens are *values* vs. *labels*:

    TAG                keep-regex
    ───────────────────────────────────────────────
    SELLER_TAX_CODE    r'^\d{10}$'
    BUYER_TAX_CODE     r'^\d{10}$'
    SERIAL             r'^[0-9A-Z]{6,}$'
    INVOICE_NO         r'^\d{4,}$'
    PAYMENT_METHOD     r'[/]'   or token.lower().startswith('tm')
    TOTAL_*            r'^\d+(\.\d+)+$'

2.  Inside that span
      • first value token → B-TAG
      • next value tokens → I-TAG
      • non-value tokens → O

3.  Merge duplicate B-BUYER_NAME islands (first B stays, others → I).

After running, every span begins with B-…, field-label tokens are O,
and the four invoices you pasted become BIO-valid.
"""

import json, re, pathlib
from tqdm.auto import tqdm

SRC = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-SingleFILE/qhuy_data/qhuy_data.jsonl"    # input
DST = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-SingleFILE/qhuy_data/qhuy_data_refined.jsonl"          # output

KEEP_PATTERNS = {
    "SELLER_TAX_CODE": re.compile(r"^\d{10}$"),
    "BUYER_TAX_CODE":  re.compile(r"^\d{10}$"),
    "SERIAL":          re.compile(r"^[0-9A-Z]{6,}$"),
    "INVOICE_NO":      re.compile(r"^\d{4,}$"),
    "TOTAL_AMOUNT":    re.compile(r"^\d+(\.\d+)+$"),
    "VAT_AMOUNT":      re.compile(r"^\d+(\.\d+)+$"),
    "TOTAL_PAYMENT":   re.compile(r"^\d+(\.\d+)+$"),
}

def keep_token(tag, token):
    if tag == "PAYMENT_METHOD":
        return "/" in token or token.lower().startswith("tm")
    pat = KEEP_PATTERNS.get(tag)
    return bool(pat and pat.match(token))

def clean_span(tokens, labels, tag):
    i = 0
    while i < len(labels):
        if labels[i] == f"B-{tag}":
            span = []
            j = i
            while j < len(labels) and labels[j].startswith(("B-"+tag, "I-"+tag)):
                span.append(j)
                j += 1
            # retain only tokens that match the keep-rule
            keep_idx = [idx for idx in span if keep_token(tag, tokens[idx])]
            if not keep_idx:                       # no real value → wipe span
                for idx in span: labels[idx] = "O"
            else:
                first = keep_idx[0]
                for idx in span:
                    if idx == first:
                        labels[idx] = f"B-{tag}"
                    elif idx in keep_idx:
                        labels[idx] = f"I-{tag}"
                    else:
                        labels[idx] = "O"
            i = j
        else:
            i += 1

def merge_buyer_name(labels):
    inside = False
    for k, lab in enumerate(labels):
        if lab.startswith(("B-BUYER_NAME", "I-BUYER_NAME")):
            if lab.startswith("B-"):
                if inside:
                    labels[k] = "I-BUYER_NAME"
                inside = True
        else:
            inside = False

def fix_record(rec):
    tok, lab = rec["words"], rec["labels"]
    for tag in ["SELLER_TAX_CODE", "BUYER_TAX_CODE",
                "SERIAL", "INVOICE_NO",
                "PAYMENT_METHOD",
                "TOTAL_AMOUNT", "VAT_AMOUNT", "TOTAL_PAYMENT"]:
        clean_span(tok, lab, tag)
    merge_buyer_name(lab)
    rec["labels"] = lab
    return rec

with pathlib.Path(DST).open("w", encoding="utf-8") as out_f, \
     pathlib.Path(SRC).open(encoding="utf-8") as in_f:
    for raw in tqdm(in_f, desc="BIO repair"):
        out_f.write(json.dumps(fix_record(json.loads(raw)), ensure_ascii=False) + "\n")

print("✅  All field-label leaks removed →", DST)


  """


BIO repair: 0it [00:00, ?it/s]

✅  All field-label leaks removed → /mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-SingleFILE/qhuy_data/qhuy_data_refined.jsonl
