In [1]:
# ── Fix word-level BIO tags so every span starts with B- ────────────────
import json, pathlib
from tqdm.auto import tqdm

SRC = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-SingleFILE/nghia_data/nghia_data.jsonl"          # ← your current file
DST = "nghia_data_refined.jsonl"    # ← output with corrected BIO

def fix_bio_sequence(labels):
    """Return a new list where the first token of each entity is 'B-TAG'
       and all following contiguous tokens of that entity are 'I-TAG'."""
    fixed = []
    prev_entity = None
    for lab in labels:
        if lab == "O":
            fixed.append("O")
            prev_entity = None
            continue

        prefix, entity = lab.split("-", 1)

        if prev_entity != entity:             # new span starts here
            fixed.append(f"B-{entity}")
        else:                                 # continuation of current span
            fixed.append(f"I-{entity}")

        prev_entity = entity
    return fixed

with pathlib.Path(DST).open("w", encoding="utf-8") as out_f:
    with pathlib.Path(SRC).open(encoding="utf-8") as in_f:
        for line in tqdm(in_f, desc="Fixing BIO"):
            rec = json.loads(line)
            rec["labels"] = fix_bio_sequence(rec["labels"])
            json.dump(rec, out_f, ensure_ascii=False)
            out_f.write("\n")

print(f"✔️  Saved corrected file → {DST}")


Fixing BIO: 0it [00:00, ?it/s]

✔️  Saved corrected file → nghia_data_refined.jsonl
