In [3]:
# ----- fix_bio_jsonl.ipynb cell -----
import json
from pathlib import Path

# ========= CONFIG =========
# Source JSONL file to read (put your source file here).
INPUT_PATH = Path("/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/BIO-TAGGING-SingleFILE/qhuy_data/lephan.jsonl")
# Destination file for the corrected records.
OUTPUT_PATH = Path("invoices_fixed.jsonl")
# Set to True to rename the "labels" field to "ner_tags" in the output.
RENAME_LABELS_FIELD = False
# ===========================


def bio_fix(label_list):
    """
    Repair a BIO tag sequence so that every entity span opens with a B- tag.

    An ``I-XX`` tag is rewritten to ``B-XX`` when it is the first tag, follows
    an ``O``, or follows a tag of a different entity type. Every other tag is
    kept as is. The input list is left untouched; a new list is returned.
    """
    repaired = label_list.copy()
    # Entity type of the preceding tag; None at sequence start and after "O".
    open_type = None

    for pos, label in enumerate(label_list):
        if label != "O":
            marker, kind = label.split("-", 1)
            # open_type is None at the start / after "O", so a single
            # type comparison covers all three "illegal I-" situations.
            if marker == "I" and kind != open_type:
                repaired[pos] = f"B-{kind}"
            open_type = kind
        else:
            open_type = None

    return repaired


# Stream the input JSONL record by record, repair each record's BIO tags and
# write the result to OUTPUT_PATH (one JSON object per line).
# fixed_lines counts the records written; skipped blank lines are not counted.
fixed_lines = 0
with INPUT_PATH.open(encoding="utf-8") as fin, OUTPUT_PATH.open("w", encoding="utf-8") as fout:
    for line in fin:
        # JSONL files frequently contain blank lines (typically a trailing
        # one); json.loads would raise JSONDecodeError on them and abort the
        # run half-way, so skip them instead.
        if not line.strip():
            continue

        record = json.loads(line)

        # The original field can be either "labels" or "ner_tags".
        key = "labels" if "labels" in record else "ner_tags"
        record[key] = bio_fix(record[key])

        # Optionally expose the repaired tags under "ner_tags" instead.
        if RENAME_LABELS_FIELD and key == "labels":
            record["ner_tags"] = record.pop("labels")

        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")
        fixed_lines += 1

print(f"Done. Processed {fixed_lines} records → {OUTPUT_PATH}")


Done. Processed 22 records → invoices_fixed.jsonl
