In [10]:
import json
from transformers import AutoTokenizer

# === Config ===
# Source: JSONL export, one annotated record per line.
input_path = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/data/invoices.jsonl"
# Destination: plain-text "token tag" file in BIO format.
# NOTE(review): the file name mentions PhoBERT, but the tokens written below
# come from the xlm-roberta-base tokenizer — confirm this is intended.
output_path = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/data/phobert_bio_format.txt"

# A "fast" tokenizer is required because only fast tokenizers can return
# offset mappings; here it is used purely to align spans with tokens.
tokenizer = AutoTokenizer.from_pretrained(
    "xlm-roberta-base",
    use_fast=True,
)

def char_span_to_bio(text, spans):
    """Tokenize *text* and assign a BIO tag to every token.

    Args:
        text: The raw string to tokenize with the module-level ``tokenizer``.
        spans: Iterable of ``(start, end, label)`` triples. ``start``/``end``
            are compared against the tokenizer's offset mapping, so they are
            expected to be offsets into ``text`` with ``end`` exclusive.

    Returns:
        A list of ``(token, tag)`` pairs, where ``tag`` is ``"O"``,
        ``"B-<label>"`` or ``"I-<label>"``. When spans overlap, the span that
        comes later in ``spans`` overwrites tags set by earlier ones.
    """
    encoding = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
    pieces = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
    piece_offsets = encoding['offset_mapping']

    # Everything starts outside any entity.
    bio = ["O" for _ in pieces]

    for span_start, span_end, label in spans:
        # The first token touching the span gets "B", every later one "I".
        prefix = "B"
        for idx, (piece_start, piece_end) in enumerate(piece_offsets):
            # Skip tokens that lie entirely before or after the span
            # (half-open interval overlap test, negated).
            if piece_start >= span_end or piece_end <= span_start:
                continue
            bio[idx] = f"{prefix}-{label}"
            prefix = "I"

    return list(zip(pieces, bio))


# Convert every JSONL record into "token tag" lines (one token per line),
# with an empty line after each record.
bio_lines = []

with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        # A blank or whitespace-only line is not a valid JSON document and
        # would make json.loads raise; skip it instead of aborting the run.
        if not line.strip():
            continue

        data = json.loads(line)
        text = data["text"]
        # Records without a "label" key are treated as having no entities.
        spans = data.get("label", [])
        token_tags = char_span_to_bio(text, spans)

        bio_lines.extend(f"{token} {tag}" for token, tag in token_tags)
        # Empty entry -> blank separator line between records in the output.
        bio_lines.append("")

with open(output_path, "w", encoding="utf-8") as out:
    out.write("\n".join(bio_lines))

print(f"✅ BIO tagging done! Saved to: {output_path}")


✅ BIO tagging done! Saved to: /mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/data/phobert_bio_format.txt
