In [2]:
# --- Imports & tokenizer ---
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer

# Default max_length handed to the tokenizer (truncation is enabled there),
# so titles that encode to more positions than this are cut off.
MAX_LEN = 160  # adjust if needed

# Checkpoint name passed to AutoTokenizer. use_fast=True is requested because
# the alignment code below calls word_ids() on the encoding.
TOKENIZER = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER, use_fast=True)

In [3]:
# --- Build stable label maps (ensure 'O' exists) ---
def build_label_maps(df: pd.DataFrame):
    """Derive deterministic tag <-> id lookups from the ``BIO_Tag`` column.

    Missing tags are dropped and the remaining values are compared as strings.
    "O" always receives id 0, whether or not it occurs in the data; every other
    distinct tag follows in sorted order.

    Args:
        df: Frame containing a ``BIO_Tag`` column.

    Returns:
        A ``(label2id, id2label)`` tuple of dicts that are inverses of each other.
    """
    observed = df["BIO_Tag"].dropna().astype(str).unique().tolist()

    # "O" is pinned to the front so its id does not depend on the data.
    ordered = ["O"]
    ordered.extend(sorted(tag for tag in observed if tag != "O"))

    label2id = dict(zip(ordered, range(len(ordered))))
    id2label = dict(enumerate(ordered))
    return label2id, id2label

In [4]:
# --- Tokenize one title (group) and align word-level BIO to subwords ---
def tokenize_align_group(group: pd.DataFrame, label2id: dict, max_len: int = MAX_LEN):
    """Encode one record's words and spread its word-level tags over the subwords.

    The ``Token`` values of ``group`` are tokenized as a pre-split word list with
    truncation at ``max_len``. One output row is produced per encoded position,
    special tokens included:

    * the first subword of each word carries that word's tag and its id;
    * every later subword of the same word, and every special token, carries
      ``-100`` so it is ignored in the loss.

    A tag that is empty or the string "nan" after string conversion is treated
    as "O". A tag text that is absent from ``label2id`` keeps its text in
    ``BIO_Label`` but receives the id of "O".

    Args:
        group: Rows of a single record with ``Record Number``, ``Token`` and
            ``BIO_Tag`` columns; the record number is taken from the first row.
        label2id: Tag -> id mapping; it must contain "O".
        max_len: ``max_length`` passed to the tokenizer.

    Returns:
        A list of dicts with the keys ``Record Number``, ``Subword``, ``WordID``
        (-1 for special tokens), ``BIO_Label``, ``Label_ID`` and ``Input_ID``.
    """
    word_list = group["Token"].astype(str).tolist()
    record_number = group["Record Number"].iloc[0]

    encoding = tokenizer(
        word_list,
        is_split_into_words=True,
        truncation=True,
        max_length=max_len,
        return_attention_mask=True,
    )
    word_index_per_piece = encoding.word_ids()
    input_ids = encoding["input_ids"]
    pieces = tokenizer.convert_ids_to_tokens(input_ids)

    word_tags = group["BIO_Tag"].astype(str).tolist()

    aligned = []
    previous_wid = None
    for position, wid in enumerate(word_index_per_piece):
        opens_new_word = wid is not None and wid != previous_wid
        if opens_new_word:
            raw_tag = word_tags[wid]
            # Missing tags arrive as "" or "nan" once converted to str.
            if raw_tag and raw_tag != "nan":
                label_txt = raw_tag
            else:
                label_txt = "O"
            label_id = label2id.get(label_txt, label2id["O"])
        else:
            # Special token or continuation subword: masked out of the loss.
            label_txt, label_id = "-100", -100

        aligned.append({
            "Record Number": record_number,
            "Subword": pieces[position],
            "WordID": wid if wid is not None else -1,
            "BIO_Label": label_txt,
            "Label_ID": label_id,
            "Input_ID": input_ids[position],
        })
        previous_wid = wid
    return aligned

In [5]:
# --- End-to-end: process a file → save aligned TSV ---
def align_and_save(src_path: Path, out_path: Path, label2id: dict = None):
    """Tokenize every record of a word-level BIO TSV and save the subword-aligned TSV.

    The source file is read as tab-separated text and must provide the columns
    ``Record Number``, ``Token`` and ``BIO_Tag``. Rows are grouped by
    ``Record Number`` in order of first appearance, each group is passed to
    ``tokenize_align_group``, and the collected rows are written to ``out_path``
    as a UTF-8 TSV without the index. Missing parent directories of ``out_path``
    are created. A one-line summary is printed at the end.

    Args:
        src_path: Path of the word-level input TSV.
        out_path: Path of the aligned output TSV.
        label2id: Optional tag -> id mapping to reuse. When omitted, the mapping
            is built from this file alone via ``build_label_maps``; ids then
            depend on which tags happen to occur in the file, so two files with
            different tag sets get different ids for the same tag. Pass the
            mapping built from the training data when processing other splits
            to keep ``Label_ID`` consistent across them.

    Raises:
        AssertionError: If a required column is missing.
        ValueError: If a supplied ``label2id`` has no "O" entry or lacks a tag
            that occurs in the file.
    """
    # Keys emitted by tokenize_align_group, in its order. Passing them
    # explicitly keeps the header row even when the input has no records.
    output_columns = ["Record Number", "Subword", "WordID", "BIO_Label", "Label_ID", "Input_ID"]

    # NOTE(review): default NA parsing turns literal tokens such as "NA" or
    # "null" into NaN (tokenized later as the string "nan") - confirm whether
    # the titles can contain such tokens.
    df = pd.read_csv(src_path, sep="\t")
    assert {"Record Number","Token","BIO_Tag"}.issubset(df.columns), "Missing required columns."

    if label2id is None:
        label2id, _ = build_label_maps(df)
    else:
        if "O" not in label2id:
            raise ValueError('label2id must contain the "O" tag.')
        # Tags missing from the map would be silently written with the id of
        # "O" downstream, so reject them here. Empty / "nan" tags are skipped
        # because the aligner treats them as "O".
        file_tags = df["BIO_Tag"].dropna().astype(str).unique().tolist()
        unknown = sorted(t for t in file_tags if t and t != "nan" and t not in label2id)
        if unknown:
            raise ValueError(f"Tags in {src_path} missing from label2id: {unknown}")

    aligned_rows = []
    for _, g in df.groupby("Record Number", sort=False):
        aligned_rows.extend(tokenize_align_group(g, label2id))

    out_path.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(aligned_rows, columns=output_columns).to_csv(
        out_path, sep="\t", index=False, encoding="utf-8"
    )
    print(f"Saved aligned TSV → {out_path} | rows: {len(aligned_rows)}")

In [6]:
# --- Run for train/val (edit paths if needed) ---
# One (source, destination) pair per split; the train split runs first, then
# validation. Each call builds its label map from its own source file.
split_jobs = [
    (
        Path("processed_data/Tagged_Titles_Train_train_with_BIO.tsv"),
        Path("processed_data/aligned_train_subwords.tsv"),
    ),
    (
        Path("processed_data/Tagged_Titles_Train_val_with_BIO.tsv"),
        Path("processed_data/aligned_val_subwords.tsv"),
    ),
]

for source_tsv, aligned_tsv in split_jobs:
    align_and_save(source_tsv, aligned_tsv)

Saved aligned TSV → processed_data/aligned_train_subwords.tsv | rows: 124489
Saved aligned TSV → processed_data/aligned_val_subwords.tsv | rows: 13560
