In [2]:
# --- Imports ---
import pandas as pd
from pathlib import Path

In [3]:
# --- Load data (TSV) ---
src_path = Path("processed_data/Tagged_Titles_Train_train.tsv")  # adjust if needed
# Treat only truly empty fields as missing. With pandas' default NA list,
# literal tokens such as "NA", "null" or "None" would be silently turned
# into NaN, which corrupts token-level data.
df = pd.read_csv(src_path, sep="\t", keep_default_na=False, na_values=[""])
print("Loaded:", src_path, "| rows:", len(df))
df.head()

Loaded: processed_data/Tagged_Titles_Train_train.tsv | rows: 51184


Unnamed: 0,Record Number,Category,Title,Token,Tag
0,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,MINI,Kompatible_Fahrzeug_Marke
1,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,1.6,Kompatibles_Fahrzeug_Modell
2,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W10B16A,Herstellernummer
3,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W11B16A,
4,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,R50,


In [4]:
# --- Convert Tag -> BIO ---
# Rules:
# - Empty / NaN / "O" -> "O"
# - For non-"O" tags: first token of a contiguous span -> B-<ASPECT>, subsequent -> I-<ASPECT>
def to_bio(group: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``group`` with a ``BIO_Tag`` column derived from ``Tag``.

    Rows are scanned in their existing order. A tag that is missing, blank
    after stripping whitespace, or equal to "O" (case-insensitive) becomes
    "O" and closes any open span. Any other tag becomes ``I-<tag>`` when the
    row immediately before it carried the same stripped tag, and ``B-<tag>``
    otherwise.

    Parameters
    ----------
    group : pd.DataFrame
        Rows to convert (the notebook passes the rows of one
        "Record Number"); must contain a ``Tag`` column.

    Returns
    -------
    pd.DataFrame
        A new frame with the same index and columns plus ``BIO_Tag``.
        The input frame is not modified: mutating the object handed to a
        ``groupby(...).apply`` callback is not supported by pandas.
    """
    # NOTE(review): blank tags are treated as outside any span here. If the
    # upstream annotation scheme uses a blank tag to continue the previous
    # token's span, this rule needs revisiting -- confirm against the
    # dataset specification.
    tags = group["Tag"].fillna("").astype(str).str.strip()
    bio = []
    prev = "O"  # tag of the previous row; "O" means no span is open
    for t in tags:
        if t == "" or t.upper() == "O":
            bio.append("O")
            prev = "O"
        else:
            # Same tag as the row right before -> continue the span.
            bio.append(f"I-{t}" if prev == t else f"B-{t}")
            prev = t
    # assign() builds a new frame instead of writing into the caller's object.
    return group.assign(BIO_Tag=bio)

# Apply per title (Record Number) to preserve order/contiguity.
# The columns (including the grouping column) are selected explicitly so
# that to_bio keeps receiving and returning "Record Number". Relying on the
# implicit inclusion of the grouping column in apply() is deprecated in
# pandas and emits a warning.
df_bio = (
    df.groupby("Record Number", sort=False, group_keys=False)[list(df.columns)]
    .apply(to_bio)
)
df_bio.head()

  df_bio = df.groupby("Record Number", sort=False, group_keys=False).apply(to_bio)


Unnamed: 0,Record Number,Category,Title,Token,Tag,BIO_Tag
0,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,MINI,Kompatible_Fahrzeug_Marke,B-Kompatible_Fahrzeug_Marke
1,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,1.6,Kompatibles_Fahrzeug_Modell,B-Kompatibles_Fahrzeug_Modell
2,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W10B16A,Herstellernummer,B-Herstellernummer
3,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W11B16A,,O
4,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,R50,,O


In [5]:
# --- Save BIO-augmented TSV ---
# Ensure the destination folder exists (no error if it is already there).
out_dir = Path("processed_data")
out_dir.mkdir(parents=True, exist_ok=True)

# Write tab-separated, UTF-8 encoded, without the DataFrame index.
out_path = out_dir.joinpath("Tagged_Titles_Train_train_with_BIO.tsv")
df_bio.to_csv(out_path, sep="\t", index=False, encoding="utf-8")
print("Saved:", out_path, "| rows:", df_bio.shape[0])

Saved: processed_data/Tagged_Titles_Train_train_with_BIO.tsv | rows: 51184


In [6]:
# --- Process validation split (Tagged_Titles_Train_val.tsv) ---

In [7]:
# --- Load data (TSV) ---
src_path = Path("processed_data/Tagged_Titles_Train_val.tsv")
# Treat only truly empty fields as missing. With pandas' default NA list,
# literal tokens such as "NA", "null" or "None" would be silently turned
# into NaN, which corrupts token-level data.
df = pd.read_csv(src_path, sep="\t", keep_default_na=False, na_values=[""])
print("Loaded:", src_path, "| rows:", len(df))
df.head()

Loaded: processed_data/Tagged_Titles_Train_val.tsv | rows: 5628


Unnamed: 0,Record Number,Category,Title,Token,Tag
0,3,1,Textar Bremsscheiben + Bremsbeläge hinten für ...,Textar,Hersteller
1,3,1,Textar Bremsscheiben + Bremsbeläge hinten für ...,Bremsscheiben,Im_Lieferumfang_Enthalten
2,3,1,Textar Bremsscheiben + Bremsbeläge hinten für ...,+,O
3,3,1,Textar Bremsscheiben + Bremsbeläge hinten für ...,Bremsbeläge,Im_Lieferumfang_Enthalten
4,3,1,Textar Bremsscheiben + Bremsbeläge hinten für ...,hinten,Einbauposition


In [8]:
# --- Convert Tag -> BIO ---
# The columns (including the grouping column) are selected explicitly so
# that to_bio keeps receiving and returning "Record Number". Relying on the
# implicit inclusion of the grouping column in apply() is deprecated in
# pandas and emits a warning.
df_bio = (
    df.groupby("Record Number", sort=False, group_keys=False)[list(df.columns)]
    .apply(to_bio)
)

  df_bio = df.groupby("Record Number", sort=False, group_keys=False).apply(to_bio)


In [9]:
# --- Save BIO-augmented TSV ---
# Ensure the destination folder exists (no error if it is already there).
out_dir = Path("processed_data")
out_dir.mkdir(parents=True, exist_ok=True)

# Write tab-separated, UTF-8 encoded, without the DataFrame index.
out_path = out_dir.joinpath("Tagged_Titles_Train_val_with_BIO.tsv")
df_bio.to_csv(out_path, sep="\t", index=False, encoding="utf-8")
print("Saved:", out_path, "| rows:", df_bio.shape[0])

Saved: processed_data/Tagged_Titles_Train_val_with_BIO.tsv | rows: 5628
