In [None]:
# Tagged_Titles_Train_val.tsv
# Tagged_Titles_Train_val_test.tsv
# predictions_token_logreg.tsv
# predictions_token_level_crf.tsv               Final score: 0.775856
# predictions_token_level_xlmroberta_0.tsv      Final score: 0.764804
# predictions_token_level_bilstm_char_gaz_crf.tsv

In [75]:
# Weighted F_beta evaluator (beta=0.2) for token-level TSVs
# Input files must have columns: Record Number, Category, Title, Token, Tag
import pandas as pd
import numpy as np
from pathlib import Path

# ====== user inputs ======
GOLD_PATH = Path("processed_data/Tagged_Titles_Train_val.tsv")  # labeled file (token-level)
PRED_PATH = Path("model_output/predictions_token_level_bilstm_char_gaz_crf.tsv")                 # predicted file (token-level)
BETA = 0.2
EXCLUDE = {"O"}  # exclude tag "O" from evaluation
# =========================

def load_token_tsv(p: Path) -> pd.DataFrame:
    """Load token-level TSV and normalize column names / whitespace."""
    df = pd.read_csv(p, sep="\t", dtype=str, keep_default_na=False, engine="python")
    df.columns = [c.strip() for c in df.columns]
    need = ["Record Number", "Category", "Title", "Token", "Tag"]
    miss = [c for c in need if c not in df.columns]
    if miss: raise ValueError(f"Missing columns {miss} in {p}")
    for c in need:
        df[c] = df[c].astype(str).str.strip()
    return df

def collapse_to_aspects(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse consecutive same-Tag tokens into aspect values per (Record, Category)."""
    rows = []
    for (rn, cat), g in df.groupby(["Record Number", "Category"], sort=False):
        curr_tag, buf = None, []
        def flush():
            nonlocal curr_tag, buf
            if curr_tag and len(buf) > 0:
                val = " ".join(buf).strip()
                val = " ".join(val.split())  # collapse internal spaces
                if val:
                    rows.append((rn, cat, curr_tag, val))
            curr_tag, buf = None, []
        for _, r in g.iterrows():  # keep original row order
            tag, tok = r["Tag"], r["Token"]
            if tag in EXCLUDE or tag == "":
                flush()
                continue
            if curr_tag is None or tag != curr_tag:
                flush()
                curr_tag, buf = tag, [tok]
            else:
                buf.append(tok)
        flush()
    out = pd.DataFrame(rows, columns=["Record Number", "Category", "Aspect", "Value"])
    # de-dup within same (record, category, aspect, value)
    out = out.drop_duplicates(ignore_index=True)
    return out

def index_by_cat_aspect(df: pd.DataFrame):
    """Return dict: (cat, aspect) -> set of (record_number, value)."""
    s = df.copy()
    s["Value"] = s["Value"].apply(lambda x: " ".join(str(x).split()))
    idx = {}
    for (cat, asp), g in s.groupby(["Category", "Aspect"], sort=False):
        idx[(cat, asp)] = {(rn, v) for rn, v in zip(g["Record Number"], g["Value"])}
    return idx

def evaluate_token_files(gold_tok: pd.DataFrame, pred_tok: pd.DataFrame, beta=0.2):
    """Compute weighted F_beta per spec; final score = mean across categories."""
    gold_aspects = collapse_to_aspects(gold_tok[~gold_tok["Tag"].isin(EXCLUDE)])
    pred_aspects = collapse_to_aspects(pred_tok[~pred_tok["Tag"].isin(EXCLUDE)])

    gidx = index_by_cat_aspect(gold_aspects)
    pidx = index_by_cat_aspect(pred_aspects)

    cats = sorted({c for (c, _) in gidx.keys()}, key=lambda x: (str(x).isdigit(), x))
    b2 = beta * beta
    cat_scores = {}
    detail = []

    for cat in cats:
        aspects = sorted({a for (c, a) in gidx.keys() if c == cat})
        total_gold = sum(len(gidx[(cat, a)]) for a in aspects)
        if total_gold == 0:
            cat_scores[cat] = 0.0
            continue
        acc = 0.0
        for a in aspects:
            gset = gidx[(cat, a)]
            pset = pidx.get((cat, a), set())
            inter = len(gset & pset)
            n_pred, n_gold = len(pset), len(gset)
            prec = inter / n_pred if n_pred > 0 else 0.0
            rec  = inter / n_gold if n_gold > 0 else 0.0
            f = 0.0 if (n_pred == 0 or prec == 0.0 or rec == 0.0) else (1+b2)*prec*rec/(b2*prec+rec)
            w = n_gold / total_gold
            acc += w * f
            detail.append([cat, a, n_gold, n_pred, inter, prec, rec, f, w, w*f])
        cat_scores[cat] = acc

    final_score = float(np.mean(list(cat_scores.values()))) if cat_scores else 0.0
    det_df = pd.DataFrame(detail, columns=[
        "Category","Aspect","GoldCount","PredCount","Intersection",
        "Precision","Recall","F_beta","Weight","Weighted_F_beta"
    ])
    cat_df = pd.DataFrame(
        [{"Category": c, "Category_F_beta": round(s, 6)} for c, s in cat_scores.items()]
    )
    return final_score, cat_df.sort_values("Category"), det_df.sort_values(["Category","Aspect"])

# --- Evaluate and pretty-print for β=0.2 and β=1.0 (F1) ---
gold_tok = load_token_tsv(GOLD_PATH)
pred_tok = load_token_tsv(PRED_PATH)

def run_eval(beta: float):
    """Return (final_score, cat_scores_df, details_df) for a given beta."""
    return evaluate_token_files(gold_tok, pred_tok, beta=beta)

# main metrics
final_beta, cat_beta, det_beta = run_eval(BETA)
final_f1,   cat_f1,   det_f1   = run_eval(1.0)

# prints
print(f"Final score (mean category weighted Fβ, β={BETA}): {final_beta:.6f}")
print(f"Final F1 score (mean category weighted Fβ, β=1.0): {final_f1:.6f}\n")

# tables
display(cat_beta.reset_index(drop=True))
display(det_beta.head(20))  # preview first 20 rows

Final score (mean category weighted Fβ, β=0.2): 0.684877
Final F1 score (mean category weighted Fβ, β=1.0): 0.662427



Unnamed: 0,Category,Category_F_beta
0,1,0.740777
1,2,0.628977


Unnamed: 0,Category,Aspect,GoldCount,PredCount,Intersection,Precision,Recall,F_beta,Weight,Weighted_F_beta
0,1,Anzahl_Der_Einheiten,13,4,4,1.0,0.307692,0.920354,0.007688,0.007075
1,1,Besonderheiten,1,1,1,1.0,1.0,1.0,0.000591,0.000591
2,1,Bremsscheiben-Aussendurchmesser,108,104,93,0.894231,0.861111,0.89291,0.063868,0.057028
3,1,Bremsscheibenart,32,24,23,0.958333,0.71875,0.946203,0.018924,0.017906
4,1,Einbauposition,238,240,213,0.8875,0.894958,0.887785,0.140745,0.124951
5,1,Größe,1,1,1,1.0,1.0,1.0,0.000591,0.000591
6,1,Hersteller,177,173,168,0.971098,0.949153,0.970235,0.104672,0.101556
7,1,Herstellernummer,7,9,1,0.111111,0.142857,0.112069,0.00414,0.000464
8,1,Im_Lieferumfang_Enthalten,336,338,284,0.840237,0.845238,0.840428,0.198699,0.166992
9,1,Kompatible_Fahrzeug_Marke,263,213,197,0.924883,0.749049,0.916607,0.155529,0.142559
