In [6]:
# Minimal, production-ready cleaner for eBay NER TSV -> titles-only TSV
import pandas as pd
from pathlib import Path

# --- paths (edit if your file paths differ) ---
in_path = Path("processed_data/Tagged_Titles_Train_val.tsv")
out_path = Path("model_input/Tagged_Titles_TitlesOnly.tsv")

# Columns (after header normalization) that identify one title record.
required = ["Record_Number", "Category", "Title"]


def clean_fields(frame):
    """Return a cleaned copy of the ``required`` columns of *frame*.

    Every required column is cast to ``str`` and stripped, runs of
    whitespace inside ``Title`` are collapsed to a single space, and rows
    where any required field ends up empty are dropped. *frame* itself is
    not modified.
    """
    cleaned = frame[required].copy()

    # 1) Trim leading/trailing whitespace on every kept column.
    for col in required:
        cleaned[col] = cleaned[col].astype(str).str.strip()

    # 2) Collapse internal whitespace runs in Title to a single space.
    cleaned["Title"] = cleaned["Title"].str.replace(r"\s+", " ", regex=True)

    # 3) Keep only rows where every critical field is non-empty.
    all_present = (cleaned[required] != "").all(axis=1)
    # Explicit copy so later column assignments never hit a filtered view.
    return cleaned[all_present].copy()


def sort_records(frame):
    """Return *frame* stably sorted by the ``required`` columns.

    ``Record_Number`` is converted to ``int`` when every value parses as an
    integer, which gives a numeric ordering. Otherwise the column is left
    as strings and the ordering is lexicographic (so "10" sorts before
    "2"). *frame* itself is not modified.
    """
    ordered = frame.copy()
    try:
        ordered["Record_Number"] = ordered["Record_Number"].astype(int)
    except (ValueError, OverflowError):
        # Not all record numbers are representable as integers: keep the
        # string values and fall through to the same sort call.
        pass
    return ordered.sort_values(by=required, kind="stable")


# Run the pipeline only when executed as a script / notebook cell (where
# __name__ is "__main__"); the names below remain module-level globals.
if __name__ == "__main__":
    # --- read ---
    # Use python engine for robust tab parsing; keep strings as-is (no NA coercion)
    # NOTE(review): default quoting treats '"' as a quote character; if titles
    # can contain literal double quotes, reading with quoting=csv.QUOTE_NONE
    # may be required — confirm against the data before changing.
    df = pd.read_csv(in_path, sep="\t", engine="python", dtype=str, keep_default_na=False)

    # --- normalize column names (strip spaces, unify underscores) ---
    df.columns = [c.strip().replace(" ", "_") for c in df.columns]

    # --- sanity check required columns ---
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}. Found columns: {list(df.columns)}")

    # --- select only the needed columns + light cleaning ---
    out = clean_fields(df)

    # --- deduplicate ---
    # Keep the first occurrence per (Record_Number, Category, Title)
    before = len(out)
    out = out.drop_duplicates(subset=required, keep="first")
    deduped = before - len(out)

    # --- typing & sort for readability ---
    out = sort_records(out)

    # --- optional: rename columns back to desired header style ---
    out = out.rename(columns={"Record_Number": "Record Number"})

    # --- write ---
    # Create the output directory first so a fresh checkout does not fail here.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(out_path, sep="\t", index=False, encoding="utf-8")

    print(f"Rows in (token-level): {len(df):,}")
    print(f"Rows out (titles-only): {len(out):,}")
    print(f"Deduplicated rows: {deduped:,}")
    print(f"Saved to: {out_path.resolve()}")

Rows in (token-level): 5,628
Rows out (titles-only): 500
Deduplicated rows: 5,128
Saved to: /Users/zhangxijing/ebay-ner-challenge/model_input/Tagged_Titles_TitlesOnly.tsv
