In [2]:
# ---- 1. Import dependencies ----
import csv
from pathlib import Path

import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# ---- 2. Read the original dataset ----
# The file holds one token per row, so every cell must be taken literally:
#   * quoting=csv.QUOTE_NONE - a quote character inside a token (e.g. an inch mark)
#     is data, not the start of a quoted field; with default quoting a stray quote
#     can swallow the rows that follow it.
#   * keep_default_na=False  - tokens such as "NA", "null" or "nan" stay strings
#     instead of being silently converted to missing values.
#   * na_values=[""]         - only truly empty cells (e.g. a blank Tag) become NaN.
df = pd.read_csv(
    "data/Tagged_Titles_Train.tsv",
    sep="\t",
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
    na_values=[""],
)

print("Dataset size:", len(df))
print("Columns:", df.columns.tolist())
df.head()

Dataset size: 56812
Columns: ['Record Number', 'Category', 'Title', 'Token', 'Tag']


Unnamed: 0,Record Number,Category,Title,Token,Tag
0,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,MINI,Kompatible_Fahrzeug_Marke
1,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,1.6,Kompatibles_Fahrzeug_Modell
2,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W10B16A,Herstellernummer
3,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,W11B16A,
4,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,R50,


In [3]:
# ---- 3. Build mapping from Record Number -> Category ----
# The split below is stratified on Category at the record level. That is only
# meaningful if no Record Number carries more than one distinct Category, so check
# it explicitly instead of letting .first() silently pick an arbitrary label.
cats_per_record = df.groupby("Record Number")["Category"].nunique()
conflicting = cats_per_record[cats_per_record > 1]
if not conflicting.empty:
    raise ValueError(
        "Record Numbers with more than one Category (showing up to 10): "
        f"{conflicting.index.tolist()[:10]}"
    )

rec2cat = (
    df.groupby("Record Number")["Category"]   # one group per Record Number, Category values only
      .first()                                # first non-null Category of each group
      .reset_index()                          # turn Record Number back into a normal column
      .rename(columns={"Record Number": "rid", "Category": "cat"})  # shorter column names
)

print("Total records:", len(rec2cat))
rec2cat.head()

Total records: 5000


Unnamed: 0,rid,cat
0,1,2
1,2,1
2,3,1
3,4,2
4,5,1


In [4]:
# ---- 4. Perform stratified random split (90% train / 10% validation) ----
# The split is drawn over records (one row of rec2cat each) with "cat" as the
# stratification label; the fixed random_state makes the shuffle repeatable.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.10, random_state=42)
split_iter = splitter.split(rec2cat[["rid"]], rec2cat["cat"])
train_idx, val_idx = next(split_iter)

# Translate the positional indices back into record ids.
rid_column = rec2cat["rid"]
train_rids = set(rid_column.iloc[train_idx])
val_rids = set(rid_column.iloc[val_idx])

for label, rids in (("Train records:", train_rids), ("Validation records:", val_rids)):
    print(label, len(rids))

Train records: 4500
Validation records: 500


In [5]:
# ---- 5. Filter the full dataframe into train and validation sets ----
# A row goes to whichever side holds its Record Number; .copy() detaches each
# split from the original frame.
record_col = df["Record Number"]
df_train = df.loc[record_col.isin(train_rids)].copy()
df_val = df.loc[record_col.isin(val_rids)].copy()

for label, split_df in (
    ("Train dataset size:", df_train),
    ("Validation dataset size:", df_val),
):
    print(label, len(split_df))

Train dataset size: 51184
Validation dataset size: 5628


In [6]:
# ---- 6. Create output folder ----
out_dir = Path("processed_data")
out_dir.mkdir(parents=True, exist_ok=True)  # no error if the folder already exists

# ---- 7. Save the splits as TSV files with clearer names ----
# Both splits are written the same way: tab-separated, UTF-8, without the index column.
outputs = (
    (df_train, "Tagged_Titles_Train_train.tsv"),
    (df_val, "Tagged_Titles_Train_val.tsv"),
)
for split_df, filename in outputs:
    split_df.to_csv(out_dir / filename, sep="\t", index=False, encoding="utf-8")

print("Files saved in 'processed_data/' ✅")

Files saved in 'processed_data/' ✅


In [26]:
# Preview the Title column for the first 27 rows of the token-level frame.
df["Title"].iloc[:27]

0     MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...
1     MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...
2     MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...
3     MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...
4     MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...
5     MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...
6     MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...
7     MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...
8     MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...
9     MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...
10    ATE Power Disc Bremsenset Mercedes SLC + SLK V...
11    ATE Power Disc Bremsenset Mercedes SLC + SLK V...
12    ATE Power Disc Bremsenset Mercedes SLC + SLK V...
13    ATE Power Disc Bremsenset Mercedes SLC + SLK V...
14    ATE Power Disc Bremsenset Mercedes SLC + SLK V...
15    ATE Power Disc Bremsenset Mercedes SLC + SLK V...
16    ATE Power Disc Bremsenset Mercedes SLC + SLK V...
17    ATE Power Disc Bremsenset Mercedes SLC + S