In [None]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Locations of the raw input (tab-separated) and of the cleaned CSV written at
# the end of the notebook. Both are relative to the current working directory.
INPUT_FILE = Path("raw_data.tsv")
OUTPUT_FILE = Path("clean_data.csv")

# Fail early, and say which path was looked up, instead of hitting a less
# obvious error further down when the file is read. is_file() also rejects a
# directory that happens to have the same name.
assert INPUT_FILE.is_file(), f"Input file not found: {INPUT_FILE.resolve()}"

In [None]:
# Load the raw tab-separated file. It is read without a header row, so the two
# columns are named explicitly; assigning exactly two names also fails fast if
# the file does not have exactly two columns.
# NOTE(review): with pandas' default quoting, a stray double quote inside a
# message can merge several lines into one field. If the row count looks short,
# consider quoting=csv.QUOTE_NONE -- confirm against the raw file.
df_input = pd.read_csv(INPUT_FILE, sep="\t", header=None)
df_input.columns = ["class", "text"]

# Encode the labels as integers (ham -> 0, spam -> 1). Series.map() turns any
# label that is missing from the mapping into NaN, which would silently make
# the column float and leak NaN into the split and the output, so check first.
label_codes = {"ham": 0, "spam": 1}
class_codes = df_input["class"].map(label_codes)
unmapped = df_input.loc[class_codes.isna(), "class"].unique()
assert len(unmapped) == 0, f"Unexpected class labels: {list(unmapped)}"
df_input["class"] = class_codes.astype("int64")

In [None]:
# De-duplicate on the message text: rows sharing the same "text" value are
# collapsed and only the first occurrence is kept.

print("Rows before dropping duplicates:", df_input.shape[0])
df_input = df_input.drop_duplicates(subset="text", keep="first")
print("Rows after dropping duplicates:", df_input.shape[0])

In [None]:
# Create an 80/20 train/test split, stratified on the class label, and record
# it in an "is_train" flag column (1 = train, 0 = test).

# Split on an index-sorted view of the frame. This cell re-orders df_input at
# the end, so without this step re-running the cell would feed a different row
# order into train_test_split and silently produce a different split.
df_ordered = df_input.sort_index()
assert df_ordered.index.is_unique, "Row index must be unique to flag the split"

train_idx, test_idx = train_test_split(
    df_ordered.index, test_size=0.2, random_state=42, stratify=df_ordered["class"]
)
assert len(train_idx) + len(test_idx) == len(df_ordered)

# Build a new frame with assign() rather than writing a column into the
# filtered frame, which can raise SettingWithCopyWarning on some pandas
# versions. The stable sort keeps the original row order inside each group
# (test rows first, then train rows), so the output ordering is reproducible.
df_input = (
    df_ordered.assign(is_train=df_ordered.index.isin(train_idx).astype("int64"))[
        ["is_train", "class", "text"]
    ]
    .sort_values(by="is_train", kind="mergesort")
)

print("Train set size:", (df_input["is_train"] == 1).sum())
print("Test set size:", (df_input["is_train"] == 0).sum())

In [None]:
# Write the split flag, label and text columns (in that order) to the output
# CSV, without the row index.
export_columns = ["is_train", "class", "text"]
df_input.to_csv(OUTPUT_FILE, columns=export_columns, index=False)