# In [None]:
# SMS Spam Classification – Data Preparation Notebook

import os
import pandas as pd
from sklearn.model_selection import train_test_split

# -----------------------------
# 1. Load data
# -----------------------------

def load_data(file_path):
    """
    Load the SMS spam dataset from a tab-separated file.

    Expected format (UCI SMS Spam Collection), one record per line and
    no header row:
    label<TAB>message

    Parameters
    ----------
    file_path : location of the file to read; handed straight to
        ``pandas.read_csv``.

    Returns
    -------
    DataFrame with two columns: 'label' (first field) and 'text'
    (second field), both exactly as they appear in the file.
    """
    import csv  # local import so the block does not depend on file-level imports

    # SMS text is free-form and can contain unbalanced double quotes. With
    # pandas' default quoting, a field that starts with '"' is read as a
    # quoted field that runs until the next '"', which silently merges
    # several records into one. QUOTE_NONE treats quotes as ordinary
    # characters, so every line yields exactly one row.
    return pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        names=['label', 'text'],
        quoting=csv.QUOTE_NONE,
    )


# -----------------------------
# 2. Preprocess data
# -----------------------------

def preprocess_data(df):
    """
    Return a cleaned copy of the dataset; the input frame is not modified.

    Steps applied:
    - 'text' is lowercased, then leading/trailing whitespace is removed
    - 'label' is mapped to an integer code: 'ham' -> 0, 'spam' -> 1
      (any other label value has no entry in the mapping and becomes NaN)
    """
    label_codes = {'ham': 0, 'spam': 1}

    cleaned = df.copy()

    # Normalise the message text in two explicit steps.
    lowered_text = cleaned['text'].str.lower()
    cleaned['text'] = lowered_text.str.strip()

    # Replace the string labels with their binary codes.
    cleaned['label'] = cleaned['label'].map(label_codes)

    return cleaned


# -----------------------------
# 3. Split data
# -----------------------------

def split_data(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """
    Partition the data into train, validation, and test subsets.

    The split is done in two stratified stages, both keyed on the 'label'
    column and both using the same random_state:
    1. A fraction (1 - train_size) of df is held out; the rest is train.
    2. The held-out part is divided into validation and test, with the
       test share equal to test_size / (val_size + test_size).

    Note that val_size and test_size only determine the ratio in which
    the held-out part is divided; the size of the held-out part itself
    comes from train_size alone.

    Returns a (train, validation, test) tuple.
    """
    # Stage 1: carve the hold-out set off the full dataset.
    holdout_fraction = 1 - train_size
    train_part, holdout_part = train_test_split(
        df,
        test_size=holdout_fraction,
        stratify=df['label'],
        random_state=random_state
    )

    # Stage 2: divide the hold-out set between validation and test.
    test_share_of_holdout = test_size / (val_size + test_size)
    val_part, test_part = train_test_split(
        holdout_part,
        test_size=test_share_of_holdout,
        stratify=holdout_part['label'],
        random_state=random_state
    )

    return train_part, val_part, test_part


# -----------------------------
# 4. Save splits
# -----------------------------

def save_splits(train_df, val_df, test_df, out_dir="."):
    """
    Write the three splits to CSV files inside out_dir.

    The directory is created if it does not exist yet. Files written
    (without the DataFrame index): train.csv, validation.csv, test.csv.
    """
    os.makedirs(out_dir, exist_ok=True)

    # (file name, frame) pairs, written in this order.
    outputs = (
        ('train.csv', train_df),
        ('validation.csv', val_df),
        ('test.csv', test_df),
    )
    for file_name, frame in outputs:
        destination = os.path.join(out_dir, file_name)
        frame.to_csv(destination, index=False)


# -----------------------------
# Example execution
# -----------------------------
if __name__ == "__main__":
    # End-to-end run of the pipeline: load -> preprocess -> split -> save.
    # The names assigned here are module-level, so they remain available
    # after this section has run.
    data_path = "SMSSpamCollection"  # update path if needed
    df = load_data(data_path)
    df = preprocess_data(df)
    # Defaults of split_data apply: train_size=0.7, val_size=0.15,
    # test_size=0.15, random_state=42.
    train_df, val_df, test_df = split_data(df)
    # No out_dir is given, so the CSV files go to the default "." directory.
    save_splits(train_df, val_df, test_df)
    print("Data preparation completed.")
