In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download

# Fetch the NLTK resources this notebook relies on: 'punkt' (tokenizer data,
# presumably what word_tokenize needs) and 'stopwords' (the corpus read via
# nltk.corpus.stopwords). When a package is already installed, NLTK just
# reports it as up-to-date.
download('punkt')
download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ujandasgupta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ujandasgupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Loading Data

In [2]:
def load_data(file_path):
    """
    Read a CSV file into a pandas DataFrame.

    Parameters:
    - file_path: Location of the CSV file; handed unchanged to
      ``pd.read_csv``.

    Returns:
    - pandas.DataFrame: The parsed contents of the file.
    """
    frame = pd.read_csv(file_path)
    return frame

# Preprocessing the Data

In [3]:
def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        # Reading the corpus is comparatively slow; keep the result on the
        # function object so that per-row calls reuse the same set.
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """
    Convert text to lowercase, remove non-alphabetic characters,
    and remove stopwords.

    Parameters:
    - text (str): The email text to preprocess. Non-string values
      (for example the NaN that pandas produces for an empty CSV cell,
      or None) are treated as missing text.

    Returns:
    - str: The preprocessed email text, with the remaining tokens joined
      by single spaces. An empty string is returned for missing input or
      when nothing is left after cleaning.
    """
    # Missing cells are not strings and have no .lower(); treat them as
    # empty text rather than raising in the middle of a DataFrame.apply.
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()

    # Remove non-alphabetic characters (everything except a-z and whitespace)
    text = re.sub(r'[^a-z\s]', '', text)

    # Nothing left to tokenize: same result as tokenizing blank text.
    if not text.strip():
        return ''

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords (set is loaded once and reused across calls)
    stop_words = _english_stop_words()
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Re-join tokens into a single string
    return ' '.join(filtered_tokens)

In [4]:
def preprocess_data(data):
    """
    Return a copy of ``data`` whose 'text' column has been preprocessed.

    The input DataFrame is left untouched so that re-running the calling
    cell always starts from the same raw data.

    Parameters:
    - data (pandas.DataFrame): Frame containing a 'text' column.

    Returns:
    - pandas.DataFrame: A new frame with the same columns, where every
      value of 'text' has been passed through ``preprocess_text``.

    Raises:
    - KeyError: If ``data`` has no 'text' column.
    """
    # Apply text preprocessing to the 'text' column
    cleaned_text = data['text'].apply(preprocess_text)
    # assign() builds a new DataFrame instead of overwriting the caller's column.
    return data.assign(text=cleaned_text)

# Splitting the Data

In [5]:
def split_data(data, test_size=0.2, validation_size=0.25, random_state=42):
    """
    Split a dataset into train, validation and test subsets.

    Both ``test_size`` and ``validation_size`` are fractions of the *whole*
    dataset, so the defaults give roughly a 55% / 25% / 20%
    train / validation / test split.

    Parameters:
    - data: The dataset to split (anything ``train_test_split`` accepts).
    - test_size (float): Fraction of all rows to hold out for testing.
    - validation_size (float): Fraction of all rows to hold out for
      validation.
    - random_state (int): Seed passed to both shuffles so the split is
      reproducible.

    Returns:
    - tuple: ``(train_data, validation_data, test_data)``.

    Raises:
    - ValueError: If either size is not strictly between 0 and 1, or if the
      two sizes together leave no rows for training.
    """
    if not (0 < test_size < 1 and 0 < validation_size < 1
            and test_size + validation_size < 1):
        raise ValueError(
            "test_size and validation_size must each be in (0, 1) and sum to less than 1"
        )

    # Hold out the test set first; everything else is still to be divided
    # between training and validation.
    remaining_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # validation_size is relative to the full dataset, but the second split
    # only sees the (1 - test_size) remainder, so rescale the fraction.
    validation_size_adjusted = validation_size / (1 - test_size)
    train_data, validation_data = train_test_split(
        remaining_data, test_size=validation_size_adjusted, random_state=random_state
    )

    return train_data, validation_data, test_data

# Storing the Splits

In [6]:
def store_splits(train_data, validation_data, test_data, train_path='train.csv', validation_path='validation.csv', test_path='test.csv'):
    """
    Write the three splits to CSV files, without the index column.

    Parameters:
    - train_data, validation_data, test_data: Objects exposing ``to_csv``
      (e.g. pandas DataFrames) holding each split.
    - train_path, validation_path, test_path: Destination of each CSV file.

    Returns:
    - None
    """
    # Pair every split with its destination; written in train, validation,
    # test order.
    destinations = (
        (train_data, train_path),
        (validation_data, validation_path),
        (test_data, test_path),
    )
    for split_frame, csv_path in destinations:
        split_frame.to_csv(csv_path, index=False)

# Main function to execute the steps

In [7]:
def main(file_path):
    """
    Run the whole pipeline: load, preprocess, split and store.

    Parameters:
    - file_path: Path of the CSV file to load.

    Returns:
    - None. The splits are written out by ``store_splits`` using its
      default file paths.
    """
    cleaned = preprocess_data(load_data(file_path))
    train_split, validation_split, test_split = split_data(cleaned)
    store_splits(train_split, validation_split, test_split)

In [8]:
# Entry point: run the full pipeline on 'emails.csv' (a relative path, so it
# is resolved against the current working directory). The guard keeps the
# pipeline from running when this code is imported rather than executed.
if __name__ == '__main__':
    file_path = 'emails.csv' 
    main(file_path)