## Load the dataset


In [45]:
import pandas as pd
import re
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords


In [46]:
# Location of the raw email dataset, relative to the notebook's working directory
file_path = 'emails.csv'
# Read the whole CSV into a DataFrame; the preview below shows a 'text' and a 'spam' column
data = pd.read_csv(file_path)
# Preview the first rows to confirm the file loaded as expected
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


## Remove the "Subject:" prefix

In [47]:

# Strip the leading "Subject:" marker and any whitespace that follows it.
# The ^ anchor limits the removal to the very start of each email, so a later
# occurrence of "Subject:" inside the body is left untouched.
data['text'] = data['text'].str.replace(r'^Subject:\s*', '', regex=True)
data.head()

Unnamed: 0,text,spam
0,naturally irresistible your corporate identity...,1
1,the stock trading gunslinger fanny is merrill...,1
2,unbelievable new homes made easy im wanting t...,1
3,4 color printing special request additional i...,1
4,"do not have money , get software cds from here...",1


## Convert all email text to lowercase


In [48]:
# Lowercase every email so that later steps treat e.g. "Free" and "free" as the same word
data['text'] = data['text'].str.lower()
data.head()

Unnamed: 0,text,spam
0,naturally irresistible your corporate identity...,1
1,the stock trading gunslinger fanny is merrill...,1
2,unbelievable new homes made easy im wanting t...,1
3,4 color printing special request additional i...,1
4,"do not have money , get software cds from here...",1


## Replace numbers and remove special characters


In [49]:
# A run of digits, optionally continued by thousands/decimal groups ("1,000.50", "3.5").
# Treating the separators as part of the number keeps one amount from being split
# into several placeholder tokens.
_NUMBER_PATTERN = r'\d+(?:[.,]\d+)*'
# Currency amounts with the dollar sign before or after the number: "$100", "$ 1,000", "20$"
_MONEY_RE = re.compile(r'\$ ?' + _NUMBER_PATTERN + r'|' + _NUMBER_PATTERN + r' ?\$')
# Stand-alone numbers; the \b anchors leave digits embedded in words ("mp3") alone
_NUMBER_RE = re.compile(r'\b' + _NUMBER_PATTERN + r'\b')
# Everything that is not a letter, digit or whitespace. "_" is listed explicitly
# because \w includes it, so [^\w\s] alone would let underscores through.
_SPECIAL_CHAR_RE = re.compile(r'[^\w\s]|_')


def replace_numbers_and_remove_punctuations(text):
    """Replace numeric content with placeholder words and strip special characters.

    The steps run in this order (the order matters: money must be handled
    before plain numbers, and both before punctuation is removed):

    1. Dollar amounts ("$ 100", "1,000.50 $") become the word ``moneynumber``.
    2. Remaining stand-alone numbers ("2020", "3.5") become the word ``number``.
    3. Every character that is not a letter, digit or whitespace is deleted,
       including underscores.

    Placeholders are inserted with surrounding spaces so they never fuse with
    neighbouring words; this can leave runs of spaces in the result.

    Args:
        text: The email text as a ``str``.

    Returns:
        The cleaned text as a ``str``.
    """
    text = _MONEY_RE.sub(' moneynumber ', text)
    text = _NUMBER_RE.sub(' number ', text)
    return _SPECIAL_CHAR_RE.sub('', text)


# Run the cleaning function on every email and overwrite the 'text' column with the result
data['text'] = data['text'].apply(replace_numbers_and_remove_punctuations)


## Tokenize the email text


In [50]:
def tokenize_email_words(text):
    """Split one email's text into a list of word tokens.

    Thin wrapper around ``nltk.word_tokenize`` so the step has a descriptive
    name when passed to ``Series.apply``.

    Args:
        text: The email text as a ``str``.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    word_tokens = nltk.word_tokenize(text)
    return word_tokens

# Replace each email string with its list of word tokens.
# NOTE: this overwrites data['text'], so the cell is not re-runnable on its own:
# a second run would hand the token lists (not strings) to tokenize_email_words.
data['text'] = data['text'].apply(tokenize_email_words)
data.head()

Unnamed: 0,text,spam
0,"[naturally, irresistible, your, corporate, ide...",1
1,"[the, stock, trading, gunslinger, fanny, is, m...",1
2,"[unbelievable, new, homes, made, easy, im, wan...",1
3,"[number, color, printing, special, request, ad...",1
4,"[do, not, have, money, get, software, cds, fro...",1


## Remove stop words


In [51]:
# Holds the stop-word set after first use, so the corpus is read once
# instead of once per email when the function is applied row by row.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def stop_words_removal(tokens):
    """Return the tokens that are not English stop words.

    The comparison is case-insensitive (each token is lowercased before the
    lookup), but surviving tokens are returned with their original casing and
    in their original order.

    Args:
        tokens: An iterable of ``str`` tokens for one email.

    Returns:
        A new list containing only the non-stop-word tokens.
    """
    stop_words = _english_stop_words()
    return [token for token in tokens if token.lower() not in stop_words]

# Drop stop words from every email's token list, overwriting the 'text' column
data['text']= data['text'].apply(stop_words_removal)
data.head()

Unnamed: 0,text,spam
0,"[naturally, irresistible, corporate, identity,...",1
1,"[stock, trading, gunslinger, fanny, merrill, m...",1
2,"[unbelievable, new, homes, made, easy, im, wan...",1
3,"[number, color, printing, special, request, ad...",1
4,"[money, get, software, cds, software, compatib...",1


## Lemmatize the text


In [52]:
# Create a single WordNet lemmatizer instance at module level; lemmatize_text
# below reuses it for every token rather than constructing one per call.
lemmatizer = nltk.WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Map an NLTK part-of-speech tag to the matching WordNet POS constant.

    Only the first letter of the tag decides the result: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.

    Args:
        tag: A POS tag string, e.g. one produced by ``nltk.pos_tag``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``.
    """
    prefix_to_wordnet_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns
    return wordnet.NOUN

def lemmatize_text(tokens):
    """Lemmatize a list of tokens, using each token's part of speech.

    The tokens are POS-tagged as one sequence with ``nltk.pos_tag``; each tag
    is converted with ``get_wordnet_pos`` and passed to the shared
    ``lemmatizer`` together with its token.

    Args:
        tokens: A list of ``str`` tokens for one email.

    Returns:
        A list of lemmas, one per input token, in the same order.
    """
    lemmas = []
    for word, tag in nltk.pos_tag(tokens):
        wordnet_pos = get_wordnet_pos(tag)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return lemmas

# Lemmatize every email's token list and overwrite the 'text' column with the result
data['text']= data['text'].apply(lemmatize_text)
data.head()

Unnamed: 0,text,spam
0,"[naturally, irresistible, corporate, identity,...",1
1,"[stock, trading, gunslinger, fanny, merrill, m...",1
2,"[unbelievable, new, home, make, easy, im, want...",1
3,"[number, color, print, special, request, addit...",1
4,"[money, get, software, cd, software, compatibi...",1


In [53]:
# Save the processed DataFrame to a CSV file; index=False leaves the row index out.
# NOTE(review): 'text' holds Python lists at this point, so each cell is written as
# the list's string form (e.g. "['money', 'get']") and reads back as a plain string.
# Confirm that whatever loads this file parses it back (e.g. with ast.literal_eval).
data.to_csv('processed_emails.csv', index=False)
