In [43]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib as plot
%matplotlib inline

In [44]:
# data = pd.read_csv('enron_spam_data.csv')[['Message', 'Spam/Ham']].dropna().head(10)
data = pd.read_csv('enron_spam_data.csv')[['Message', 'Spam/Ham']].dropna()

In [45]:
np_arr = data.to_numpy()

In [46]:
mails = pd.DataFrame(np_arr, columns=['message', 'label'])

In [47]:
mails.head()

Unnamed: 0,message,label
0,"gary , production from the high island larger ...",ham
1,- calpine daily gas nomination 1 . doc,ham
2,fyi - see note below - already done .\nstella\...,ham
3,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham
4,"jackie ,\nsince the inlet to 3 river plant is ...",ham


In [48]:
# Function to remove punctuations
import string

def remove_punc(text):
     return "".join([char for char in text if char not in string.punctuation])

mails["message"] = mails["message"].apply(lambda x: remove_punc(x))

In [49]:
mails.head(10)

Unnamed: 0,message,label
0,gary production from the high island larger b...,ham
1,calpine daily gas nomination 1 doc,ham
2,fyi see note below already done \nstella\n ...,ham
3,fyi \n forwarded by lauri...,ham
4,jackie \nsince the inlet to 3 river plant is s...,ham
5,george \ni need the following done \njan 13\nz...,ham
6,fyi\n forwarded by gary l...,ham
7,there are two fields of gas that i am having d...,ham
8,thanks so much for the memo i would like to r...,ham
9,the purpose of the email is to recap the kicko...,ham


In [50]:
# function to remove \n
def remove_newline(text):
    return ''.join(text.splitlines())

mails["message"] = mails["message"].apply(lambda x: remove_newline(x))

In [51]:
mails.head(10)

Unnamed: 0,message,label
0,gary production from the high island larger b...,ham
1,calpine daily gas nomination 1 doc,ham
2,fyi see note below already done stella ...,ham
3,fyi forwarded by lauri a...,ham
4,jackie since the inlet to 3 river plant is shu...,ham
5,george i need the following done jan 13zero ou...,ham
6,fyi forwarded by gary l p...,ham
7,there are two fields of gas that i am having d...,ham
8,thanks so much for the memo i would like to r...,ham
9,the purpose of the email is to recap the kicko...,ham


In [52]:
# tokenization
import re

def text_tokenize(text):
    return re.split('\W+', text)

mails['text_tokenized'] = mails['message'].apply(lambda x: text_tokenize(x))

In [53]:
mails.head(10)

Unnamed: 0,message,label,text_tokenized
0,gary production from the high island larger b...,ham,"[gary, production, from, the, high, island, la..."
1,calpine daily gas nomination 1 doc,ham,"[, calpine, daily, gas, nomination, 1, doc]"
2,fyi see note below already done stella ...,ham,"[fyi, see, note, below, already, done, stella,..."
3,fyi forwarded by lauri a...,ham,"[fyi, forwarded, by, lauri, a, allen, hou, ect..."
4,jackie since the inlet to 3 river plant is shu...,ham,"[jackie, since, the, inlet, to, 3, river, plan..."
5,george i need the following done jan 13zero ou...,ham,"[george, i, need, the, following, done, jan, 1..."
6,fyi forwarded by gary l p...,ham,"[fyi, forwarded, by, gary, l, payne, hou, ect,..."
7,there are two fields of gas that i am having d...,ham,"[there, are, two, fields, of, gas, that, i, am..."
8,thanks so much for the memo i would like to r...,ham,"[thanks, so, much, for, the, memo, i, would, l..."
9,the purpose of the email is to recap the kicko...,ham,"[the, purpose, of, the, email, is, to, recap, ..."


In [54]:
# https://stackabuse.com/removing-stop-words-from-strings-in-python/
# stop words
my_stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [55]:
# remove stop words
def remove_stopwords(token):
    return [word for word in token if word not in my_stopwords]

In [56]:
mails["none_stopwords"] = mails['text_tokenized'].apply(lambda x: remove_stopwords(x))

In [57]:
mails.head(10)

Unnamed: 0,message,label,text_tokenized,none_stopwords
0,gary production from the high island larger b...,ham,"[gary, production, from, the, high, island, la...","[gary, production, high, island, larger, block..."
1,calpine daily gas nomination 1 doc,ham,"[, calpine, daily, gas, nomination, 1, doc]","[, calpine, daily, gas, nomination, 1, doc]"
2,fyi see note below already done stella ...,ham,"[fyi, see, note, below, already, done, stella,...","[fyi, see, note, already, done, stella, forwar..."
3,fyi forwarded by lauri a...,ham,"[fyi, forwarded, by, lauri, a, allen, hou, ect...","[fyi, forwarded, lauri, allen, hou, ect, 12, 1..."
4,jackie since the inlet to 3 river plant is shu...,ham,"[jackie, since, the, inlet, to, 3, river, plan...","[jackie, since, inlet, 3, river, plant, shut, ..."
5,george i need the following done jan 13zero ou...,ham,"[george, i, need, the, following, done, jan, 1...","[george, need, following, done, jan, 13zero, 0..."
6,fyi forwarded by gary l p...,ham,"[fyi, forwarded, by, gary, l, payne, hou, ect,...","[fyi, forwarded, gary, l, payne, hou, ect, 12,..."
7,there are two fields of gas that i am having d...,ham,"[there, are, two, fields, of, gas, that, i, am...","[two, fields, gas, difficulty, unifysystem, 1,..."
8,thanks so much for the memo i would like to r...,ham,"[thanks, so, much, for, the, memo, i, would, l...","[thanks, much, memo, would, like, reiterate, s..."
9,the purpose of the email is to recap the kicko...,ham,"[the, purpose, of, the, email, is, to, recap, ...","[purpose, email, recap, kickoff, meeting, held..."


In [58]:
# remove stopwords with nltk library
# ==================================
# import nltk
# stopwords = nltk.corpus.stopwords.words("english")

# def remove_stopwords(token):
#     text = [word for word in token if word not in stopwords]
#     return text

# mails['none_stopwords'] = mails['text_tokenized'].apply(lambda x: remove_stopwords(x))
# data.head()

In [59]:
# Stemming and Lemmatization
# https://www.turing.com/kb/stemming-vs-lemmatization-in-python

import nltk

s = nltk.PorterStemmer()
def stemming(token):
    return [s.stem(word) for word in token]

l = nltk.stem.WordNetLemmatizer()

def lemmatizer(token):
    return [l.lemmatize(word) for word in token]

mails["stemmed_tokens"] = mails['none_stopwords'].apply(lambda x: stemming(x))
mails["lemmatized_tokens"] = mails["stemmed_tokens"].apply(lambda x: lemmatizer(x))

In [61]:
mails.head(10)

Unnamed: 0,message,label,text_tokenized,none_stopwords,stemmed_tokens,lemmatized_tokens
0,gary production from the high island larger b...,ham,"[gary, production, from, the, high, island, la...","[gary, production, high, island, larger, block...","[gari, product, high, island, larger, block, 1...","[gari, product, high, island, larger, block, 1..."
1,calpine daily gas nomination 1 doc,ham,"[, calpine, daily, gas, nomination, 1, doc]","[, calpine, daily, gas, nomination, 1, doc]","[, calpin, daili, ga, nomin, 1, doc]","[, calpin, daili, ga, nomin, 1, doc]"
2,fyi see note below already done stella ...,ham,"[fyi, see, note, below, already, done, stella,...","[fyi, see, note, already, done, stella, forwar...","[fyi, see, note, alreadi, done, stella, forwar...","[fyi, see, note, alreadi, done, stella, forwar..."
3,fyi forwarded by lauri a...,ham,"[fyi, forwarded, by, lauri, a, allen, hou, ect...","[fyi, forwarded, lauri, allen, hou, ect, 12, 1...","[fyi, forward, lauri, allen, hou, ect, 12, 14,...","[fyi, forward, lauri, allen, hou, ect, 12, 14,..."
4,jackie since the inlet to 3 river plant is shu...,ham,"[jackie, since, the, inlet, to, 3, river, plan...","[jackie, since, inlet, 3, river, plant, shut, ...","[jacki, sinc, inlet, 3, river, plant, shut, 10...","[jacki, sinc, inlet, 3, river, plant, shut, 10..."
5,george i need the following done jan 13zero ou...,ham,"[george, i, need, the, following, done, jan, 1...","[george, need, following, done, jan, 13zero, 0...","[georg, need, follow, done, jan, 13zero, 012, ...","[georg, need, follow, done, jan, 13zero, 012, ..."
6,fyi forwarded by gary l p...,ham,"[fyi, forwarded, by, gary, l, payne, hou, ect,...","[fyi, forwarded, gary, l, payne, hou, ect, 12,...","[fyi, forward, gari, l, payn, hou, ect, 12, 14...","[fyi, forward, gari, l, payn, hou, ect, 12, 14..."
7,there are two fields of gas that i am having d...,ham,"[there, are, two, fields, of, gas, that, i, am...","[two, fields, gas, difficulty, unifysystem, 1,...","[two, field, ga, difficulti, unifysystem, 1, c...","[two, field, ga, difficulti, unifysystem, 1, c..."
8,thanks so much for the memo i would like to r...,ham,"[thanks, so, much, for, the, memo, i, would, l...","[thanks, much, memo, would, like, reiterate, s...","[thank, much, memo, would, like, reiter, suppo...","[thank, much, memo, would, like, reiter, suppo..."
9,the purpose of the email is to recap the kicko...,ham,"[the, purpose, of, the, email, is, to, recap, ...","[purpose, email, recap, kickoff, meeting, held...","[purpos, email, recap, kickoff, meet, held, ye...","[purpos, email, recap, kickoff, meet, held, ye..."


In [62]:
body_text_lenth = pd.DataFrame(columns=['raw', 'clean'])
body_text_lenth['raw'] = mails['text_tokenized'].apply(lambda x: len(x))
body_text_lenth['clean'] = mails['lemmatized_tokens'].apply(lambda x: len(x))

In [63]:
body_text_lenth.head()

Unnamed: 0,raw,clean
0,409,339
1,7,7
2,159,129
3,151,110
4,95,59
