In [1]:
# Import Statements
import pandas as pd
import numpy as np
from os import walk
from os.path import join
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

In [2]:
# File paths

# Input directories handed to make_df below: the spam directories are labelled 1, the ham directories 0
Spam_1_path = 'spam'
Spam_2_path = 'spam_2'
Ham_1_path = 'easy_ham'
Ham_2_path = 'easy_ham_2'
# Destination CSV for the combined email dataframe
full_email_data_csv = 'full_email_data.csv'
# NOTE(review): the next four names are not referenced in the cells visible here — confirm they are still needed
word_id_file = 'word_id_file.csv'
training_data_file = 'train-data.txt'
testing_data_file = 'test-data.txt'
test_file = 'test_spam.txt'
# Size of the Word_ID range used for the top-words vocabulary
top_words_size = 4000
# Destination CSV for the top-words dataframe
top_words_file = 'topwords.csv'
# Destination text files for the grouped sparse train / test matrices
sparse_train_file = 'sparse_train_grouped.txt'
sparse_test_file = 'sparse_test_grouped.txt'




# Getting the email body

In [5]:
# Function that gets the email body 

def get_email_body(path):
    """Yield ``(email_body, filename)`` for every file found under ``path``.

    The directory tree is walked recursively. Each file is read as latin-1
    text; everything after the first empty line (the header/body separator)
    is treated as the body. A file with no empty line yields an empty body.

    path : directory to walk.

    yields : tuple ``(email_body, filename)`` where ``filename`` is the bare
        file name (no directory part).
    """
    for root, dirnames, filenames in walk(path):
        for filename in filenames:
            file_path = join(root, filename)
            is_body = False
            lines = []
            # The context manager closes the handle before the next file is opened,
            # so walking thousands of emails does not leak file descriptors.
            with open(file_path, encoding='latin-1') as file:
                for line in file:
                    if is_body:
                        lines.append(line)
                    elif line == '\n':
                        is_body = True
            # Lines keep their own trailing newline, so joining with '\n' doubles
            # the line breaks; kept as-is because downstream cells were built on it.
            email_body = '\n'.join(lines)
            yield email_body, filename

    

# Make Dataframe with classification, email body and filename

In [6]:
def make_df(path, classification):
    """Build a dataframe of the emails under ``path``, all given one label.

    path : directory passed to get_email_body.
    classification : label stored in the CLASSIFICATION column of every row.

    returns : DataFrame indexed by file name with columns EMAIL_BODY and
        CLASSIFICATION.
    """
    emails = list(get_email_body(path))
    rows = [
        {'EMAIL_BODY': body, 'CLASSIFICATION': classification}
        for body, _ in emails
    ]
    labels = [name for _, name in emails]
    return pd.DataFrame(rows, index=labels)
        


In [7]:
# Make dataframe from all the email (spam and nonspam)
# pd.concat is used instead of DataFrame.append, which was removed in pandas 2.0;
# labels: 1 = spam, 0 = ham
spam_df = pd.concat([make_df(Spam_1_path, 1), make_df(Spam_2_path, 1)])
ham_df = pd.concat([make_df(Ham_1_path, 0), make_df(Ham_2_path, 0)])

In [8]:
full_email_df = pd.concat([spam_df, ham_df])

In [9]:
full_email_df.shape

(5800, 2)

In [10]:
full_email_df.head()

Unnamed: 0,EMAIL_BODY,CLASSIFICATION
00249.5f45607c1bffe89f60ba1ec9f878039a,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",1
00373.ebe8670ac56b04125c25100a36ab0510,ATTENTION: This is a MUST for ALL Computer Use...,1
00214.1367039e50dc6b7adb0f2aa8aba83216,This is a multi-part message in MIME format.\n...,1
00210.050ffd105bd4e006771ee63cabc59978,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,1
00033.9babb58d9298daa2963d4f514193d7d6,This is the bottom line. If you can GIVE AWAY...,1


# Remove files that do not contain email body

In [11]:
# Removing files that are not emails
# Rows whose body is empty are not real messages (in this corpus they are the 'cmds' files).
# Dropping by the computed labels removes every row sharing such a file name and keeps
# the cell re-runnable: a second run finds nothing to drop instead of raising KeyError.
empty_body_files = full_email_df[full_email_df.EMAIL_BODY.str.len() == 0].index
full_email_df.drop(empty_body_files, inplace=True)

In [12]:
full_email_df[full_email_df.EMAIL_BODY.str.len() == 0].index

Index([], dtype='object')

# Generate document id

In [13]:
document_id = list(range(0,len(full_email_df.index)))

In [14]:
document_id

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [15]:
# Adding document ID and File name to the dataframe
full_email_df['Document_ID'] = document_id
full_email_df['File_Name'] = full_email_df.index

In [16]:
# Setting document ID as the index of the dataframe
full_email_df = full_email_df.set_index('Document_ID')

# Probability of Spam email in the full dataset

In [139]:

prior = (full_email_df.CLASSIFICATION == 1).sum() / full_email_df.shape[0]

full_email_df

Unnamed: 0_level_0,EMAIL_BODY,CLASSIFICATION,File_Name
Document_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",1,00249.5f45607c1bffe89f60ba1ec9f878039a
1,ATTENTION: This is a MUST for ALL Computer Use...,1,00373.ebe8670ac56b04125c25100a36ab0510
2,This is a multi-part message in MIME format.\n...,1,00214.1367039e50dc6b7adb0f2aa8aba83216
3,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,1,00210.050ffd105bd4e006771ee63cabc59978
4,This is the bottom line. If you can GIVE AWAY...,1,00033.9babb58d9298daa2963d4f514193d7d6
...,...,...,...
5791,"I'm one of the 30,000 but it's not working ver...",0,00609.dd49926ce94a1ea328cce9b62825bc97
5792,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,0,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",0,01127.841233b48eceb74a825417d8d918abf8
5794,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",0,01178.5c977dff972cd6eef64d4173b90307f0


In [18]:
# Exporting dataframe as a CSV file
full_email_df.to_csv(full_email_data_csv)

In [19]:
# NOTE(review): this module-level soup appears unused — clean_email_body builds its own local one
soup = BeautifulSoup()


# Shared stemmer instance reused by clean_email_body for every token
stemmer = PorterStemmer()

# Function for Cleaning the email body
def clean_email_body(email_body):
    """Return the stemmed, filtered word list for one email body.

    Steps: strip HTML markup, lower-case the text, tokenize it, drop English
    stop words and any token that is not purely alphabetic, then stem what
    remains with the module-level PorterStemmer.

    email_body : raw body text (may contain HTML).

    returns : list of stemmed words in their original order (duplicates kept).
    """
    soup = BeautifulSoup(email_body, 'html.parser')
    # The corpus reader returns a list; a set makes each membership test
    # constant-time instead of a scan over the whole stop-word list.
    stop_words = set(stopwords.words('english'))
    no_html_email = soup.getText().lower()
    words_in_body = word_tokenize(no_html_email)

    return [
        stemmer.stem(word)
        for word in words_in_body
        if word not in stop_words and word.isalpha()
    ]


#### Testing the clean body function

In [20]:
msd = 'HEY SUMAN. how is your life going, I am doing fine here. To be or not to be. I am happy and doing and walking well and with pleasure here '

In [21]:
clean_email_body(msd)

['hey', 'suman', 'life', 'go', 'fine', 'happi', 'walk', 'well', 'pleasur']

In [22]:
%%time
nested_series = full_email_df.EMAIL_BODY.apply(clean_email_body)







" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup


CPU times: user 57 s, sys: 1.66 s, total: 58.6 s
Wall time: 1min 26s


In [23]:
nested_series

Document_ID
0       [dear, homeown, interest, rate, lowest, point,...
1       [attent, must, comput, user, packag, deal, nor...
2       [messag, mime, format, dare, tri, find, better...
3       [import, inform, new, domain, name, final, ava...
4       [bottom, line, give, away, cd, free, peopl, li...
                              ...                        
5791    [one, work, well, week, te, updat, server, syn...
5792    [damien, morton, quot, approv, html, abl, say,...
5793    [mon, che, wrote, that, correct, line, ad, rep...
5794    [upon, time, manfr, wrote, would, like, instal...
5795    [run, pick, use, new, ftoc, button, show, mess...
Name: EMAIL_BODY, Length: 5796, dtype: object

In [24]:
doc_ids_spam = full_email_df[full_email_df.CLASSIFICATION == 1].index

In [25]:
doc_ids_spam

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895],
           dtype='int64', name='Document_ID', length=1896)

In [26]:
doc_ids_ham = full_email_df[full_email_df.CLASSIFICATION == 0].index

In [32]:
nested_ham_series = nested_series.loc[doc_ids_ham]

In [33]:
nested_spam_series = nested_series.loc[doc_ids_spam]


In [34]:
# Generate the words from the list

In [40]:
list_of_words = [item for subseries in nested_series for item in subseries]


# Get all the unique words and use .value_counts() to get their occurrences

In [41]:
unique_words = pd.Series(list_of_words).value_counts()

In [112]:
unique_words

http               10662
use                 5017
list                4852
email               4370
get                 4187
                   ...  
heater                 1
consumerprotect        1
anton                  1
palac                  1
efit                   1
Length: 27305, dtype: int64

# Top 4000 words

In [113]:
# value_counts() already sorted the words by descending frequency, so a positional
# slice keeps the most common ones; the size comes from the shared config constant
# rather than a repeated literal.
frequent_words = unique_words.iloc[:top_words_size]
print('Most common words')
print(frequent_words[:10])

Most common words
http     10662
use       5017
list      4852
email     4370
get       4187
mail      3985
one       3905
free      3171
time      3090
work      2880
dtype: int64


# Top 4000 words' dataframe

In [45]:
# One Word_ID per retained word. The range is sized from frequent_words itself so the
# index always matches the data length, even if the corpus has fewer distinct words
# than the configured vocabulary size.
word_id = list(range(len(frequent_words)))
top_4k = pd.DataFrame({'Top_Words': frequent_words.index.values}, index=word_id)
top_4k.index.name='Word_ID'

In [46]:
top_4k

Unnamed: 0_level_0,Top_Words
Word_ID,Unnamed: 1_level_1
0,http
1,use
2,list
3,email
4,get
...,...
3995,bite
3996,horror
3997,consent
3998,olson


In [47]:
# Save the top words dataframe as a csv file
top_4k.to_csv(top_words_file, index_label=top_4k.index.name, header=top_4k.Top_Words.name)

# Generating a full matrix (one word per cell; documents with fewer words are padded with empty cells)

## Dataframe from Nested Series where each word has a column
### Convert the nested Series (type: Series) into a python list and turn it into a pandas dataframe

In [51]:

word_columns_df = pd.DataFrame.from_records(nested_series.to_list())

In [52]:
word_columns_df.index.name='Document_ID'
word_columns_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
Document_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,dear,homeown,interest,rate,lowest,point,year,help,find,best,...,,,,,,,,,,
1,attent,must,comput,user,packag,deal,norton,systemwork,softwar,suit,...,,,,,,,,,,
2,messag,mime,format,dare,tri,find,better,annuiti,guarante,year,...,,,,,,,,,,
3,import,inform,new,domain,name,final,avail,gener,public,discount,...,,,,,,,,,,
4,bottom,line,give,away,cd,free,peopl,like,one,month,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5791,one,work,well,week,te,updat,server,sync,messag,jordan,...,,,,,,,,,,
5792,damien,morton,quot,approv,html,abl,say,feel,rockal,time,...,,,,,,,,,,
5793,mon,che,wrote,that,correct,line,ad,repositori,instal,still,...,,,,,,,,,,
5794,upon,time,manfr,wrote,would,like,instal,rpm,tri,get,...,,,,,,,,,,


In [53]:
word_columns_df.shape

(5796, 7671)

# Split data with SCIKIT learn into train and test dataset

In [86]:
X_train, X_test, y_train, y_test = train_test_split(word_columns_df, full_email_df.CLASSIFICATION, test_size=0.3, random_state=42)

In [87]:
X_test.shape

(1739, 7671)

In [88]:
X_train.index.name=X_test.index.name = 'Document_ID'
X_train

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
Document_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4844,thu,jul,rodent,unusu,size,wrote,appli,one,three,order,...,,,,,,,,,,
4727,well,would,vaniti,list,forward,least,littl,pure,data,week,...,,,,,,,,,,
5022,world,wide,word,issu,saturday,august,sent,saturday,subscrib,least,...,,,,,,,,,,
3504,believ,spamassassin,maintain,scheme,wherebi,corpu,distribut,ie,sever,peopl,...,,,,,,,,,,
3921,sorri,think,send,realiz,list,sender,usual,anyawi,ask,harri,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,anyon,els,experi,http,also,seem,bug,tracker,sf,use,...,,,,,,,,,,
5191,glm,rest,premis,fundament,differ,glm,critter,us,thu,glm,...,,,,,,,,,,
5226,thu,aug,matthia,saou,wrote,none,build,part,export,cv,...,,,,,,,,,,
5390,good,day,zenn,fri,aug,zenn,wrote,see,valid,need,...,,,,,,,,,,


In [89]:
y_train.head()

Document_ID
4844    0
4727    0
5022    0
3504    0
3921    0
Name: CLASSIFICATION, dtype: int64

# Creating a sparse matrix for the training data
# The sparse matrix records how many times each word occurs in each document, storing only the words that actually occur (no zero entries)

In [120]:
word_index = pd.Index(top_4k.Top_Words)
word_index.shape

(4000,)

In [92]:
word_index.get_loc('http')

0

In [127]:
# Function to make a sparse matrix
def make_sparse_matrix(df, word_index, classification):
    """
    Build a long-format ("sparse") table of vocabulary-word occurrences.

    df : full matrix with one row per document (row label = Document_ID) and
        one word per cell, e.g. X_train or X_test.
    word_index : pandas Index holding the vocabulary words; a word's position,
        as returned by .get_loc(), is used as its Word_ID.
    classification : Series mapping Document_ID to its label, e.g. y_train or
        y_test. Labels are copied through unchanged (this notebook labels
        spam as 1 and non-spam as 0).

    returns: a pandas dataframe with one row per occurrence of a vocabulary
        word, with columns Classification, Document_ID, Word_ID and Occurence
        (always 1). Cells whose value is not in word_index, including the
        padding of short documents, are skipped.
    """
    dict_list = []
    # itertuples(name=None) yields plain tuples (row label first, then the cells in
    # column order), visiting cells row by row without a per-cell .iat lookup.
    for doc_id, *words in df.itertuples(name=None):
        for word in words:
            if word in word_index:
                word_position = word_index.get_loc(word)
                category = classification.at[doc_id]
                dict_list.append({
                    'Classification': category,
                    'Document_ID': doc_id,
                    'Word_ID': word_position,
                    'Occurence': 1,
                })

    return pd.DataFrame(dict_list)
    


In [125]:
sparse_train_matrix = make_sparse_matrix(X_train, word_index, y_train)

In [126]:
sparse_train_matrix

Unnamed: 0,Classification,Document_ID,Word_ID,Occurence
0,0,4844,395,1
1,0,4844,495,1
2,0,4844,2408,1
3,0,4844,496,1
4,0,4844,37,1
...,...,...,...,...
477446,1,860,126,1
477447,1,860,650,1
477448,1,860,517,1
477449,1,860,1391,1


In [96]:
# Sum the per-word rows so each (document, word, label) combination appears once
# with its total count, then turn the group keys back into ordinary columns
sparse_train_grouped = (
    sparse_train_matrix
    .groupby(['Document_ID', 'Word_ID', 'Classification'])
    .sum()
    .reset_index()
)

In [134]:
sparse_train_grouped[sparse_train_grouped.Occurence == 2]

Unnamed: 0,Document_ID,Word_ID,Classification,Occurence
0,0,0,1,2
15,0,105,1,2
21,0,222,1,2
44,1,0,1,2
46,1,3,1,2
...,...,...,...,...
286938,5794,16,0,2
286973,5794,458,0,2
287004,5795,15,0,2
287033,5795,1545,0,2


In [98]:
# Save sparse_train_grouped as a csv file

In [99]:
# NOTE(review): this writes CSV text to sparse_train_file, the same path that a later
# np.savetxt call in this notebook overwrites — confirm which format is intended
sparse_train_grouped.to_csv(sparse_train_file )

In [102]:
# Sparse Matrix for test data

In [103]:
sparse_test_matrix = make_sparse_matrix(X_test, word_index, y_test)

In [104]:
# Sum the per-word rows so each (document, word, label) combination appears once
# with its total count, then turn the group keys back into ordinary columns
sparse_test_grouped = (
    sparse_test_matrix
    .groupby(['Document_ID', 'Word_ID', 'Classification'])
    .sum()
    .reset_index()
)

In [105]:
sparse_test_grouped

Unnamed: 0,Document_ID,Word_ID,Classification,Occurence
0,8,7,1,1
1,8,8,1,1
2,8,12,1,1
3,8,19,1,2
4,8,26,1,1
...,...,...,...,...
119469,5793,1129,0,1
119470,5793,1266,0,1
119471,5793,2006,0,1
119472,5793,3258,0,1


## Save the sparse data for X_train and X_test

In [110]:
np.savetxt(sparse_test_file, sparse_test_grouped, fmt='%d')

In [111]:
np.savetxt(sparse_train_file, sparse_train_grouped, fmt='%d')