In [29]:
import os
import codecs
import random
import nltk
from nltk import word_tokenize
from nltk import NaiveBayesClassifier, classify
from nltk.text import Text

In [18]:
def read_in(folder):
    """Read every non-hidden file in ``folder`` and return their contents.

    Args:
        folder: Path of the directory to read. A trailing path separator
            is optional.

    Returns:
        A list of strings, one per file, ordered by file name. Files are
        decoded as ISO-8859-1 and undecodable bytes are ignored.
    """
    a_list = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result stable across runs and machines.
    for a_file in sorted(os.listdir(folder)):
        # Skip entries whose name starts with a dot (hidden files).
        if a_file.startswith("."):
            continue
        # os.path.join works whether or not `folder` ends with a separator.
        path = os.path.join(folder, a_file)
        # The context manager closes the file even if read() raises.
        with codecs.open(path, "r", encoding="ISO-8859-1", errors="ignore") as f:
            a_list.append(f.read())
    return a_list

In [19]:
# Load the spam corpus, then print its size and its first email as a quick sanity check.
spam_list = read_in("spam/")
print(len(spam_list))
print(spam_list[0])
# Same for the ham (non-spam) corpus.
ham_list = read_in("ham/")
print(len(ham_list))
print(ham_list[0])

1500
Subject: what up,, your cam babe
What are you looking for?
If your looking for a companion for friendship, love, a date, or just good ole'
Fashioned * * * * * *, then try our brand new site; it was developed and created
To help anyone find what they' re looking for. A quick bio form and you' re
On the road to satisfaction in every sense of the word.... No matter what
That may be!
Try it out and youll be amazed.
Have a terrific time this evening
Copy and pa ste the add. Ress you see on the line below into your browser to come to the site.
Http:// www. Meganbang. Biz/bld/acc /
No more plz
Http:// www. Naturalgolden. Com/retract /
Counterattack aitken step preemptive shoehorn scaup. Electrocardiograph movie honeycomb. Monster war brandywine pietism byrne catatonia. Encomia lookup intervenor skeleton turn catfish.

3672
Subject: ena sales on hpl
Just to update you on this project' s status:
Based on a new report that scott mills ran for me from sitara, I have come up
With the followin

In [20]:
# Combine spam and ham emails into (text, label) tuples: all spam first, then all ham.
all_emails = [
    (email_content, label)
    for label, corpus in (("spam", spam_list), ("ham", ham_list))
    for email_content in corpus
]

In [21]:
# Shuffle the data in place. The fixed seed makes the shuffle repeatable for a
# given input order; note that re-running this cell shuffles the already
# shuffled list again.
random.seed(42)
random.shuffle(all_emails)
print(f"Dataset size = {len(all_emails)} emails")

Dataset size = 5172 emails


In [24]:
#preprocessing
#lower-case and tokenize (stop words and punctuation tokens are kept as features)

def get_features(text):
    """Turn a text into a bag-of-words feature dictionary.

    The text is lower-cased and tokenized with ``word_tokenize``; every
    distinct token becomes a key mapped to ``True``. Token counts are not
    recorded, only presence.

    Args:
        text: The raw email text.

    Returns:
        A dict ``{token: True}`` with one entry per distinct token.
    """
    tokens = word_tokenize(text.lower())
    return {token: True for token in tokens}

# Convert every (email, label) pair into a (feature dict, label) pair.
all_features = [(get_features(content), tag) for content, tag in all_emails]

# Sanity-check the extractor on a short example.
print(get_features("Participate In Our New Lottery NOW!"))

# Dataset size, then the number of distinct tokens in the 1st and the 200th email.
print(len(all_features))
for position in (0, 199):
    print(len(all_features[position][0]))

{'participate': True, 'in': True, 'our': True, 'new': True, 'lottery': True, 'now': True, '!': True}
5172
38
154


In [26]:
#Split into train and test sets (e.g. 80% train) and train the classification algorithm

def train(features, proportion):
    """Split ``features`` into a training and a test part and fit a Naive Bayes classifier.

    Args:
        features: Sliceable sequence of labelled feature sets, as accepted by
            ``NaiveBayesClassifier.train``.
        proportion: Fraction of ``features``, taken from the front, used for
            training; the remainder becomes the test set.

    Returns:
        A tuple ``(train_set, test_set, classifier)``.
    """
    # Everything before the cutoff index trains the model, the rest tests it.
    cutoff = int(len(features) * proportion)
    train_set = features[:cutoff]
    test_set = features[cutoff:]
    print(f"Training set size = {len(train_set)} emails")
    print(f"Test set size = {len(test_set)} emails")
    # Fit the classifier on the training portion only.
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier


In [27]:
# Train on the first 80% of the shuffled data; the remaining 20% is held out for testing.
train_set, test_set, classifier = train(all_features, proportion=0.8)

Training set size = 4137 emails
Test set size = 1035 emails


In [28]:
#Evaluate performance

def evaluate(train_set, test_set, classifier, num_features=50):
    """Print the classifier's accuracy on both sets and its most informative features.

    Args:
        train_set: Labelled feature sets the classifier was trained on.
        test_set: Labelled feature sets to evaluate on.
        classifier: Trained classifier; it must work with ``classify.accuracy``
            and provide ``show_most_informative_features``.
        num_features: How many of the most informative features to print.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        None. All results are printed.
    """
    # check how the classifier performs on the training and test sets
    print(f"Accuracy on the training set = {classify.accuracy(classifier, train_set)}")
    print(f"Accuracy on the test set = {classify.accuracy(classifier, test_set)}")
    # check which words are most informative for the classifier (feature importance)
    classifier.show_most_informative_features(num_features)

evaluate(train_set, test_set, classifier)

Accuracy on the training set = 0.9615663524292966
Accuracy on the test set = 0.936231884057971
Most Informative Features
               forwarded = True              ham : spam   =    200.5 : 1.0
                    2004 = True             spam : ham    =    148.6 : 1.0
                     nom = True              ham : spam   =    125.8 : 1.0
                    pain = True             spam : ham    =    103.6 : 1.0
                    spam = True             spam : ham    =     92.4 : 1.0
                  health = True             spam : ham    =     81.1 : 1.0
                     sex = True             spam : ham    =     79.5 : 1.0
                     ect = True              ham : spam   =     75.7 : 1.0
              nomination = True              ham : spam   =     74.8 : 1.0
                   super = True             spam : ham    =     74.7 : 1.0
                featured = True             spam : ham    =     73.1 : 1.0
                creative = True             spam : ham

In [30]:
#check context

def concordance(data_list, search_word):
    """Print a concordance of ``search_word`` for every email that contains it.

    Args:
        data_list: Iterable of raw email texts.
        search_word: The word to look up. Matching ignores case.

    Returns:
        None. The concordance lines are printed by ``Text.concordance``.
    """
    # Emails are lower-cased before tokenizing, so the query must be
    # lower-cased too; otherwise a query such as "Stocks" never matches.
    needle = search_word.lower()
    for email in data_list:
        word_list = word_tokenize(email.lower())
        if needle in word_list:
            # Build the Text wrapper only for emails that contain the word.
            Text(word_list).concordance(needle)


# Show the contexts of "stocks" in the ham emails first, then in the spam emails.
for heading, corpus in (("STOCKS in HAM:", ham_list), ("\n\nSTOCKS in SPAM:", spam_list)):
    print(heading)
    concordance(corpus, "stocks")

STOCKS in HAM:
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ad my portfolio is diversified into stocks that have lost even more money than


STOCKS in SPAM:
Displaying 3 of 3 matches:
report reveals this smallcap rocket stocks newsletter first we would like to s
his email pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this email . none o
Displaying 3 of 3 matches:
might occur . as with many microcap stocks , today ' s company has additional 
is emai | pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this emai | . none 
Displaying 6 of

Displaying 1 of 1 matches:
the | ast 12 months , many of these stocks made triple and even quadruple retu
Displaying 1 of 1 matches:
cautions that small and micro - cap stocks are high - risk investments and tha
Displaying 4 of 4 matches:
k tuesday some of these littie voip stocks have been reaily moving lateiy . an
 statements . as with many microcap stocks , today ' s company has additional 
is report pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this report . none 
Displaying 2 of 2 matches:
ck monday some of these little voip stocks have been realiy moving lately . an
 one trade monday ! go ypil . penny stocks are considered highiy specuiative a
Displaying 2 of 2 matches:
 % on regular price we have massive stocks of drugs for same day dispatch fast
e do have the lowest price and huge stocks ready for same - day dispatch . two
Displaying 2 of 2 matches:
his email pertaining to investing , stocks , sec

In [31]:
#Predict

# Hand-written examples of each class.
test_spam_list = ["Participate in our new lottery!", "Try out this new medicine"]
test_ham_list = [
    "See the minutes from the last meeting attached",
    "Investors are coming to our office on Monday",
]

# Label the examples the same way as the main dataset: spam first, then ham.
test_emails = [
    (email_content, label)
    for label, examples in (("spam", test_spam_list), ("ham", test_ham_list))
    for email_content in examples
]

# Extract features with the same function that was used for the training data.
new_test_set = [(get_features(content), tag) for content, tag in test_emails]

# Note: evaluate() also reports the training-set accuracy and the
# informative-feature table again.
evaluate(train_set, new_test_set, classifier)

Accuracy on the training set = 0.9615663524292966
Accuracy on the test set = 1.0
Most Informative Features
               forwarded = True              ham : spam   =    200.5 : 1.0
                    2004 = True             spam : ham    =    148.6 : 1.0
                     nom = True              ham : spam   =    125.8 : 1.0
                    pain = True             spam : ham    =    103.6 : 1.0
                    spam = True             spam : ham    =     92.4 : 1.0
                  health = True             spam : ham    =     81.1 : 1.0
                     sex = True             spam : ham    =     79.5 : 1.0
                     ect = True              ham : spam   =     75.7 : 1.0
              nomination = True              ham : spam   =     74.8 : 1.0
                   super = True             spam : ham    =     74.7 : 1.0
                featured = True             spam : ham    =     73.1 : 1.0
                creative = True             spam : ham    =     71.5

In [32]:
#list predicted labels:

# Spam examples first, then ham; each email is followed by its predicted label.
for email in test_spam_list + test_ham_list:
    print(email)
    print(classifier.classify(get_features(email)))

Participate in our new lottery!
spam
Try out this new medicine
spam
See the minutes from the last meeting attached
ham
Investors are coming to our office on Monday
ham


In [33]:
#Interactive:

# Keep classifying typed-in emails until the user submits an empty line.
while True:
    email = input("Type in your email here (or press 'Enter'): ")
    if not email:
        break
    prediction = classifier.classify(get_features(email))
    print(f"This email is likely {prediction}\n")

Type in your email here (or press 'Enter'): Claim your inheritance@
This email is likely spam

Type in your email here (or press 'Enter'): Claim your inheritance!
This email is likely spam

Type in your email here (or press 'Enter'): Congratulations!! Pay raise!
This email is likely spam

Type in your email here (or press 'Enter'): Next meeting at 7am!!!
This email is likely ham

Type in your email here (or press 'Enter'): 
