## Step 1
### Process Text

In [1]:
# pass the absolute path of the lexicon file to this program
# example call:
# nancymacpath = 
#    "/Users/njmccrac/AAAdocs/research/subjectivitylexicon/hltemnlp05clues/subjclueslen1-HLTEMNLP05.tff"
# SL = readSubjectivity(nancymacpath
dirPath = '/Users/wa3/Syracuse/Term6/NLP/Final/FinalProjectData/EmailSpamCorpora/corpus'

In [2]:
'''
  This program shell reads email data for the spam classification problem.
  The input to the program is the path to the Email directory "corpus" and a limit number.
  The program reads the first limit number of ham emails and the first limit number of spam.
  It creates an "emaildocs" variable with a list of emails consisting of a pair
    with the list of tokenized words from the email and the label either spam or ham.
  It prints a few example emails.
  Your task is to generate features sets and train and test a classifier.

  Usage:  python classifySPAM.py  <corpus directory path> <limit number>
'''
# open python and nltk packages needed for processing
import os
import sys
import random
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist

### Load All Emails

In [3]:
# function to read spam and ham files, train and test a classifier 
def processspamham(dirPath,limitStr):
  # convert the limit argument from a string to an int
  limit = int(limitStr)

In [207]:
processspamham(dirPath, 1500)
# start lists for spam and ham email texts
limit = 1500
#allfiles = 3672
hamtexts = []
spamtexts = []
os.chdir(dirPath)
  # process all files in directory that end in .txt up to the limit
  #    assuming that the emails are sufficiently randomized
for file in os.listdir("./spam"):
    if (file.endswith(".txt")) and (len(spamtexts) < limit):
      # open file for reading and read entire file into a string
      f = open("./spam/"+file, 'r', encoding="latin-1")
      spamtexts.append (f.read())
      f.close()
for file in os.listdir("./ham"):
    if (file.endswith(".txt")) and (len(hamtexts) < limit):
      # open file for reading and read entire file into a string
      f = open("./ham/"+file, 'r', encoding="latin-1")
      hamtexts.append (f.read())
      f.close()

In [208]:
  # print number emails read
print ("Number of spam files:",len(spamtexts))
print ("Number of ham files:",len(hamtexts))

Number of spam files: 1500
Number of ham files: 1500


In [209]:
# create list of mixed spam and ham email documents as (list of words, label)
emaildocs = []
  # add all the spam
for spam in spamtexts:
    tokens = nltk.word_tokenize(spam)
    emaildocs.append((tokens, 'spam'))
  # add all the regular emails
for ham in hamtexts:
    tokens = nltk.word_tokenize(ham)
    emaildocs.append((tokens, 'ham'))

In [210]:
# randomize the list
random.Random(123).shuffle(emaildocs)

In [211]:
# print a few token lists
for email in emaildocs[0]:
    print (email)

['Subject', ':', 'accrued', 'thru', 'nov', '.', '19', 'daren', ',', 'please', 'take', 'a', 'look', 'at', 'the', 'attached', 'spreadsheet', ',', 'we', 'have', 'over', '500', ',', '000', 'mmbtu', 'as', 'unaccounted', 'for', '.', '.', '.', 'o', "'", 'neal', '3', '-', '9686']
ham


In [212]:
# get all words from all emails and put into a frequency distribution
# note lowercase, but no stemming or stopwords
all_words_list = [word for (sent,cat) in emaildocs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))

44574


#### Started with 1500 words, 2000 words = higher accuracy
4000 is peak accuracy at 96.1, 5000 = no change, 6000 starts to bring down accuracy with 95.9

In [213]:
# get the 1500 most frequently appearing keywords in the corpus
word_items = all_words.most_common(4000)
word_features = [word for (word,count) in word_items]
test_features = [word for (word,count) in emaildocs]

#### Top 50 Words in Email Corpus before Pre-Processing 

In [214]:
#word_items[0:50]
print((word_features[0:10]))
#print(test_features[0:50])

['-', '.', ',', '/', ':', 'the', 'to', 'and', 'of', 'a']


We can see that the most common words in the corpus are actually symbols and punctuation marks.  It will be interesting to see how these symbols and punctuations affect predictions of spam vs ham.  I would think that these might actually help with prediction based on my personal experience with spam emails.  It seems that spam emails in my inbox typically contain random symbols.  This will be something worth experiementing with in the future.  

### Baseline Features

In [215]:
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['{}'.format(word)] = (word in document_words)
    return features

In [216]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d, c) in emaildocs]

#### Code for Word Count Beginning

In [222]:
# define features (keywords) of a document for a BOW/unigram baseline
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['{}'.format(word)] = word_features.count(word)
    #word_items
    #for word in word_features:
        #features['{}'.format(word)] = word_features.count(word)
    return features

In [235]:
# get features sets for a document, including keyword features and category feature
#featuresets = [(document_features(d, all_words_list), c) for (d, c) in emaildocs]

#### Create train/test set & classifier

In [218]:
# training using naive Baysian classifier, training set is roughly 80% of the data
train_set, test_set = featuresets[600:], featuresets[:600]
classifier = nltk.NaiveBayesClassifier.train(train_set)

#### 80/20 Split of Data

In [219]:
print(len(featuresets))
print(len(train_set))
print(len(test_set))

3000
2400
600


In [220]:
# evaluate the accuracy of the classifier
nltk.classify.accuracy(classifier, test_set)

0.9816666666666667

#### Evaluation Measures

In [112]:
## cross-validation ##
# this function takes the number of folds, the feature sets
# it iterates over the folds, using different sections for training and testing in turn
#   it prints the accuracy for each fold and the average accuracy at the end
def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    accuracy_list = []
    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)

In [113]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 5
cross_validation_accuracy(num_folds, featuresets)

Each fold size: 600
0 0.9816666666666667
1 0.9533333333333334
2 0.9533333333333334
3 0.9366666666666666
4 0.9716666666666667
mean accuracy 0.9593333333333334


In [114]:
goldlist = []
predictedlist = []
for (features, label) in test_set:
    	goldlist.append(label)
    	predictedlist.append(classifier.classify(features))

In [115]:
# look at the first 10 examples
print(goldlist[:15])
print(predictedlist[:15])

cm = nltk.ConfusionMatrix(goldlist, predictedlist)
print(cm.pretty_format(sort_by_count=True, truncate=9))

['ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam']
['ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam']
     |   s     |
     |   p   h |
     |   a   a |
     |   m   m |
-----+---------+
spam |<312>  1 |
 ham |  10<277>|
-----+---------+
(row = reference; col = test)



In [116]:
# Function to compute precision, recall and F1 for each label
#  and for any number of labels
# Input: list of gold labels, list of predicted labels (in same order)
# Output:  prints precision, recall and F1 for each label
def eval_measures(gold, predicted):
    # get a list of labels
    labels = list(set(gold))
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        recall = TP / (TP + FP)
        precision = TP / (TP + FN)
        recall_list.append(recall)
        precision_list.append(precision)
        F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    print('\tPrecision\tRecall\t\tF1')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

In [117]:
# call the function with our data
eval_measures(goldlist, predictedlist)

	Precision	Recall		F1
ham 	      0.965      0.996      0.981
spam 	      0.997      0.969      0.983


In [221]:
print(classifier.show_most_informative_features(10))

Most Informative Features
               forwarded = True              ham : spam   =    197.3 : 1.0
                     hou = True              ham : spam   =    186.3 : 1.0
                     nom = True              ham : spam   =    117.8 : 1.0
                     ect = True              ham : spam   =    115.3 : 1.0
                    2001 = True              ham : spam   =     71.7 : 1.0
              nomination = True              ham : spam   =     68.8 : 1.0
                     713 = True              ham : spam   =     64.9 : 1.0
            prescription = True             spam : ham    =     50.8 : 1.0
                     bob = True              ham : spam   =     48.0 : 1.0
                  farmer = True              ham : spam   =     45.8 : 1.0
None


### PreProcess With Removing Stop Words, Symbols, and Punctuation

In [119]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['http','%','$','_','#','|','&','*'
                  ,',', '.', '-', '/', ':', '``', '`'
                  , "'", "...", '--', '@', 'for','of'
                  ,'?',')','(','>','=',';','!',])

In [120]:
stop_all_words_list = [word for (sent, cat) in emaildocs for word in sent if word not in stopwords]
stop_all_words = nltk.FreqDist(stop_all_words_list)
print(len(stop_all_words))

44399


In [121]:
# get the 1500 most frequently appearing keywords in the corpus
stop_word_items = stop_all_words.most_common(4000)
stop_word_features = [word for (word,count) in stop_word_items]

In [122]:
def stop_document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['{}'.format(word)] = (word in document_words)
    return features

In [123]:
# get features sets for a document, including keyword features and category feature
stop_featuresets = [(stop_document_features(d, stop_word_features), c) for (d, c) in emaildocs]

In [124]:
# training using naive Baysian classifier, training set is roughly 80% of the data
train_set, test_set = stop_featuresets[600:], stop_featuresets[:600]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [125]:
# evaluate the accuracy of the classifier
nltk.classify.accuracy(classifier, test_set)

0.9516666666666667

In [126]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 5
cross_validation_accuracy(num_folds, stop_featuresets)

Each fold size: 600
0 0.9516666666666667
1 0.9333333333333333
2 0.9233333333333333
3 0.9216666666666666
4 0.9383333333333334
mean accuracy 0.9336666666666668


In [127]:
goldlist = []
predictedlist = []
for (features, label) in test_set:
    	goldlist.append(label)
    	predictedlist.append(classifier.classify(features))

In [128]:
# look at the first 30 examples
print(goldlist[:30])
print(predictedlist[:30])

cm = nltk.ConfusionMatrix(goldlist, predictedlist)
print(cm.pretty_format(sort_by_count=True, truncate=9))

['ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam']
['ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam']
     |   s     |
     |   p   h |
     |   a   a |
     |   m   m |
-----+---------+
spam |<296> 17 |
 ham |  12<275>|
-----+---------+
(row = reference; col = test)



In [129]:
# Function to compute precision, recall and F1 for each label
#  and for any number of labels
# Input: list of gold labels, list of predicted labels (in same order)
# Output:  prints precision, recall and F1 for each label
def eval_measures(gold, predicted):
    # get a list of labels
    labels = list(set(gold))
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        recall = TP / (TP + FP)
        precision = TP / (TP + FN)
        recall_list.append(recall)
        precision_list.append(precision)
        F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    print('\tPrecision\tRecall\t\tF1')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

In [130]:
# call the function with our data
eval_measures(goldlist, predictedlist)

	Precision	Recall		F1
ham 	      0.958      0.942      0.950
spam 	      0.946      0.961      0.953


Using 1500 words = 94.5 accuracy; trying 4000 = 95.1; 2000 = 95

In [131]:
print(classifier.show_most_informative_features(10))

Most Informative Features
               forwarded = True              ham : spam   =    197.3 : 1.0
                     hou = True              ham : spam   =    186.3 : 1.0
                     nom = True              ham : spam   =    117.8 : 1.0
                     ect = True              ham : spam   =    115.3 : 1.0
                    2001 = True              ham : spam   =     71.7 : 1.0
              nomination = True              ham : spam   =     68.8 : 1.0
                     713 = True              ham : spam   =     64.9 : 1.0
            prescription = True             spam : ham    =     50.8 : 1.0
                     bob = True              ham : spam   =     48.0 : 1.0
                  farmer = True              ham : spam   =     45.8 : 1.0
None


### Bigrams with Unigrams

In [135]:
# set up for using bigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [136]:
# create the bigram finder on all the words in sequence
print(all_words_list[:50])
finder = BigramCollocationFinder.from_words(all_words_list)

['Subject', ':', 'accrued', 'thru', 'nov', '.', '19', 'daren', ',', 'please', 'take', 'a', 'look', 'at', 'the', 'attached', 'spreadsheet', ',', 'we', 'have', 'over', '500', ',', '000', 'mmbtu', 'as', 'unaccounted', 'for', '.', '.', '.', 'o', "'", 'neal', '3', '-', '9686', 'Subject', ':', 'dewbre', 'petroleum', 'vance', ',', 'the', 'following', 'deal', 'is', 'not', 'on', 'you']


In [137]:
# define the top 500 bigrams using the chi squared measure
bigram_features = finder.nbest(bigram_measures.chi_sq, 1000)
print(bigram_features[:50])
print(len(bigram_features))

[('\x11', '63884831'), ('0184', 'bowy'), ('04005', 'jnkex'), ('04607001', 'easttrans'), ('0920', '8774937918'), ('0986315', 'shawnee'), ('0986511', 'huff'), ('1010', 'liverpool'), ('12278', 'hnbgd'), ('126360', '6742'), ('133168', 'gsf'), ('1361', 'comiats'), ('139067', 'danex'), ('1399', 'matagorda'), ('1447', 'bingolineprocessing'), ('17611', 'loring'), ('1767', 'clemmons'), ('17832', 'andl'), ('19436', 'auburndale'), ('19707', 'rackingham'), ('2073256', 'aebdb'), ('2086', 'slkhta'), ('2286', 'pxlsy'), ('238', '3164'), ('2392', '222215'), ('24679', 'coa'), ('2900', 'wilcrest'), ('3066', 'implications'), ('327964', '9844'), ('334', 'ndwsmkrsvpjlfygao'), ('357461850', '760844164'), ('361', '689894'), ('378232683', '15867384'), ('384', 'gosj'), ('3881', 'uosda'), ('4005', 'belen'), ('40937', 'bedford'), ('4120', '93883'), ('4135576', 'eaa'), ('415086', 'hlu'), ('416501', 'bvlmhht'), ('4234', 'frvklspqwhwrcu'), ('44000', 'nantes'), ('4501', 'shetland'), ('452601', 'pringle'), ('45337672'

In [138]:
# define features that include words as before 
# add the most frequent significant bigrams
# this function takes the list of words in a document as an argument and returns a feature dictionary
# it depends on the variables word_features and bigram_features
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

In [139]:
# use this function to create feature sets for all sentences
bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in emaildocs]

In [140]:
# train a classifier and report accuracy
train_set, test_set = bigram_featuresets[1000:], bigram_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.97

In [141]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 5
cross_validation_accuracy(num_folds, bigram_featuresets)

Each fold size: 600
0 0.9816666666666667
1 0.9533333333333334
2 0.9533333333333334
3 0.9366666666666666
4 0.9716666666666667
mean accuracy 0.9593333333333334


In [142]:
# Function to compute precision, recall and F1 for each label
#  and for any number of labels
# Input: list of gold labels, list of predicted labels (in same order)
# Output:  prints precision, recall and F1 for each label
def eval_measures(gold, predicted):
    # get a list of labels
    labels = list(set(gold))
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        recall = TP / (TP + FP)
        precision = TP / (TP + FN)
        recall_list.append(recall)
        precision_list.append(precision)
        F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    print('\tPrecision\tRecall\t\tF1')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

In [143]:
goldlist = []
predictedlist = []
for (features, label) in test_set:
    	goldlist.append(label)
    	predictedlist.append(classifier.classify(features))

In [144]:
# look at the first 30 examples
print(goldlist[:30])
print(predictedlist[:30])

cm = nltk.ConfusionMatrix(goldlist, predictedlist)
print(cm.pretty_format(sort_by_count=True, truncate=9))

['ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam']
['ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam']
     |   s     |
     |   p   h |
     |   a   a |
     |   m   m |
-----+---------+
spam |<493>  8 |
 ham |  22<477>|
-----+---------+
(row = reference; col = test)



In [145]:
# call the function with our data
eval_measures(goldlist, predictedlist)

	Precision	Recall		F1
ham 	      0.956      0.984      0.970
spam 	      0.984      0.957      0.970


In [553]:
#print(bclassifier.show_most_informative_features(50))

### Bigrams Only

In [146]:
def bigram_document_features(document, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

In [147]:
# use this function to create feature sets for all sentences
bigram_featuresets = [(bigram_document_features(d, bigram_features), c) for (d, c) in emaildocs]

In [148]:
# train a classifier and report accuracy
train_set, test_set = bigram_featuresets[1000:], bigram_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.499

In [566]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 5
cross_validation_accuracy(num_folds, bigram_featuresets)

Each fold size: 1034
0 0.7088974854932302
1 0.688588007736944
2 0.730174081237911
3 0.718568665377176
4 0.7040618955512572
mean accuracy 0.7100580270793037


In [617]:
goldlist = []
predictedlist = []
for (features, label) in test_set:
    	goldlist.append(label)
    	predictedlist.append(classifier.classify(features))

In [618]:
cm = nltk.ConfusionMatrix(goldlist, predictedlist)
print(cm.pretty_format(sort_by_count=True, truncate=9))

     |       s |
     |   h   p |
     |   a   a |
     |   m   m |
-----+---------+
 ham |<708>  . |
spam | 292  <.>|
-----+---------+
(row = reference; col = test)



In [620]:
# call the function with our data
eval_measures(goldlist, predictedlist)

ZeroDivisionError: division by zero

In [619]:
print(classifier.show_most_informative_features(10))

Most Informative Features
            B__63884831 = False             ham : spam   =      1.0 : 1.0
         B_118532_101473 = False            spam : ham    =      1.0 : 1.0
    B_bwbllfxg_paprcfwzm = False            spam : ham    =      1.0 : 1.0
       B_amoy_shrewsbury = False            spam : ham    =      1.0 : 1.0
          B_cdexeg_fcenz = False            spam : ham    =      1.0 : 1.0
B_boron_clemsonprimordial = False            spam : ham    =      1.0 : 1.0
          B_ckpiw_eqgslv = False            spam : ham    =      1.0 : 1.0
             B_2578_5767 = False            spam : ham    =      1.0 : 1.0
          B_985853_riser = False            spam : ham    =      1.0 : 1.0
          B_bluebush_nod = False            spam : ham    =      1.0 : 1.0
None


### Part-of-Speech Featureset

In [149]:
# create list of mixed spam and ham email documents as (list of words, label)
POSemaildocs = []
  # add all the spam
for spam in spamtexts:
    tokens = nltk.tokenize.sent_tokenize(spam)
    POSemaildocs.append((tokens, 'spam'))
  # add all the regular emails
for ham in hamtexts:
    tokens = nltk.tokenize.sent_tokenize(ham)
    POSemaildocs.append((tokens, 'ham'))

In [150]:
# randomize the list
random.Random(123).shuffle(POSemaildocs)

In [151]:
# print a few token lists
for email in POSemaildocs[:1]:
    print (email)

(['Subject: accrued thru nov .', '19\ndaren ,\nplease take a look at the attached spreadsheet , we have over 500 , 000 mmbtu as\nunaccounted for .', '.', '.', "o ' neal 3 - 9686"], 'ham')


In [152]:
stop_all_words_list = [word for (sent, cat) in POSemaildocs for word in sent if word not in stopwords]
stop_all_words = nltk.FreqDist(stop_all_words_list)
print(len(stop_all_words))

27740


In [153]:
# get all words from all emails and put into a frequency distribution
# note lowercase, but no stemming or stopwords
pall_words_list = [word for (sent,cat) in POSemaildocs for word in sent]
pall_words = nltk.FreqDist(all_words_list)
print(len(pall_words))

44574


In [462]:
# get the 1500 most frequently appearing keywords in the corpus
word_items = stop_all_words.most_common(2000)
word_features = [word for (word,count) in word_items]

In [165]:
# get the 1500 most frequently appearing keywords in the corpus
word_items = pall_words.most_common(4000)
word_features = [word for (word,count) in word_items]

In [166]:
# this function takes a document list of words and returns a feature dictionary
# it runs the default pos tagger (the Stanford tagger) on the document
#   and counts 4 types of pos tags to use as features
def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [196]:
# define feature sets using this function
POS_featuresets = [(POS_features(d, stop_all_words_list), c) for (d, c) in POSemaildocs]
# number of features for document 0
print(len(POS_featuresets[0][0].keys()))

27744


In [197]:
# the first sentence
print(POSemaildocs[1])
# the pos tag features for this sentence
print('num nouns', POS_featuresets[1][0]['nouns'])
print('num verbs', POS_featuresets[1][0]['verbs'])
print('num adjectives', POS_featuresets[1][0]['adjectives'])
print('num adverbs', POS_featuresets[1][0]['adverbs'])

(['Subject: dewbre petroleum\nvance ,\nthe following deal is not on you spreadsheet nor is it on the prebid list that daren maintains :\ndeal # counterparty meter #\n137595 dewbre petr .', '9662\nis this a good deal ?', 'bob'], 'ham')
num nouns 2
num verbs 0
num adjectives 0
num adverbs 0


In [194]:
# train and test the classifier
ptrain_set, ptest_set = POS_featuresets[600:], POS_featuresets[:600]
pclassifier = nltk.NaiveBayesClassifier.train(ptrain_set)
nltk.classify.accuracy(pclassifier, ptest_set)

0.6366666666666667

In [195]:
print(pclassifier.show_most_informative_features(10))

Most Informative Features
      contains(thanks .) = True              ham : spam   =     63.0 : 1.0
         contains(www .) = True             spam : ham    =     27.6 : 1.0
         contains(net .) = True             spam : ham    =     27.6 : 1.0
           contains(r .) = True             spam : ham    =     20.6 : 1.0
contains(http : / / www .) = True             spam : ham    =     16.2 : 1.0
          contains(html) = True             spam : ham    =     11.9 : 1.0
           contains(j .) = True              ham : spam   =     11.4 : 1.0
   contains(thank you !) = True              ham : spam   =      8.8 : 1.0
           contains(m .) = True              ham : spam   =      7.2 : 1.0
         contains(com /) = True             spam : ham    =      7.2 : 1.0
None


### Subjectivity

In [172]:
from nltk.corpus import sentence_polarity

In [173]:
def readSubjectivity(path):
    flexicon = open(path, 'r')
    # initialize an empty dictionary
    sldict = { }
    for line in flexicon:
        fields = line.split()   # default is to split on whitespace
        # split each field on the '=' and keep the second part as the value
        strength = fields[0].split("=")[1]
        word = fields[2].split("=")[1]
        posTag = fields[3].split("=")[1]
        stemmed = fields[4].split("=")[1]
        polarity = fields[5].split("=")[1]
        if (stemmed == 'y'):
            isStemmed = True
        else:
            isStemmed = False
        # put a dictionary entry with the word as the keyword
        #     and a list of the other values
        sldict[word] = [strength, posTag, isStemmed, polarity]
    return sldict

In [174]:
SLpath = "/Users/wa3/Syracuse/Term6/NLP/Week8/subjclueslen1-HLTEMNLP05.tff"

In [175]:
SL = readSubjectivity(SLpath)

In [176]:
len(SL.keys())

6885

In [177]:
def SL_features(document, SL):
    document_words = set(document)
    features = {}
   
    # count variables for the 4 classes of subjectivity
    weakPos = 0
    strongPos = 0
    weakNeg = 0
    strongNeg = 0
    for word in document_words:
        if word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
            features['positivecount'] = weakPos + (2 * strongPos)
            features['negativecount'] = weakNeg + (2 * strongNeg)      
    return features

In [178]:
SL_featuresets = [(SL_features(d, SL), c) for (d, c) in emaildocs]

In [179]:
# show just the two sentiment lexicon features in document 0
print(SL_featuresets[0][0]['positivecount'])
print(SL_featuresets[0][0]['negativecount'])

2
0


In [180]:
# this gives the label of document 0
print(SL_featuresets[0][1])
# number of features for document 0
len(SL_featuresets[0][0].keys())

ham


2

In [181]:
# retrain the classifier using these features
sltrain_set, sltest_set = SL_featuresets[600:], SL_featuresets[:600]
slclassifier = nltk.NaiveBayesClassifier.train(sltrain_set)
nltk.classify.accuracy(slclassifier, sltest_set)

0.6616666666666666

In [183]:
print(len(SL_featuresets))
print(len(sltrain_set))
print(len(sltest_set))

3000
2400
600


In [186]:
print(slclassifier.show_most_informative_features(25))

Most Informative Features
           negativecount = 16               spam : ham    =      8.5 : 1.0
           negativecount = 15               spam : ham    =      6.5 : 1.0
           positivecount = 30               spam : ham    =      6.5 : 1.0
           negativecount = 8                spam : ham    =      6.4 : 1.0
           negativecount = 10               spam : ham    =      5.4 : 1.0
           negativecount = 12               spam : ham    =      4.7 : 1.0
           negativecount = 7                spam : ham    =      4.4 : 1.0
           positivecount = 27               spam : ham    =      4.4 : 1.0
           negativecount = 18               spam : ham    =      3.9 : 1.0
           positivecount = 37               spam : ham    =      3.7 : 1.0
           positivecount = 24               spam : ham    =      3.6 : 1.0
           positivecount = 28               spam : ham    =      3.5 : 1.0
           negativecount = 5                spam : ham    =      3.3 : 1.0

### Negation

In [198]:
negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

In [199]:
def NOT_features(document, negationwords):
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = False
        features['V_NOT{}'.format(word)] = False
    # go through document words in order
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['V_NOT{}'.format(document[i])] = (document[i] in word_features)
        else:
            features['V_{}'.format(word)] = (word in word_features)
    return features

In [200]:
# define the feature sets
NOT_featuresets = [(NOT_features(d, negationwords), c) for (d, c) in emaildocs]
# show the values of a couple of example features


In [201]:
train_set, test_set = NOT_featuresets[1000:], NOT_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.976

In [162]:
print(classifier.show_most_informative_features(25))

Most Informative Features
             V_forwarded = True              ham : spam   =    165.3 : 1.0
                   V_hou = True              ham : spam   =    156.7 : 1.0
                   V_ect = True              ham : spam   =     97.2 : 1.0
                   V_nom = True              ham : spam   =     96.8 : 1.0
                  V_2001 = True              ham : spam   =     59.7 : 1.0
                   V_713 = True              ham : spam   =     58.2 : 1.0
                    V_cc = True              ham : spam   =     39.9 : 1.0
                V_farmer = True              ham : spam   =     39.8 : 1.0
                   V_bob = True              ham : spam   =     39.7 : 1.0
          V_prescription = True             spam : ham    =     38.4 : 1.0
                     V_| = True             spam : ham    =     37.7 : 1.0
            V_compliance = True             spam : ham    =     35.1 : 1.0
                 V_vance = True              ham : spam   =     34.3 : 1.0

### Step 3b: SkLearn Time

In [230]:
import sys
import pandas
import numpy
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [228]:
training_set, testing_set = featuresets[600:], featuresets[:600]

In [229]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

DecisionTree_classifier = SklearnClassifier(DecisionTreeClassifier())
DecisionTree_classifier.train(training_set)
print("DecisionTree_classifier accuracy percent:", (nltk.classify.accuracy(DecisionTree_classifier, testing_set))*100)

MNB_classifier accuracy percent: 97.83333333333334
BernoulliNB_classifier accuracy percent: 98.0




LogisticRegression_classifier accuracy percent: 97.66666666666667
SGDClassifier_classifier accuracy percent: 97.5




SVC_classifier accuracy percent: 91.83333333333333
LinearSVC_classifier accuracy percent: 97.33333333333334




NuSVC_classifier accuracy percent: 96.33333333333334
DecisionTree_classifier accuracy percent: 95.16666666666667


In [408]:
goldlist = []
predictedlist = []
for (features, label) in test_set:
    	goldlist.append(label)
    	predictedlist.append(LogisticRegression_classifier.classify(features))

In [231]:
goldlist = []
predictedlist = []
for (features, label) in test_set:
    	goldlist.append(label)
    	predictedlist.append(BernoulliNB_classifier.classify(features))

In [232]:
# look at the first 30 examples
print(goldlist[:30])
print(predictedlist[:30])

cm = nltk.ConfusionMatrix(goldlist, predictedlist)
print(cm.pretty_format(sort_by_count=True, truncate=9))

['ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam']
['ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam']
     |   s     |
     |   p   h |
     |   a   a |
     |   m   m |
-----+---------+
spam |<313>  . |
 ham |  12<275>|
-----+---------+
(row = reference; col = test)



In [233]:
# Function to compute precision, recall and F1 for each label
#  and for any number of labels
# Input: list of gold labels, list of predicted labels (in same order)
# Output:  prints precision, recall and F1 for each label
def eval_measures(gold, predicted):
    # get a list of labels
    labels = list(set(gold))
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        recall = TP / (TP + FP)
        precision = TP / (TP + FN)
        recall_list.append(recall)
        precision_list.append(precision)
        F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    print('\tPrecision\tRecall\t\tF1')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

In [234]:
# call the function with our data
eval_measures(goldlist, predictedlist)

	Precision	Recall		F1
ham 	      0.958      1.000      0.979
spam 	      1.000      0.963      0.981
