In [None]:
  import os
import pandas as pd 
import numpy as np 
import nltk
from nltk import FreqDist, word_tokenize, bigrams
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder, TrigramAssocMeasures, TrigramCollocationFinder
from nltk.corpus import PlaintextCorpusReader, stopwords
from nltk.text import Text 
from sklearn.model_selection import train_test_split
#from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
sns.set(style = 'white')
sns.set(style = 'whitegrid', color_codes = True)
import random
import re

In [None]:
# corpus locations; both are relative paths, so they resolve against the current working directory
spamDir = 'corpus/spam' # directory of SPAM emails
hamDir = 'corpus/ham' # directory of HAM emails

# generator over every file found beneath a directory
# param directory: root directory to walk (os.walk also descends into its subdirectories)
# yields the absolute path of each file
def absoluteFilePaths(directory):
    for root, _subdirs, names in os.walk(directory):
        yield from (os.path.abspath(os.path.join(root, name)) for name in names)

# function to read every .txt email under a directory and tokenize it (baseline processing)
# param directory: directory holding the emails (subdirectories are searched as well)
# param label: either 'spam' or 'ham'; attached unchanged to every tokenized email
# returns texts (list of raw email strings) and documents (list of (tokens, label) tuples)
def process(directory, label):
    # keep only the .txt files, in the order the directory walk produces them
    txt_paths = [path for path in absoluteFilePaths(directory) if path.endswith(".txt")]
    texts = []
    for path in txt_paths:
        # read the whole email into one string, decoding it as latin-1
        with open(path, 'r', encoding = 'latin-1') as fin:
            texts.append(fin.read())
    # pair each tokenized email with the label supplied by the caller
    documents = [(word_tokenize(text), label) for text in texts]
    return texts, documents


# function to compute basic corpus statistics for either spam or ham
# param texts: a list of strings where each element is email text
# param documents: a list of tuples where the first item of each tuple is the tokenized email text
# prints email-level averages followed by corpus-level totals; returns nothing
# an empty corpus prints a notice instead of raising, and an email with no tokens
# contributes a lexical richness of 0 instead of causing a division by zero
def getstats(texts, documents):
    text_list = list(texts)
    doc_list = [doc[0] for doc in documents]
    if not text_list or not doc_list:
        print("No emails to compute statistics for")
        return
    num_docs = len(doc_list)
    print("Email level statistics:\n")
    # the three averages below are truncated to whole numbers for display
    avg_chars = int(sum(len(t) for t in text_list) / len(text_list))
    print("Average number of characters per email: {:d}".format(avg_chars))
    avg_words = int(sum(len(doc) for doc in doc_list) / num_docs)
    print("Average number of words per email: {:d}".format(avg_words))
    avg_vocab = int(sum(len(set(doc)) for doc in doc_list) / num_docs)
    print("Average vocabulary size per email: {:d}".format(avg_vocab))
    # lexical richness = unique words / total words; a token-less email counts as 0
    avg_lex_rich = sum((len(set(doc)) / len(doc)) if doc else 0.0 for doc in doc_list) / num_docs
    print("Average lexical richness per email: {:.2f}".format(avg_lex_rich))
    print("\nCorpus level statistics:\n")
    # flatten every email into one token list for the corpus-wide figures
    words = [w for doc in doc_list for w in doc]
    vocab = set(words)
    print("Total number of words: {:d}".format(len(words)))
    print("Total vocabulary size: {:d}".format(len(vocab)))
    # both ratios fall back to 0 when the corpus contains no tokens at all
    total_lex_rich = len(vocab) / len(words) if words else 0.0
    print("Total lexical richness: {:.2f}".format(total_lex_rich))
    avg_word_len = sum(len(w) for w in words) / len(words) if words else 0.0
    print("Average number of characters per word: {:.2f}".format(avg_word_len))

# function to flatten labeled documents into one token stream
# param documents: a list of tuples where the first item of each tuple is the tokenized email text
# returns a single list holding every token of every document, in document order
def getTokens(documents):
    return [token for doc in documents for token in doc[0]]


# function to print the top n bigrams and return the full scored list
# param documents: a list of tuples where the first item of each tuple is the tokenized email text
# param n: number of top-scoring bigrams to print
# param type: 'mi' (mutual information) or 'freq' (frequency, the default); any value other than 'mi' is treated as 'freq'
# prints the number of unique scored bigrams followed by the top n bigrams
# returns scored bigrams (the complete list of (bigram, score) pairs)
def getBigramDist(documents, n, type = 'freq'):
    tokens = [w.lower() for w in getTokens(documents)]
    measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens) # scorer
    finder.apply_word_filter(alpha_filter) # exclude non-alphabetic words
    # use the basic NLTK stopword list explicitly: the global name `stopwords` is the corpus
    # reader until a later cell rebinds it to a list, so relying on it made this function
    # depend on the order in which cells were run
    basic_stopwords = set(stopwords1)
    finder.apply_word_filter(lambda w: w in basic_stopwords) # exclude stop words
    if type == 'mi':
        finder.apply_freq_filter(5) # frequency filter of greater than or equal to 5
        scored = finder.score_ngrams(measures.pmi) # distribution of mutual information
        print("Top {:d} most common strongly connected bigrams out of total {:d} unique bigrams:\n".format(n, len(scored)))
    else:
        # distribution of frequency as proportion of the bigram count to count of all bigrams
        scored = finder.score_ngrams(measures.raw_freq)
        print("Top {:d} most common bigrams out of total {:d} unique bigrams:\n".format(n, len(scored)))
    print([score[0] for score in scored[:n]])
    return scored

# define stopwords
# stopwords1 is the basic list: the English stopwords shipped with NLTK, with nothing added
from nltk.corpus import stopwords
stopwords1 = stopwords.words('english')

# predicate that flags tokens containing no lower-case letter a-z at all
# param w: string word (callers lower-case it first, so upper-case-only input also matches)
# returns True when every character of w is outside a-z, False otherwise (including for '')
def alpha_filter(w):
    return re.compile('^[^a-z]+$').match(w) is not None

# function to clean a list of tokens [basic]
# param tokens: a list of strings where each element is a token
# returns a new list of lower-cased tokens with non-alphabetic tokens and
# basic stopwords (stopwords1) dropped; the input list is left untouched
def clean1(tokens):
    cleaned = []
    for raw in tokens:
        word = raw.lower()
        # skip tokens without any letter and tokens on the basic stopword list
        if alpha_filter(word) or word in stopwords1:
            continue
        cleaned.append(word)
    return cleaned



In [None]:

%%time
# read and tokenize both corpora; *Texts hold the raw email strings and
# *Docs hold (tokens, label) tuples with the label given here
spamTexts, spamDocs = process(spamDir, 'spam')
hamTexts, hamDocs = process(hamDir, 'ham')
# one document is produced per .txt file read
print("Total number of SPAM documents read: {:d}".format(len(spamDocs)))
print("Total number of HAM documents read: {:d}".format(len(hamDocs)))

## **Exploration**
- Corpus Statistics
- Visualizations

## **Corpus Statistics**

In [None]:
# print email-level and corpus-level statistics, SPAM first and then HAM
# (getstats prints no corpus name, so the order of the two reports matters when reading the output)
getstats(spamTexts, spamDocs)
getstats(hamTexts, hamDocs)

Observations:

On a per email basis, SPAM emails are generally longer than HAM emails and contain more unique words.
SPAM emails measure higher in lexical richness than HAM emails
Actions:

Add features measuring document statistics for modeling as these seem to distinguish SPAM and HAM.

In [None]:
# inspect first 50 tokens of the SPAM corpus
# (getTokens flattens every document before the slice is taken)
print(getTokens(spamDocs)[:50])


In [None]:

# inspect first 50 tokens of the HAM corpus
# (getTokens flattens every document before the slice is taken)
print(getTokens(hamDocs)[:50])

Upon examination, it is clear that both categories could mutually benefit from applying some basic cleaning transformations.

Observations:

There are non-alphabetic words in the basic tokenization
Tokens are case-sensitive, which for the purposes of SPAM detection may not be a necessity
The word 'Subject' is commonplace and may not be useful for distinguishing SPAM/HAM
Actions:

Remove non-alphabetic words
Convert tokens to lower-case
Define a Stopwords list and add words that are highly commonplace, such as 'the' and 'subject'

In [None]:

# do a basic cleaning of the original spam and ham tokens
# (lower-case, drop non-alphabetic tokens, drop the basic NLTK stopwords)
spamTokens = clean1(getTokens(spamDocs))
hamTokens = clean1(getTokens(hamDocs))

print("There are {:d} SPAM tokens".format(len(spamTokens)))
print("There are {:d} HAM tokens".format(len(hamTokens)))


## **Top 50 words by frequency - SPAM**

In [None]:
# frequency distribution over the cleaned SPAM tokens
fdistspam = FreqDist(spamTokens)
# the 50 most frequent (word, count) pairs, then their cumulative frequency plot
print(list(fdistspam.most_common(50)))
fdistspam.plot(50, cumulative = True, title = "SPAM: Cumulative Frequency of Top 50 words")

## **Top 50 words by frequency - HAM**

In [None]:
# frequency distribution over the cleaned HAM tokens
fdistham = FreqDist(hamTokens)
# the 50 most frequent (word, count) pairs, then their cumulative frequency plot
print(list(fdistham.most_common(50)))
fdistham.plot(50, cumulative = True, title = "HAM: Cumulative Frequency of Top 50 words")

Upon examination of the unigram frequency distributions for both SPAM and HAM, it is clear that the top 50 words for HAM make up a greater proportion of its total word frequency than the top 50 words for SPAM do. This was expected, as the earlier corpus statistics showed that SPAM has a more extensive vocabulary.

Observations: Distinguishing features observed for SPAM includes words like ('http', 'www'). The top 50 frequency lexicon for SPAM appears quite distinct from HAM. Distinguishing features observed for HAM includes words like 'ect' (Enron Capital and Trade) and other corporate references.

Actions:

Identify more high frequency words that are common to both SPAM and HAM

In [None]:
# Measure the overlap of high-frequency words between SPAM and HAM:
# for each n, the share of the top-n HAM words that are also top-n SPAM words
print("Top N Proportion of Word Overlap:\n")
top_n_list = [50, 100, 500, 1000, 2000, 3000]
for n in top_n_list:
    ham_top = {word for word, _count in fdistham.most_common(n)}
    spam_top = {word for word, _count in fdistspam.most_common(n)}
    print(n, len(ham_top & spam_top) / n)

 **Sample of Common Words between SPAM and HAM**

In [None]:
# words that appear in the top-n lists of both corpora
n = 100
# intersect the two top-n vocabularies; the list comes from a set, so its order is arbitrary
high_freq_common = list(set([t[0] for t in fdistham.most_common(n)]) & set([t[0] for t in fdistspam.most_common(n)]))
print("There are {:d} highly frequent words that are common to both SPAM and HAM:\n".format(len(high_freq_common)))
print(high_freq_common)



In [None]:
# Sample of least common words in SPAM and HAM
# hapaxes() returns the words that occur exactly once in a frequency distribution
print("Sample of the least common words in SPAM:\n")
print(fdistspam.hapaxes()[:50])
# the original line was missing the closing parenthesis of print(), which made the whole cell a SyntaxError
print("\nThere are {:d} hapaxes in the SPAM text".format(len(fdistspam.hapaxes())))

print("Sample of the least common words in HAM:\n")
print(fdistham.hapaxes()[:50])
print("\nThere are {:d} hapaxes in the HAM text".format(len(fdistham.hapaxes())))

Observations:

The top 50 - 100 words for SPAM and HAM seem to be mutually exclusive for the most part. However, just the top 50-100 words may be insufficient in classifying SPAM emails due to its large vocabulary, and unigram features for classification will likely need to be substantially extended
There are 26 highly frequent words that are common to both SPAM and HAM. For the purposes of this analysis, it might be beneficial to remove these words by adding them to the Stopwords list.
SPAM emails contain almost four times as many hapaxes (very uncommon words) compared to HAM emails. For the purposes of this analysis, it might be beneficial to remove the hapaxes by adding them to the Stopwords list.
Actions:

Define and add to Stopwords list: 'message', 'x', 'please', 'mail', 'could', 'like', 'us', 'gas', 'price', 'may', 'time', 'get', 'see', 'net', 'need', 'would', 'l', 'information', 'th', 'company', 'p', 'new', 'e', 'one', 'also', 'com'
These are high frequency words that are common to both SPAM and HAM; therefore these words will be added to Stopwords.

In [None]:
# extensive stopwords - includes high frequency words common to both labels and hapaxes (highly infrequent words)
from nltk.corpus import stopwords
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain list of words;
# code that runs after this cell sees the list
stopwords = stopwords.words('english')
# 'subject' plus the shared top-100 words plus every word seen only once in either corpus
addtlstopwords = ['subject'] + high_freq_common + fdistspam.hapaxes() + fdistham.hapaxes()
# plain list concatenation: a word present in more than one source is counted more than once below
stopwords2 = stopwords + addtlstopwords
print("There are a total of {:d} stopwords defined".format(len(stopwords2)))

# function to clean a list of tokens [advanced]
# param tokens: a list of strings where each element is a token
# returns a new list of lower-cased tokens with non-alphabetic tokens and
# extensive stopwords (stopwords2) dropped; the input list is left untouched
def clean2(tokens):
    # stopwords2 is a long list; build a set once so each token costs one hash lookup
    # instead of a scan over the whole list
    stop_set = set(stopwords2)
    cleaned = []
    for raw in tokens:
        word = raw.lower()
        # skip tokens without any letter and tokens on the extensive stopword list
        if alpha_filter(word) or word in stop_set:
            continue
        cleaned.append(word)
    return cleaned

# **Top 100 Bigrams by Frequency - SPAM and HAM**

In [None]:
# score every SPAM bigram by raw frequency; the call prints the top 100 and returns the full scored list
bigramSpamFreq = getBigramDist(spamDocs, n = 100, type = 'freq')

# same for HAM
bigramHamFreq = getBigramDist(hamDocs, n = 100, type = 'freq')

## **Top 100 Bigrams by Mutual Information**

In [None]:
# score bigrams by mutual information (only bigrams seen at least 5 times are scored);
# each call prints the top 100 and returns the full scored list
bigramSpamMI = getBigramDist(spamDocs, n = 100, type = 'mi')
bigramHamMI = getBigramDist(hamDocs, n = 100, type = 'mi')

**Typical Frequency Scores for SPAM and HAM bigrams**


Distributional statistics computed using mean and median bigram scores

In [None]:
# mean and median of the frequency scores; the score is the second item of each (bigram, score) pair
spam_freq_scores = [s[1] for s in bigramSpamFreq]
mean_score_freq_spam = np.mean(spam_freq_scores)
median_score_freq_spam = np.median(spam_freq_scores)
print("SPAM frequency scores:\n")
print("Mean: {:.2E} Median: {:.2E}".format(mean_score_freq_spam, median_score_freq_spam))
ham_freq_scores = [s[1] for s in bigramHamFreq]
mean_score_freq_ham = np.mean(ham_freq_scores)
median_score_freq_ham = np.median(ham_freq_scores)
print("\nHAM frequency scores:\n")
print("Mean: {:.2E} Median: {:.2E}".format(mean_score_freq_ham, median_score_freq_ham))

SPAM frequency scores:

Mean: 4.13E-06 Median: 2.81E-06

HAM frequency scores:

Mean: 3.47E-06 Median: 1.20E-06


In [None]:
# one histogram of frequency scores per corpus, SPAM first;
# `a` ends up holding the HAM scores once the loop finishes
for scored, title in (
    (bigramSpamFreq, "Histogram of SPAM Bigram Frequency Scores"),
    (bigramHamFreq, "Histogram of HAM Bigram Frequency Scores"),
):
    a = [s[1] for s in scored]
    plt.hist(a)
    plt.title(title)
    plt.show()

**Typical Mutual Information Scores for SPAM and HAM bigrams**

In [None]:

# mean and median of the mutual information scores; the score is the second item of each (bigram, score) pair
spam_mi_scores = [s[1] for s in bigramSpamMI]
mean_score_mi_spam = np.mean(spam_mi_scores)
median_score_mi_spam = np.median(spam_mi_scores)
print("SPAM MI scores:\n")
print("Mean: {:.2E} Median: {:.2E}".format(mean_score_mi_spam, median_score_mi_spam))
ham_mi_scores = [s[1] for s in bigramHamMI]
mean_score_mi_ham = np.mean(ham_mi_scores)
median_score_mi_ham = np.median(ham_mi_scores)
print("\nHAM MI scores:\n")
print("Mean: {:.2E} Median: {:.2E}".format(mean_score_mi_ham, median_score_mi_ham))

SPAM MI scores:

Mean: 1.04E+01 Median: 1.03E+01

HAM MI scores:

Mean: 8.56E+00 Median: 8.46E+00

In [None]:
# one histogram of mutual information scores per corpus, SPAM first;
# `a` ends up holding the HAM scores once the loop finishes
for scored, title in (
    (bigramSpamMI, "Histogram of SPAM Bigram Mutual Information Scores"),
    (bigramHamMI, "Histogram of HAM Bigram Mutual Information Scores"),
):
    a = [s[1] for s in scored]
    plt.hist(a)
    plt.title(title)
    plt.show()

Observations:

Bigrams for SPAM exhibited more references to retail product entities, references to selling, and non-English phrases.
Bigrams for HAM exhibited more repeated references to organization, person, and location entities.
Bigrams for SPAM were on average more frequent than HAM by a small order of magnitude; the same could be said based on mutual information scores.
Frequency scores exhibit a right-skewed distribution, whereas mutual information scores feature a more normal distribution.
Actions:

Complete lists of bigrams were extracted for modeling and feature engineering.

# **Part-of-Speech Tag Frequencies**

In [None]:
# function to print normalized frequencies of POS tags
# param documents: a flat list of token strings (it is handed straight to nltk.pos_tag)
# prints normalized frequencies for nouns, verbs, adjectives, and adverbs
# an empty token list prints a notice instead of dividing by zero
def getPosStats(documents):
    tokens = list(documents)
    if not tokens:
        print("No tokens to tag; POS frequencies are undefined")
        return
    # keep only the tag from each (token, tag) pair
    pos = [t[1] for t in nltk.pos_tag(tokens)]
    # aggregate class counts; the first letter of a tag selects the class
    # (N = noun, V = verb, J = adjective, R = adverb)
    counts = {'N': 0, 'V': 0, 'J': 0, 'R': 0}
    for tag in pos:
        if tag[:1] in counts:
            counts[tag[:1]] += 1
    # normalize class counts by the total number of tagged tokens
    total = len(pos)
    for name, key in (("Noun", 'N'), ("Verb", 'V'), ("Adjective", 'J'), ("Adverb", 'R')):
        print("Normalized {} Frequency: {:.2f}".format(name, counts[key] / total))

# tag the cleaned token lists (already lower-cased, with non-alphabetic tokens and
# basic stopwords removed by clean1); SPAM is reported first, then HAM
getPosStats(spamTokens)
getPosStats(hamTokens)

Observations:

At an aggregate level, both SPAM and HAM appear to be similar in their POS frequency distributions.
SPAM tokens contain slightly more nouns and adjectives, and fewer verbs.
Actions:

Include POS tag features in the modeling experimentation

## **Modeling Experiments**
* Feature Engineering
  - ngrams (bag of words)
    - unigrams
    - bigrams
  - word statistics (lexical richness/email, # characters/email, # words/email, mean # of characters per word)
  - POS tag features
* Modeling
  - NB NLTK
  - Multinomial NB Sci-kit Learn
  - SVM Sci-kit Learn
  - Logistic Regression Sci-kit Learn


In [None]:
# combine labeled documents
print("There are {:d} SPAM documents and {:d} HAM documents".format(len(spamDocs), len(hamDocs)))
# before shuffling, every SPAM document precedes every HAM document
documents = spamDocs + hamDocs
print("There are a total of {:d} documents".format(len(documents)))



#Randomly shuffle the documents for training and testing classifiers
# a fixed seed keeps the shuffle, and the positional train/test splits taken later, repeatable
random.seed(111)
random.shuffle(documents)
print([doc[1] for doc in documents[:20]]) # demonstrate labels have been shuffled

## **Experiment 1: Testing the Application of Stopwords**
The objective of this experiment is to test whether the application of an extensive stopwords list provides for improved classification results relative to a basic stopwords list from NLTK.

The extensive stopwords list contains 28056 words and includes:

- basic stopwords
- high-frequency words that are common to both classes
- hapaxes
Features: 3000 Unigrams

Top 3000 words based on frequency varied by the stopwords applied
Classifier: Naive Bayes Classifier from NLTK

In [None]:
# function to select the unigram vocabulary used as word features
# param documents: a list of tuples where the first item of each tuple is the tokenized email text
# param stopwords: a list of strings where each element is a stopword
# returns a list of at most 3000 strings: the most frequent lower-cased alphabetic
# non-stopword tokens across all documents, most frequent first
def getWordFeatures(documents, stopwords):
    stop_set = set(stopwords)
    kept = []
    for (email, cat) in documents:
        for word in email:
            word = word.lower()
            # keep tokens that contain a letter and are not on the stopword list
            if not alpha_filter(word) and word not in stop_set:
                kept.append(word)
    # the 3000 most frequently appearing words become the features
    return [word for (word, count) in FreqDist(kept).most_common(3000)]

# [feature definition function: experiment 1] unigram presence features for one email
# param document: a list of strings representing a tokenized email
# param word_features: a list of strings against which the tokens in document are matched
# returns a dictionary with one key 'V_<word>' per word feature, True when the word occurs in the email
def document_features(document, word_features):
    present = set(document)
    return {'V_{}'.format(word): (word in present) for word in word_features}

# function to build the (features, label) pairs for modeling in Experiment 1
# param documents: a list of (tokens, label) tuples
# param stopwords: a list of stopwords to exclude when the word features are chosen
# returns one (unigram feature dictionary, label) tuple per document, in the same order
def getFeatureSets1(documents, stopwords):
    # the vocabulary is chosen once, from all documents passed in
    vocabulary = getWordFeatures(documents, stopwords)
    featuresets = []
    for (tokens, category) in documents:
        featuresets.append((document_features(tokens, vocabulary), category))
    return featuresets

## **Feature Extraction**
- Two Unigram feature sets that either use basic or extensive stopwords lists
  - unigramBasic3000 uses 3000 word features having excluded basic stopwords (i.e. out-of-box NLTK)
  - unigramExt3000 uses 3000 word features having excluded extensive stopwords (i.e. basic stopwords, highly common keywords, hapaxes)

In [None]:
# get feature sets for documents using basic stopwords
unigramBasic3000 = getFeatureSets1(documents, stopwords1)
# split training and test sets 70/30
# NOTE(review): 3620 is a hard-coded row count, presumably 70% of the corpus - confirm against len(documents)
unigramBasic3000Train = unigramBasic3000[:3620]
unigramBasic3000Test = unigramBasic3000[3620:]

# get feature sets for documents using extensive stopwords
unigramExt3000 = getFeatureSets1(documents, stopwords2)
# split training and test sets 70/30
# same positional split as above, so both experiments train and test on the same emails
unigramExt3000Train = unigramExt3000[:3620]
unigramExt3000Test = unigramExt3000[3620:]

## **Naive Bayes Classifier in NLTK**

Compute Precision, Recall, F1 and Plot Confusion Matrix

In [None]:
# function to compute precision, recall, and f1 for each label and for any number of labels
# param gold: list of strings where each element is a gold label
# param predicted: list of strings where each element is a predicted label (in same order)
# output: prints precision, recall, f1 for each class, one row per label in order of first appearance in gold
# raises ValueError when gold and predicted differ in length
def eval_measures(gold, predicted):
    if len(gold) != len(predicted):
        raise ValueError("gold and predicted must have the same length")
    # distinct labels in first-seen order, so the rows print in a repeatable order
    labels = list(dict.fromkeys(gold))
    # initialize list of class-specific scores
    precision_list, recall_list, f1_list = [], [], []
    for lab in labels:
        # for each label, compare gold and predicted lists and count outcomes
        TP = FP = FN = 0
        for actual, guess in zip(gold, predicted):
            if actual == lab and guess == lab:
                TP += 1
            elif actual == lab:
                FN += 1
            elif guess == lab:
                FP += 1
        # precision = TP / everything predicted as lab; recall = TP / everything truly lab
        # (the two denominators were previously swapped); a zero denominator scores 0
        precision = TP / (TP + FP) if (TP + FP) else 0.0
        recall = TP / (TP + FN) if (TP + FN) else 0.0
        f1 = 2 * (recall * precision) / (recall + precision) if (recall + precision) else 0.0
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
    # the evaluation measures in a table with one row per label
    print('class\tPrecision\tRecall\tF1\n')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(f1_list[i]))
        
# function to perform cross-validation and model evaluation for experiment 1
# param num_folds: integer specifying number of iterations
# param featuresets: list of (feature dictionary, label) pairs where each element is an email
# output: prints the fold size, each fold's accuracy, and the mean accuracy
# folds are consecutive slices; items left over after num_folds equal slices are never tested
def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size, '\n')
    accuracy_list = []
    for i in range(num_folds):
        fold_start = i*subset_size
        fold_end = (i+1)*subset_size
        # hold out this fold and train on everything before and after it
        test_this_round = featuresets[fold_start:fold_end]
        train_this_round = featuresets[:fold_start] + featuresets[fold_end:]
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, 'accuracy:', accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)
# function to plot confusion matrix
# param gold: list of strings where each element is a gold label
# param predicted: list of strings where each element is a predicted label (in same order)
# output: plots confusion matrix with sklearn; rows are actual labels, columns are predicted labels
def getCM(gold, predicted):
    from sklearn.metrics import confusion_matrix
    # name the axes with the real class labels instead of the positions 0/1;
    # sorted order matches the order confusion_matrix uses by default
    class_names = sorted(set(gold) | set(predicted))
    cm = confusion_matrix(gold, predicted, labels = class_names)
    # plot heatmap; the labelled DataFrame supplies the tick labels
    fig, ax = plt.subplots()
    sns.heatmap(pd.DataFrame(cm, index = class_names, columns = class_names),
                annot=True, cmap = "YlGnBu", fmt = 'g', ax = ax)
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')


# function to produce precision, recall, f1, and confusion matrix
# param train: training set of (feature dictionary, label) pairs
# param test: test set of (feature dictionary, label) pairs
# output: prints precision, recall, f1, and plots a confusion matrix
def getClassScores(train, test):
    # train a Naive Bayes classifier on the training pairs
    classifier = nltk.NaiveBayesClassifier.train(train)
    # pair every gold label with the classifier's prediction for the same email
    outcomes = [(label, classifier.classify(features)) for (features, label) in test]
    goldlist = [gold for (gold, _guess) in outcomes]
    predictedlist = [guess for (_gold, guess) in outcomes]
    # print evaluation measures
    eval_measures(goldlist, predictedlist)
    # plot confusion matrix
    getCM(goldlist, predictedlist)

# **Model Evaluation: Basic and Extensive Stopwords List**

In [None]:
# basic stopwords
# 5-fold cross-validation on training set
cross_validation_accuracy(5, unigramBasic3000Train)
# train a classifier and predict test set; get evaluation metrics; plot confusion matrix
getClassScores(unigramBasic3000Train, unigramBasic3000Test)

# extensive stopwords
# 5-fold cross-validation on training set
cross_validation_accuracy(5, unigramExt3000Train)
# train a classifier and predict test set; get evaluation metrics; plot confusion matrix
getClassScores(unigramExt3000Train, unigramExt3000Test)

# **Experiment 2: Testing the Effectiveness of Bigrams**

The objective of this experiment is to test whether adding bigrams provides for improved classification results relative to just utilizing unigrams.

Baseline featureset:

- Top 3000 unigrams by frequency with basic stopwords applied, as demonstrated in Experiment 1
Test featureset:

- Top 3000 unigrams by frequency with basic stopwords exclusion
- Top 1000 bigrams scored by frequency with basic stopwords exclusion
Classifier: Naive Bayes Classifier from NLTK

In [None]:
# function to select the bigrams used as features
# param documents: a list of tuples where the first item of each tuple is the tokenized email text
# param stopwords: a list of strings where each element is a stopword
# returns a list of at most 1000 tuples where each element is a bigram feature
def getBigramFeatures(documents, stopwords):
    # lower-case every token and chain all emails into one sequence
    # (bigrams are therefore also formed across email boundaries)
    lowered = []
    for (email, cat) in documents:
        lowered.extend(word.lower() for word in email)
    finder = BigramCollocationFinder.from_words(lowered) # scorer
    finder.apply_word_filter(alpha_filter) # exclude non-alphabetic words
    finder.apply_word_filter(lambda w: w in stopwords) # exclude stop words
    # score by raw frequency and keep the first 1000 bigrams of the scored list
    scored = finder.score_ngrams(BigramAssocMeasures().raw_freq)
    return [bigram for (bigram, score) in scored[:1000]]

# [feature definition function: experiment 2] function to get document features (applicable for unigrams and bigrams)
# param document: a list of strings representing a tokenized email
# param word_features: a list of strings against which the tokens in document are matched
# param bigram_features: a list of tuples where each element is a bigram
# returns a dictionary with a boolean 'V_<word>' key per word feature and a boolean
# 'B_<first>_<second>' key per bigram feature
def bigram_document_features(document, word_features, bigram_features):
    tokens = list(document)
    document_words = set(tokens)
    # materialize the adjacent token pairs as a set: nltk.bigrams returns a one-shot
    # generator, so repeated `in` tests exhausted it and later bigrams were all reported False
    document_bigrams = set(zip(tokens, tokens[1:]))
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (tuple(bigram) in document_bigrams)
    return features

# function to build the (features, label) pairs for modeling in Experiment 2
# param documents: a list of (tokens, label) tuples
# param stopwords: a list of stopwords to exclude when the word and bigram features are chosen
# returns one (unigram + bigram feature dictionary, label) tuple per document, in the same order
def getFeatureSets2(documents, stopwords):
    # both vocabularies are chosen once, from all documents passed in
    vocabulary = getWordFeatures(documents, stopwords)
    bigram_vocabulary = getBigramFeatures(documents, stopwords)
    featuresets = []
    for (tokens, category) in documents:
        featuresets.append((bigram_document_features(tokens, vocabulary, bigram_vocabulary), category))
    return featuresets

Feature Extraction
- Two feature sets that either use unigrams only or unigrams + bigrams
    - unigramBasic3000 uses 3000 word features having excluded basic stopwords (defined in Experiment 1)
    - bigramBasic4000 uses 3000 word features and 1000 bigram features having excluded basic stopwords

In [None]:
# get feature sets for documents using basic stopwords
bigramBasic4000 = getFeatureSets2(documents, stopwords1)
# split training and test sets 70/30
# NOTE(review): 3620 is a hard-coded row count, presumably 70% of the corpus - confirm against len(documents)
bigramBasic4000Train = bigramBasic4000[:3620]
bigramBasic4000Test = bigramBasic4000[3620:]

# **Model Evaluation: Unigrams**

In [None]:
# baseline for Experiment 2: the same calls as the basic-stopwords run in Experiment 1
# 5-fold cross-validation on training set
cross_validation_accuracy(5, unigramBasic3000Train)
# train a classifier and predict test set; get evaluation metrics; plot confusion matrix
getClassScores(unigramBasic3000Train, unigramBasic3000Test)

# **Model Evaluation: Unigrams + Bigrams**

In [None]:
# test feature set for Experiment 2: unigram + bigram features
# 5-fold cross-validation on training set
cross_validation_accuracy(5, bigramBasic4000Train)
# train a classifier and predict test set: get evaluation metrics; plot confusion matrix
getClassScores(bigramBasic4000Train, bigramBasic4000Test)

# **Experiment 3: Testing the Effectiveness of Part-of-Speech Tags**
The objective of this experiment is to test whether adding POS tags provides for improved classification results relative to utilizing unigrams and bigrams. In earlier exploration of POS tag frequencies for SPAM and HAM documents, it was discovered that SPAM documents generally had higher frequencies of nouns and adjectives, and lower frequencies of verbs. This was gleaned at a corpus level, but this experiment will extract features at a document level to determine if they are useful for identifying SPAM.

Baseline featureset:

- Top 3000 unigrams and top 1000 bigrams based on frequency with basic stopwords applied, as demonstrated in Experiment 2
Test featureset:

- Top 3000 unigrams by frequency with basic stopwords exclusion
- Top 1000 bigrams scored by frequency with basic stopwords exclusion
- Normalized POS tag frequency (no stopwords removed) that aggregates for nouns, verbs, adjectives, and adverbs
  - The default NLTK tagger (`nltk.pos_tag`, an averaged perceptron tagger that outputs Penn Treebank tags) will be used
  - POS tags are known to be effective in certain circumstances such as with shorter sentence-level or social media posts (e.g. tweets)
  
Classifier: Naive Bayes Classifier from NLTK

In [None]:
# [feature definition function: experiment 3] function to get document features (applicable for unigrams, bigrams, and pos tags)
# param document: a list of strings representing a tokenized email
# param word_features: a list of strings against which the tokens in document are matched
# param bigram_features: a list of tuples where each element is a bigram
# returns a dictionary with a boolean 'V_<word>' key per word feature, a boolean
# 'B_<first>_<second>' key per bigram feature, and four normalized POS frequencies
# (all four are 0.0 for an email without tokens)
def pos_document_features(document, word_features, bigram_features):
    tokens = list(document)
    document_words = set(tokens) # unigrams
    # materialize the adjacent token pairs as a set: nltk.bigrams returns a one-shot
    # generator, so repeated `in` tests exhausted it and later bigrams were all reported False
    document_bigrams = set(zip(tokens, tokens[1:])) # bigrams
    # an empty email is not tagged at all
    document_pos = [t[1] for t in nltk.pos_tag(tokens)] if tokens else [] # pos tags
    features = {}
    # unigram features
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    # bigram features
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (tuple(bigram) in document_bigrams)
    # pos features: the first letter of a tag selects the class
    noun_count, verb_count, adj_count, adv_count = 0, 0, 0, 0
    for tag in document_pos:
        if tag.startswith('N'): noun_count += 1
        if tag.startswith('V'): verb_count += 1
        if tag.startswith('J'): adj_count += 1
        if tag.startswith('R'): adv_count += 1
    # normalize by the number of tagged tokens; guard the empty email against division by zero
    total = len(document_pos)
    features['noun_count_norm'] = noun_count / total if total else 0.0
    features['verb_count_norm'] = verb_count / total if total else 0.0
    features['adj_count_norm'] = adj_count / total if total else 0.0
    features['adv_count_norm'] = adv_count / total if total else 0.0
    return features

# function to build the (features, label) pairs for modeling in Experiment 3
# param documents: a list of (tokens, label) tuples
# param stopwords: a list of stopwords to exclude when the word and bigram features are chosen
# returns one (unigram + bigram + POS feature dictionary, label) tuple per document, in the same order
def getFeatureSets3(documents, stopwords):
    # both vocabularies are chosen once, from all documents passed in
    vocabulary = getWordFeatures(documents, stopwords)
    bigram_vocabulary = getBigramFeatures(documents, stopwords)
    featuresets = []
    for (tokens, category) in documents:
        featuresets.append((pos_document_features(tokens, vocabulary, bigram_vocabulary), category))
    return featuresets

Feature Extraction
- Two feature sets that either use (unigrams + bigrams) or (unigrams + bigrams + normalized pos tag frequencies)
  - bigramBasic4000 uses 3000 word features and 1000 bigram features having excluded basic stopwords (defined in Experiment 2)
  - posBasic4000 uses 3000 word features, 1000 bigram features (as defined in Experiment 2), and 4 additional pos tag features

In [None]:
# build the Experiment 3 feature sets (unigrams + bigrams + POS tag ratios) using basic stopwords
posBasic4000 = getFeatureSets3(documents, stopwords1)
# 70/30 split: the first 3620 feature sets form the training set, the remainder the test set
posBasic4000Train, posBasic4000Test = posBasic4000[:3620], posBasic4000[3620:]

## **Model Evaluation: Unigrams + Bigrams**


In [None]:
# baseline for Experiment 3: unigram + bigram feature sets (bigramBasic4000*, built in Experiment 2)
# 5-fold cross-validation on training set
cross_validation_accuracy(5, bigramBasic4000Train)
# train a classifier and predict test set: get evaluation metrics; plot confusion matrix
getClassScores(bigramBasic4000Train, bigramBasic4000Test)

**Model Evaluation: Unigrams + Bigrams + POS Tags**

In [None]:
# test condition for Experiment 3: unigram + bigram + POS tag feature sets (posBasic4000*)
# 5-fold cross-validation on training set
cross_validation_accuracy(5, posBasic4000Train)
# train a classifier and predict test set: get evaluation metrics; plot confusion matrix
getClassScores(posBasic4000Train, posBasic4000Test)


# **Experiment 4: Additional Corpus Statistics as Features**
The objective of this experiment is to test whether adding certain corpus statistics provides for improved classification results, where the "corpus" in this context is an email. In earlier exploration it was observed that SPAM emails were generally longer than HAM emails and contain more unique words. They also measured higher in lexical richness.

Lexical richness, character count, and mean word length will be the features to include and test:

1. Lexical richness is the ratio of unique word count to total word count
2. Character count is the total number of characters in the email
3. Mean word length is the average number of characters per word for every word in the email

For the purposes of this experiment, stopwords will not be removed for the extraction of these three features.

Baseline featureset:

- 3000 unigrams, 1000 bigrams, and POS tags as defined in Experiment 3

Test featureset:

- Top 3000 unigrams by frequency with basic stopwords exclusion
- Top 1000 bigrams scored by frequency with basic stopwords exclusion
- Normalized POS tag frequency (no stopwords removed) that aggregates for nouns, verbs, adjectives, and adverbs using the default NLTK tagger (`nltk.pos_tag`, an averaged perceptron tagger)
- Lexical richness, email character count, and mean word length

Classifier: Naive Bayes Classifier from NLTK

In [None]:

# [feature definition function: experiment 4] function to get document features
# param document: a list of strings representing a tokenized email
# param word_features: a list of strings against which the tokens in document are matched
# param bigram_features: a list of tuples where each element is a bigram
# returns a dictionary holding boolean 'V_<word>' / 'B_<w1>_<w2>' presence flags, four '<pos>_count_norm'
#   POS tag ratios, and the corpus statistics 'lexical_richness', 'total_char_count', 'mean_word_length'
#   (all ratios are 0.0 for an empty document)
def document_features4(document, word_features, bigram_features):
    document_words = set(document) # unigrams
    # nltk.bigrams returns a one-shot generator: every `in` test against it consumes items,
    # so later lookups would only see the leftover tail; materialize it once as a set instead
    document_bigrams = set(nltk.bigrams(document)) # bigrams
    document_pos = [t[1] for t in nltk.pos_tag(document)] # pos tags
    features = {}
    # unigram features
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    # bigram features (tuple() keeps the lookup hashable even if a bigram arrives as a list)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (tuple(bigram) in document_bigrams)
    # pos features: tags are grouped by their first letter
    # NOTE(review): the 'R' prefix also matches the particle tag 'RP', not only adverbs ('RB*') - confirm this is intended
    tag_total = len(document_pos) or 1 # avoid ZeroDivisionError on an empty document
    for name, prefix in (('noun', 'N'), ('verb', 'V'), ('adj', 'J'), ('adv', 'R')):
        tag_hits = sum(1 for tag in document_pos if tag.startswith(prefix))
        features['{}_count_norm'.format(name)] = tag_hits / tag_total
    # corpus statistics (the character total is computed once and reused for the mean)
    token_total = len(document)
    char_total = sum(len(word) for word in document)
    features['lexical_richness'] = len(document_words) / token_total if token_total else 0.0 # lexical richness of email
    features['total_char_count'] = char_total # total character count of email
    features['mean_word_length'] = char_total / token_total if token_total else 0.0 # mean word length of email
    return features

# function to get feature sets for modeling in Experiment 4
# param documents: a list of (tokenized email, label) tuples
# param stopwords: stopwords handed on to the unigram and bigram feature selection
# returns a list of (feature dictionary, label) tuples
def getFeatureSets4(documents, stopwords):
    # unigram and bigram vocabularies, both selected with the specified stopwords
    unigram_vocab = getWordFeatures(documents, stopwords)
    bigram_vocab = getBigramFeatures(documents, stopwords)
    # pair each email's feature dictionary with its label
    labeled_features = []
    for tokens, label in documents:
        labeled_features.append((document_features4(tokens, unigram_vocab, bigram_vocab), label))
    return labeled_features

**Feature Extraction**
- Two feature sets that either use (unigrams + bigrams + normalized pos tag frequencies) or (unigrams + bigrams + normalized pos tag frequencies + corpus statistics)
  - posBasic4000 uses 3000 word features, 1000 bigram features (as defined in Experiment 2), and 4 additional pos tag features (defined in Experiment 3)
  - corpStats includes features defined in Experiment 3 as well as additional corpus statistics

**Model Evaluation: Unigrams + Bigrams + POS Tags**

In [None]:
# baseline for Experiment 4: repeats the Experiment 3 evaluation of the posBasic4000* feature sets
# 5-fold cross-validation on training set
cross_validation_accuracy(5, posBasic4000Train)
# train a classifier and predict test set: get evaluation metrics; plot confusion matrix
getClassScores(posBasic4000Train, posBasic4000Test)

**Model Evaluation: Unigrams + Bigrams + POS Tags + Corpus Statistics**

In [None]:
# test condition for Experiment 4: feature sets with the additional corpus statistics
# NOTE(review): corpStatsTrain / corpStatsTest are not assigned in the nearby cells - presumably built from
# getFeatureSets4 output with the same 3620 split; confirm they exist before running this cell
# 5-fold cross-validation on training set
cross_validation_accuracy(5, corpStatsTrain)
# train a classifier and predict test set: get evaluation metrics; plot confusion matrix
getClassScores(corpStatsTrain, corpStatsTest)

# **Experiment 5: Comparison of Classification Algorithms in NLTK and Sci-Kit Learn**
The objective of this experiment is to test the performance of a different classification algorithm and compare it to the best baseline model from Experiment 4. The featureset content will essentially remain unchanged, but the comparison will be on the different classifiers.

Common featureset for both classifiers:

- Top 3000 unigrams by frequency with basic stopwords exclusion
- Top 1000 bigrams scored by frequency with basic stopwords exclusion
- Normalized POS tag frequency (no stopwords removed) that aggregates for nouns, verbs, adjectives, and adverbs using the default NLTK tagger (`nltk.pos_tag`, an averaged perceptron tagger)
- Lexical richness, email character count, and mean word length

The featureset may be formatted to an array/sparse matrix to comply with the Sci-Kit Learn classifier specifications. For the purposes of this experiment, the default modeling tuning parameters will be used.

Baseline algorithm/classifier:

- Naive Bayes Classifier from NLTK

Test algorithm/classifier:

- Linear SVC (support vector classification)


# **Feature Set Conversion for Sci-Kit Learn Classifier**

In [None]:
# Using pandas: separate the feature dictionaries from their labels
features = [pair[0] for pair in corpStats]
labels = [pair[1] for pair in corpStats]
# one row per email, one column per feature key
X = pd.DataFrame(features)
y = np.array(labels)
# train / test split (70/30): the first 3620 rows train, the remainder test
X_train, X_test = X.iloc[:3620], X.iloc[3620:]
y_train, y_test = y[:3620], y[3620:]

**Model Evaluation: Naive Bayes Classifier from NLTK**

- 10-fold cross-validation accuracy, precision, recall, F1, and confusion matrix

In [None]:
# baseline for Experiment 5: NLTK Naive Bayes on the corpus-statistics feature sets
# 10-fold cross-validation on training set
cross_validation_accuracy(10, corpStatsTrain)
# train a classifier and predict test set: get evaluation metrics; plot confusion matrix
getClassScores(corpStatsTrain, corpStatsTest)

**Model Evaluation: Linear SVC from Sci-Kit Learn**
- 10-fold cross-validation accuracy, precision, recall, F1, and confusion matrix

In [None]:
# configure the classifier (it is not fit here): L1-penalized linear SVC solved in the primal (dual=False),
# with class weights balanced by class frequency
classifier = LinearSVC(C=1, penalty='l1', dual=False, class_weight='balanced')
# 10-fold cross-validation on training set
# seed NumPy's global random state so repeated runs of this cell start from the same state
np.random.seed(111)
# y_pred holds one cross-validated prediction per training row
y_pred = cross_val_predict(classifier, X_train, y_train, cv=10)

In [None]:
# classification report of cross validation results from training set
# (compares the true training labels with the cross-validated predictions y_pred)
print(classification_report(y_train, y_pred))

In [None]:
# fit the classifier on the full training set, then predict the held-out test set
svm = classifier.fit(X_train, y_train)
preds = svm.predict(X_test)
# evaluation measures
# (eval_measures and getCM are helpers defined elsewhere in the notebook; presumably they report
# metrics and plot the confusion matrix for true vs. predicted labels - confirm against their definitions)
eval_measures(y_test, preds)
getCM(y_test, preds)