In [1]:
# Go to source root.
# The target is relative to the kernel's current working directory; the module
# import and the '../data/...' paths in the next cell are resolved from here.
# Re-running this cell from inside src/ resolves '../src/' to the same directory.
import os
os.chdir('../src/')

In [2]:
from email_ingestion import EmailIngester

# Raw email folders, given relative to the current working directory.
ham_path = '../data/raw/ham/beck-s/'
spam_path = '../data/raw/spam/GP/'

ingester = EmailIngester(tokenize=False)  # No tokenization. Will use sklearn.
# One ingest_folder call per class. The results are later iterated as
# collections of dict-like records and concatenated with `+`
# (presumably plain lists of dicts - verify against EmailIngester).
ham_data = ingester.ingest_folder(ham_path)
spam_data = ingester.ingest_folder(spam_path)

Processed 1966 / 1966 emails (1.274s).
Processed 13705 / 13719 emails (23.198s).


In [3]:
def add_label(data, is_spam=False):
    """Stamp every record of ``data`` with an ``'is_spam'`` flag, in place.

    Args:
        data: Iterable of mutable mappings (one per email).
        is_spam: Value stored under the ``'is_spam'`` key of each record.

    Returns:
        None. The records themselves are modified.
    """
    for entry in data:
        entry['is_spam'] = is_spam
        
# Label each set in place, then build the combined dataset (ham first, spam
# after). The concatenation does not copy the records, so email_data refers to
# the same labelled record objects as ham_data and spam_data.
add_label(ham_data, is_spam=False)
add_label(spam_data, is_spam=True)
email_data = ham_data + spam_data

### Corpus indexing
Before generating bag-of-words features, the entire corpus of tokens needs to be indexed.

In [None]:
def generate_corpus(data):
    """Build the token -> count corpus for ``data`` and prune it.

    Tokens are counted with ``count_words``; tokens seen fewer than 5 times
    and tokens longer than 20 characters are then removed.

    Args:
        data: Collection of email records passed on to ``count_words``.

    Returns:
        dict: Mapping of each surviving token to its occurrence count.
    """
    corpus = count_words(data)

    # Each pruning step edits the dict in place; the order matches the
    # original (frequency filter first, then length filter).
    pruning_steps = (
        (filter_infrequent_words, 5),
        (filter_long_words, 20),
    )
    for prune, limit in pruning_steps:
        prune(corpus, threshold=limit)

    return corpus


def count_words(data):
    """Count how often each token occurs across the records in ``data``.

    Records that have no ``'tokens'`` entry are skipped.

    Args:
        data: Iterable of mappings; a record's ``'tokens'`` entry, when
            present, is an iterable of hashable tokens.

    Returns:
        dict: Mapping of token -> number of occurrences over all records.
    """
    word_counts = {}

    # Iterate over the argument. The previous version looped over the global
    # `spam_data`, so the parameter was ignored and ham was never counted.
    for record in data:
        if 'tokens' not in record:
            continue

        for token in record['tokens']:
            word_counts[token] = word_counts.get(token, 0) + 1

    return word_counts


def filter_infrequent_words(word_counts, threshold=5):
    """Remove, in place, every word whose count is below ``threshold``.

    Args:
        word_counts: dict mapping word -> count; modified in place.
        threshold: Minimum count a word needs in order to be kept.

    Returns:
        None.
    """
    # Collect first, delete afterwards: a dict must not change size while it
    # is being iterated.
    rare = [word for word, count in word_counts.items() if count < threshold]

    for word in rare:
        del word_counts[word]
        
def filter_long_words(word_counts, threshold=20):
    """Remove, in place, every word longer than ``threshold`` characters.

    Args:
        word_counts: dict mapping word -> count; modified in place.
        threshold: Maximum length a word may have in order to be kept.

    Returns:
        None.
    """
    # Collect first, delete afterwards: a dict must not change size while it
    # is being iterated.
    too_long = [word for word in word_counts if len(word) > threshold]

    for word in too_long:
        del word_counts[word]

### Characteristics of unigrams

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# TODO No longer works b/c tokenization: the ingester above is created with
# tokenize=False, and count_words skips records that have no 'tokens' entry
# (presumably leaving the corpus empty - confirm against EmailIngester).
corpus = generate_corpus(email_data)

# Distribution of token counts (the corpus maps token -> count).
word_freq = pd.Series(corpus)
word_freq.hist(bins=range(5, 50))
plt.title('Word frequencies')
plt.show()

# Distribution of token lengths. A concrete list is passed to pd.Series
# rather than a one-shot `map` iterator, which not every pandas version accepts.
word_len = pd.Series([len(word) for word in corpus])
word_len.hist(bins=range(3, 20))
plt.title("Word length")
plt.show()

print("Corpus length:", len(corpus))
word_freq.sort_values()



### Actual feature generation

In [4]:
from sklearn.feature_extraction.text import HashingVectorizer

# Build one text document per email: the subject line (when it is not None)
# followed by the body (when the record has one).
text_data = []
for email in email_data:
    parts = []
    subject = email['subject']
    if subject is not None:
        parts.append(subject + '\n')
    if 'body' in email:
        parts.append(email['body'])

    text_data.append(''.join(parts))

# Labels, aligned index-for-index with text_data.
y = [email['is_spam'] for email in email_data]

# Random train/test split with a fixed seed.
# TODO Split on HAM date, not random.
# TODO Generate proper training data
from sklearn.model_selection import train_test_split
data_train, data_test, y_train, y_test = train_test_split(
    text_data, y, test_size=0.33, random_state=42
)

# Hash the documents into a fixed 2**16-dimensional feature space.
# NOTE(review): `non_negative` only exists in older scikit-learn releases -
# check the installed version before changing it.
vectorizer = HashingVectorizer(
    stop_words='english',
    non_negative=True,
    n_features=2**16,
)
X_train, X_test = (
    vectorizer.transform(documents) for documents in (data_train, data_test)
)


In [16]:
from time import time

def benchmark(clf):
    """Fit ``clf`` on the training split and print a HAM/SPAM evaluation.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the wall-clock fit and predict times, the recall of
    each class, the classification report and the confusion matrix. For
    estimators that expose ``coef_`` it also prints the coefficient
    dimensionality and density.

    Args:
        clf: Estimator providing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        None. All results are printed.
    """
    # Imported locally: in notebook order the cell that imports these names
    # comes after this one, so a top-to-bottom run would otherwise fail with
    # a NameError.
    from sklearn import metrics
    from sklearn.utils.extmath import density

    print('_' * 80)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time:   %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:    %0.3fs" % test_time)

    # Labels are booleans: True marks spam, False marks ham.
    spam_score = metrics.recall_score(y_test, pred, pos_label=True)
    ham_score = metrics.recall_score(y_test, pred, pos_label=False)
    print("Spam recall:  %0.3f" % spam_score)
    print("Ham recall:   %0.3f" % ham_score)
    print()

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=("HAM", "SPAM")))
    print()

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()


from sklearn.ensemble import RandomForestClassifier
# Fixed seed so that the forest, and therefore the printed scores, are repeatable.
clf = RandomForestClassifier(random_state=42)
# Run the evaluation; without this call the cell only defines names and
# prints nothing.
benchmark(clf)

________________________________________________________________________________
train time:   1.970s
test time:    0.045s
Spam recall:  0.998
Ham recall:   0.914

classification report:
             precision    recall  f1-score   support

        HAM       0.99      0.91      0.95       636
       SPAM       0.99      1.00      0.99      4536

avg / total       0.99      0.99      0.99      5172


confusion matrix:
[[ 581   55]
 [   7 4529]]



In [5]:
from sklearn import metrics
from sklearn.utils.extmath import density

def benchmark(clf):
    """Fit ``clf``, print its evaluation and return a summary tuple.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the estimator, the wall-clock fit and predict times,
    the accuracy, the classification report and the confusion matrix. For
    estimators that expose ``coef_`` it also prints the coefficient
    dimensionality and density.

    This redefines ``benchmark``: cells that run after this one use this
    accuracy-based version rather than the earlier per-class-recall one.

    Args:
        clf: Estimator providing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        tuple: ``(clf_descr, score, train_time, test_time)`` where
        ``clf_descr`` is the text of ``str(clf)`` before the first ``(``,
        ``score`` is the test accuracy and the two times are in seconds.
        The plotting cell indexes positions 0-3 of every result.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        # No "top keywords per class" listing: it relied on feature_names,
        # target_names, np and trim, none of which is defined in this
        # notebook, so every linear model raised a NameError here.
        print()

    print("classification report:")
    print(metrics.classification_report(y_test, pred))
    print()

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier


# SelectFromModel turns a fitted linear model into a feature-selecting
# transformer; a bare LinearSVC is not a valid intermediate Pipeline step in
# scikit-learn releases where it no longer has a transform() method.
from sklearn.feature_selection import SelectFromModel

# Benchmark a range of classifiers; each call appends whatever benchmark()
# returns for that model.
# NOTE(review): `n_iter` was renamed `max_iter` in later scikit-learn
# releases; it is kept here because the installed version is not visible
# from this notebook.
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model ('squared_hinge' is the current name of the
    # deprecated 'l2' loss alias).
    results.append(benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                                       dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty=penalty)))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                       penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
                                                  tol=1e-3))),
  ('classification', LinearSVC())
])))

# Plot score, training time and test time for every benchmarked classifier.
# numpy is imported here because no other cell of the notebook imports it.
import numpy as np

indices = np.arange(len(results))

# Transpose the per-classifier result rows into four parallel columns.
results = [[x[i] for x in results] for i in range(4)]

clf_names, score, training_time, test_time = results
# Scale both timings by their maximum so they share the score axis.
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='navy')
plt.barh(indices + .3, training_time, .2, label="training time",
         color='c')
plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)

# Write each classifier name to the left of its group of bars.
for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)

plt.show()