### Classifying emails using the naive Bayes classifier

http://yudkowsky.net/rational/bayes

1) The Ling-Spam corpus: http://csmining.org/index.php/ling-spam-datasets.html

2) The Hewlett-Packard spam database: https://archive.ics.uci.edu/ml/machine-learning-databases/spambase

*3) The Enron-Spam dataset: http://www.aueb.gr/users/ion/data/enron-spam*

4) The Apache SpamAssassin public corpus: http://csmining.org/index.php/spam-assassin-datasets.html

In [1]:
# Integer class labels attached to every email.
HAM, SPAM = 0, 1

# Directory that holds the downloaded archives (and receives their contents).
datadir = './data/'

# Archive file names, grouped by the label given to every email they contain.
ham_archives = [
    'beck-s.tar.gz',
    'farmer-d.tar.gz',
    'kaminski-v.tar.gz',
    'kitchen-l.tar.gz',
    'lokay-m.tar.gz',
    'williams-w3.tar.gz',
]
spam_archives = ['BG.tar.gz', 'GP.tar.gz', 'SH.tar.gz']

# (archive name, label) pairs: all ham archives first, then all spam archives.
sources = (
    [(archive, HAM) for archive in ham_archives]
    + [(archive, SPAM) for archive in spam_archives]
)

In [2]:
def extract_tar(datafile, extractdir):
    """Extract every member of a tar archive into a directory.

    Parameters
    ----------
    datafile : str
        Path of the archive to open. It is opened with ``tarfile.open`` in
        its default read mode.
    extractdir : str
        Directory the archive members are extracted into.

    Raises
    ------
    ImportError
        If the ``tarfile`` module cannot be imported.

    Any error raised by ``tarfile.open`` or ``extractall`` (for example a
    missing or unreadable archive) propagates to the caller; the success
    message is printed only when extraction completed.
    """
    try:
        import tarfile
    except ImportError:
        raise ImportError("You do not have tarfile installed. "
                          "Try unzipping the file outside of Python.")

    # The context manager closes the archive even when extraction fails
    # part-way, so the file handle is never leaked.
    with tarfile.open(datafile) as tar:
        # NOTE: extractall() writes members to the paths stored in the
        # archive; only use this on archives from a trusted source.
        tar.extractall(path=extractdir)
    print("%s successfully extracted to %s" % (datafile, extractdir))

# Unpack every archive listed in `sources` into the data directory itself.
for source, _ in sources:
    datafile = '/'.join((datadir, source))
    extract_tar(datafile, datadir)

./data//beck-s.tar.gz successfully extracted to ./data/
./data//farmer-d.tar.gz successfully extracted to ./data/
./data//kaminski-v.tar.gz successfully extracted to ./data/
./data//kitchen-l.tar.gz successfully extracted to ./data/
./data//lokay-m.tar.gz successfully extracted to ./data/
./data//williams-w3.tar.gz successfully extracted to ./data/
./data//BG.tar.gz successfully extracted to ./data/
./data//GP.tar.gz successfully extracted to ./data/
./data//SH.tar.gz successfully extracted to ./data/


In [3]:
import os
def read_single_file(filename):
    """Read one email file and return its body without the header block.

    Everything up to and including the first empty line is treated as the
    header and skipped; the remaining lines are returned as one string.

    Parameters
    ----------
    filename : str
        Path of the file to read. It is decoded as latin-1.

    Returns
    -------
    tuple of (str, str)
        ``(filename, content)``. ``content`` is the empty string when
        ``filename`` is not a regular file or when the file contains no
        empty line separating header from body.
    """
    if not os.path.isfile(filename):
        return filename, ''

    body_lines = []
    past_header = False
    # `with` guarantees the handle is closed even if reading fails.
    with open(filename, encoding="latin-1") as f:
        for line in f:
            if past_header:
                body_lines.append(line)
            elif line == '\n':
                # First empty line: the header ends here.
                past_header = True

    # Lines keep their own trailing newline, so they are concatenated
    # directly; joining with '\n' would double every line break.
    return filename, ''.join(body_lines)

In [4]:
def read_files(path):
    """Lazily yield the result of ``read_single_file`` for every file under ``path``.

    The directory tree is traversed with ``os.walk``; each file found is
    passed to ``read_single_file`` by its joined path and the returned
    ``(filename, content)`` pair is yielded unchanged.
    """
    for folder, _subdirs, names in os.walk(path):
        yield from (
            read_single_file(os.path.join(folder, name)) for name in names
        )

In [5]:
import pandas as pd

In [6]:
# Overview table: three naive Bayes variants and the call that creates each.
# As the last expression of the cell, the frame is displayed as its output.
pd.DataFrame({
    'model': ['Normal Bayes', 'Multinomial Bayes', 'Bernoulli Bayes'],
    'class': [
        'cv2.ml.NormalBayesClassifier_create()',
        'sklearn.naive_bayes.MultinomialNB()',
        'sklearn.naive_bayes.BernoulliNB()'
    ]
})

Unnamed: 0,class,model
0,cv2.ml.NormalBayesClassifier_create(),Normal Bayes
1,sklearn.naive_bayes.MultinomialNB(),Multinomial Bayes
2,sklearn.naive_bayes.BernoulliNB(),Bernoulli Bayes


In [7]:
def build_data_frame(extractdir, classification):
    """Collect every email under ``extractdir`` into one labelled DataFrame.

    Parameters
    ----------
    extractdir : str
        Directory that is handed to ``read_files``.
    classification
        Label stored in the ``'class'`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per file, indexed by the file name reported by
        ``read_files``, with the columns ``'text'`` and ``'class'``.
    """
    emails = list(read_files(extractdir))
    records = [{'text': body, 'class': classification} for _, body in emails]
    file_names = [name for name, _ in emails]
    return pd.DataFrame(records, index=file_names)

In [8]:
# Build one labelled frame per archive and concatenate them in a single call.
# DataFrame.append was removed in pandas 2.0, and calling it in a loop copied
# the whole growing frame on every iteration.
frames = []
for source, classification in sources:
    # Assumes each archive was unpacked into a directory named like the
    # archive without its '.tar.gz' suffix.
    extractdir = '%s/%s' % (datadir, source[:-len('.tar.gz')])
    frames.append(build_data_frame(extractdir, classification))

# Rows keep the file-name index and the 'text' / 'class' columns of the parts.
data = pd.concat(frames)

In [9]:
from sklearn import feature_extraction
# Bag-of-words features: fit_transform learns the vocabulary from all email
# bodies and returns one row of token counts per email.
counts = feature_extraction.text.CountVectorizer()
X = counts.fit_transform(data['text'].values)
# Displayed as (number of emails, number of features).
X.shape

(52076, 643270)

In [10]:
# Display the matrix summary (type, shape and number of stored elements).
X

<52076x643270 sparse matrix of type '<class 'numpy.int64'>'
	with 8607632 stored elements in Compressed Sparse Row format>

In [11]:
# Label array, one entry per row of `data` (HAM = 0, SPAM = 1).
y = data['class'].values

In [12]:
from sklearn import model_selection as ms
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y, test_size=0.2, random_state=42)

In [13]:
import cv2
# Create OpenCV's normal Bayes classifier; it is not trained in this cell.
model_norm = cv2.ml.NormalBayesClassifier_create()

In [14]:
import numpy as np
# Keep only the first 1000 training rows and the first 300 feature columns,
# then convert the sparse slice to a dense float32 array.
# NOTE(review): presumably the reduction is needed because the OpenCV
# classifier takes a dense float32 array and the full matrix is too large to
# densify -- confirm.
X_train_small = X_train[:1000, :300].toarray().astype(np.float32)
# Labels belonging to the same 1000 rows.
y_train_small = y_train[:1000]

In [15]:
# model_norm.train(X_train_small, cv2.ml.ROW_SAMPLE, y_train_small)

In [16]:
from sklearn import model_selection as ms
# Repeats the earlier split with identical arguments; X and y have not been
# reassigned since, so the resulting partitions are the same.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn import naive_bayes
# Multinomial naive Bayes trained on the full sparse count matrix
# (all training rows and all features).
model_naive = naive_bayes.MultinomialNB()
model_naive.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
# Mean accuracy on the data the model was fitted on.
model_naive.score(X_train, y_train)

0.9508641382621219

In [19]:
# Mean accuracy on the held-out test split.
model_naive.score(X_test, y_test)

0.9442204301075269

In [20]:
# Rebuild the features with ngram_range=(1, 2), i.e. single tokens plus pairs
# of consecutive tokens. This rebinds both `counts` and `X`.
counts = feature_extraction.text.CountVectorizer(ngram_range=(1, 2))
X = counts.fit_transform(data['text'].values)

In [21]:
from sklearn import model_selection
# X was rebuilt above, so the split is redone with the same test_size and
# random_state. `model_selection` is the module imported earlier as `ms`.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Fresh multinomial model fitted on the n-gram count features.
model_naive = naive_bayes.MultinomialNB()
model_naive.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
# Mean accuracy of the n-gram model on the held-out test split.
model_naive.score(X_test, y_test)

0.9706221198156681

In [24]:
# Transformer that turns a count matrix into TF-IDF weighted features.
tfidf = feature_extraction.text.TfidfTransformer()

In [25]:
# Re-weight the n-gram counts with TF-IDF.
# NOTE(review): the transformer is fitted on all of X before the train/test
# split, so its IDF weights also use samples that end up in the test set.
X_new = tfidf.fit_transform(X)

In [26]:
# Split the TF-IDF features with the same test_size and random_state as before.
X_train, X_test, y_train, y_test = ms.train_test_split(X_new, y, test_size=0.2, random_state=42)

In [27]:
# Fit a fresh multinomial model on the TF-IDF features; the last expression
# displays its mean accuracy on the held-out test split.
model_naive = naive_bayes.MultinomialNB()
model_naive.fit(X_train, y_train)
model_naive.score(X_test, y_test)

0.9908794162826421

In [28]:
from sklearn import metrics

In [29]:
# Confusion matrix on the test split: rows are the true classes, columns the
# predicted classes (first argument = true labels, second = predictions).
metrics.confusion_matrix(y_test, model_naive.predict(X_test))

array([[3746,   84],
       [  11, 6575]], dtype=int64)

Each row of the confusion matrix is a true class and each column a predicted class. This tells us we got 3,746 class 0 (ham) classifications correct, and 6,575 class 1 (spam) classifications correct. We misclassified 84 samples of class 0 as belonging to class 1 and 11 samples of class 1 as belonging to class 0.