# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [16]:
import os
import io
import numpy 
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})
spam_emails = dataFrameFromDirectory('/home/akai/projects/machine-learning-hands-on-python/DataScience/emails/spam', 'spam')
msk = numpy.random.rand(len(spam_emails)) < 0.8
training_set = spam_emails[msk]
test_set = spam_emails[~msk]
data = data.append(training_set)
data = data.append(dataFrameFromDirectory('/home/akai/projects/machine-learning-hands-on-python/DataScience/emails/ham', 'ham'))


Let's have a look at that DataFrame:

In [23]:
data.head()
training_set.head()
test_set.head()

/home/akai/projects/machine-learning-hands-on-python/DataScience/emails/spam/00179.2174c80cb3eff623dfc991e51a53eb99    iieo\n\n<html>\n\n<head>\n\n<title>Mortgage co...
/home/akai/projects/machine-learning-hands-on-python/DataScience/emails/spam/00075.28a918cd03a0ef5aa2f1e0551a798108    Dear Sir / Madam\n\n\n\nIf you are fed up of b...
/home/akai/projects/machine-learning-hands-on-python/DataScience/emails/spam/00147.1782d51354c31ea53db25ea927d5c51d    <HTML><HEAD><TITLE>Big and big</TITLE>\n\n<MET...
/home/akai/projects/machine-learning-hands-on-python/DataScience/emails/spam/00057.0a2e17bde9485e999ac2259df38528e2    Lowest rates available for term life insurance...
/home/akai/projects/machine-learning-hands-on-python/DataScience/emails/spam/00093.ca4edc32d2ff8e1dbb5f9c0b15ec435b    Get your favorite Poker action at http://www.m...
/home/akai/projects/machine-learning-hands-on-python/DataScience/emails/spam/00226.e0e2704cde3bbd561a98042f4a3baf5f    Dear Sir or Madam,\n\n\n\nMy name is

Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [30]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)
print counts.toarray()
classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

[[0 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Let's try it out:

In [20]:
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['ham', 'ham'], 
      dtype='|S4')

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [24]:
test_counts = vectorizer.transform(test_set.message)
predictions = classifier.predict(test_counts)
predictions

array(['spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam',
       'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam',
       'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam',
       'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham',
       'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam',
       'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham',
       'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam',
       'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham',
       'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'ham', 'spam',
       'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam',
       'ham', 'ham', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam',
       'spam'], 
      dtype='|S4')