* [Ling-Spam dataset](http://openclassroom.stanford.edu/MainFolder/DocumentPage.php?course=MachineLearning&doc=exercises/ex6/ex6.html)
  * Preprocessed
* [Bag of Words](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction)
  * CountVectorizer
    * binary=True
* [Multinomial vs Bernoulli Naive Bayes](http://scikit-learn.org/stable/modules/naive_bayes.html)
* [20 Newsgroups](http://scikit-learn.org/stable/datasets/index.html#the-20-newsgroups-text-dataset)

In [34]:
import glob

In [57]:
def read_ling_spam(directory, encoding="utf-8"):
    """Read every ``*.txt`` file of one Ling-Spam split into memory.

    Parameters
    ----------
    directory : str
        Name of the sub-directory below ``ling-spam/`` to read, e.g.
        ``"spam-train"``. The path is resolved relative to the current
        working directory.
    encoding : str, optional
        Text encoding used to decode the files. Passed explicitly so the
        result does not depend on the platform's default encoding.

    Returns
    -------
    list of str
        One string per file, ordered by file path. An empty list is
        returned when no file matches.
    """
    # glob yields paths in arbitrary, filesystem-dependent order; sorting
    # keeps document order (and anything indexed by position later on)
    # identical from run to run and machine to machine.
    paths = sorted(glob.glob("ling-spam/{}/*.txt".format(directory)))
    texts = []
    for path in paths:
        with open(path, encoding=encoding) as fh:
            texts.append(fh.read())
    return texts

# Load the four Ling-Spam splits in one pass; the directory names are listed
# in the same order as the variables they are unpacked into.
spam_train, non_spam_train, spam_test, non_spam_test = (
    read_ling_spam(split_dir)
    for split_dir in ("spam-train", "nonspam-train", "spam-test", "nonspam-test")
)

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

In [100]:
# Bag-of-words vectorizer created with its default settings; the bare name on
# the last line displays the resulting configuration as the cell output.
vectorizer = CountVectorizer()
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [60]:
# Non-spam texts come first, then spam; the label lists below are built in
# the same order so that row i of each matrix matches label i.
train_texts = non_spam_train + spam_train
test_texts = non_spam_test + spam_test

# Learn the vocabulary and build the training matrix in a single pass over
# the training texts instead of tokenizing them once for fit() and again for
# transform().
X_train = vectorizer.fit_transform(train_texts)
y_train = ['non-spam'] * len(non_spam_train) + ['spam'] * len(spam_train)

# The test texts are only transformed, so the vocabulary comes from the
# training texts alone.
X_test = vectorizer.transform(test_texts)
y_test = ['non-spam'] * len(non_spam_test) + ['spam'] * len(spam_test)

In [88]:
# Vocabulary terms for column indices 1000-1004. The list is rebuilt from the
# fitted vocabulary_ mapping (term -> column index), ordered by column index,
# because get_feature_names() no longer exists in recent scikit-learn
# releases; this form works on both old and new versions.
sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)[1000:1005]

['armidale', 'armin', 'armstrong', 'army', 'arnagardur']

In [62]:
# Display the training feature matrix (one row per training text).
X_train

<700x19073 sparse matrix of type '<class 'numpy.int64'>'
	with 110609 stored elements in Compressed Sparse Row format>

In [63]:
# Number of training texts and number of vocabulary terms; these are the two
# dimensions of X_train. vocabulary_ is used for the term count because
# get_feature_names() no longer exists in recent scikit-learn releases, and
# the lengths are added directly rather than concatenating the text lists.
print(len(spam_train) + len(non_spam_train), len(vectorizer.vocabulary_))

700 19073


In [91]:
# Print the stored entries of the first training text, restricted to the
# first 1000 vocabulary columns, as (row, column) count pairs.
print(X_train[0,:1000])

  (0, 846)	1


In [92]:
from sklearn.naive_bayes import MultinomialNB

In [93]:
# Multinomial Naive Bayes classifier with its default parameters.
classifier = MultinomialNB()

In [94]:
# Train on the vectorized training texts and their 'non-spam' / 'spam' labels.
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [95]:
# Score on the same data the classifier was fitted on; useful as a sanity
# check only, since the model has already seen these examples.
classifier.score(X_train, y_train)

0.99857142857142855

In [96]:
# Score on the held-out test split, which was not used for fitting.
classifier.score(X_test, y_test)

0.97692307692307689

In [98]:
# Predict the label of a single test text. Row 0 of X_test comes from
# non_spam_test, because the non-spam texts are placed first when X_test is
# built.
classifier.predict(X_test[0])

array(['non-spam'], 
      dtype='<U8')

In [102]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

In [111]:
# Chain raw text -> token counts -> tf-idf weights -> Naive Bayes so that a
# single object can be fitted on, and applied to, plain strings.
spam_pipe = Pipeline(steps=[
    ('bag_of_words', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB()),
])
# Echo the assembled pipeline as the cell output.
spam_pipe

Pipeline(steps=[('bag_of_words', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
     ...ear_tf=False, use_idf=True)), ('bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [108]:
# Fit the whole pipeline on the raw training texts (non-spam first, then
# spam, matching the order of y_train).
# NOTE(review): the output recorded for this cell lists only the
# 'bag_of_words' and 'bayes' steps, and its execution count (108) is lower
# than that of the cell defining spam_pipe (111). It appears to have been
# produced before the 'tfidf' step was added -- re-run the notebook top to
# bottom to refresh this and the following outputs.
spam_pipe.fit(non_spam_train + spam_train, y_train)

Pipeline(steps=[('bag_of_words', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [109]:
# Predict labels for the held-out non-spam texts only; every 'spam' in the
# result is therefore a legitimate message misclassified as spam.
spam_pipe.predict(non_spam_test)

array(['non-spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam', 'spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'spam', 'non-spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'spam', 'non-spam',
       'non-spam', 'spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam',
       'non-spam', 'non-spam', 'non-spam', 'non-spam', 'non-spam',
       'non-spam', 'non-spam',

In [110]:
# Score the pipeline on the full test split; the texts are concatenated
# non-spam first, then spam, which is the order y_test was built in.
spam_pipe.score(non_spam_test + spam_test, y_test)

0.97692307692307689