In [1]:
import glob

def read_ling_spam(directory, root='ling-spam', encoding='utf-8'):
    """Read every ``.txt`` message in one split of the Ling-Spam corpus.

    Args:
        directory: Name of the split folder under ``root``,
            e.g. ``'spam-train'`` or ``'nonspam-test'``.
        root: Folder that contains the split folders. Defaults to
            ``'ling-spam'``, resolved relative to the current working
            directory.
        encoding: Text encoding used to decode each file.

    Returns:
        A list of strings, one per matching file, ordered by sorted file
        path. The list is empty when no file matches.
    """
    # glob.glob() yields paths in arbitrary, filesystem-dependent order;
    # sorting makes the document order reproducible between runs and machines.
    paths = sorted(glob.glob('{}/{}/*.txt'.format(root, directory)))
    texts = []
    for path in paths:
        # An explicit encoding avoids depending on the platform default.
        # errors='replace' turns undecodable bytes into U+FFFD so a single
        # odd byte does not abort loading the whole split.
        with open(path, encoding=encoding, errors='replace') as f:
            texts.append(f.read())
    return texts

In [2]:
# Load the four corpus splits (spam / non-spam, train / test) in one pass;
# each name receives the list returned by read_ling_spam for its folder.
spam_train, ham_train, spam_test, ham_test = map(
    read_ling_spam,
    ('spam-train', 'nonspam-train', 'spam-test', 'nonspam-test'),
)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Bag-of-words vectorizer created with all-default settings.
vectorizer = CountVectorizer()
# Bare expression so the notebook displays the vectorizer's parameters.
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
# Fit on the training messages only; the test messages are not passed to fit().
vectorizer.fit(spam_train + ham_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [8]:
# Number of distinct terms learned by the fitted vectorizer. `vocabulary_`
# (term -> column index) is used instead of `get_feature_names()`, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
print('feature names:', len(vectorizer.vocabulary_))

feature names: 19073


In [11]:
# Last ten terms in feature-column order. Sorting the `vocabulary_` keys by
# their column index gives the same list that `get_feature_names()` returned,
# without relying on that method (removed in scikit-learn 1.2).
sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)[-10:]

['zus',
 'zusammengest',
 'zusammenstellung',
 'zvegintseva',
 'zwart',
 'zwischen',
 'zxgahqabjh',
 'zybatov',
 'zybatow',
 'zz']

In [10]:
# Labels: one 'ham' per non-spam message followed by one 'spam' per spam
# message, matching the ham-then-spam row order used for the matrices below.
y_train = ['ham' for _ in ham_train] + ['spam' for _ in spam_train]
y_test = ['ham' for _ in ham_test] + ['spam' for _ in spam_test]

# Vectorize both splits with the already-fitted vectorizer.
X_train = vectorizer.transform(ham_train + spam_train)
X_test = vectorizer.transform(ham_test + spam_test)

In [12]:
# Report how many messages each split contains (spam plus non-spam).
print('train rows:', len(spam_train) + len(ham_train))
print('test rows:', len(spam_test) + len(ham_test))

train rows: 700
test rows: 260


In [13]:
from sklearn.naive_bayes import MultinomialNB

In [14]:
# Multinomial naive Bayes classifier, constructed with its default parameters.
classifier = MultinomialNB()

In [15]:
# Train on the vectorized training messages and their 'ham'/'spam' labels.
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Evaluate the fitted classifier on both splits, then report the two scores.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)

print('train score:', train_score)
print('test score:', test_score)

train score: 0.998571428571
test score: 0.976923076923


In [17]:
# Hand-written example message for the spam case.
message = ['hi how are you i have a loan for you to consider today my friend is a prince has large sum of money']

# Vectorize with the fitted vectorizer, then classify.
message_counts = vectorizer.transform(message)
print('predict spam:', classifier.predict(message_counts))

predict spam: ['spam']


In [18]:
# Hand-written example message for the ham (non-spam) case.
message = ['i think the natural language processing features of scikit are very useful']

# Vectorize with the fitted vectorizer, then classify.
message_counts = vectorizer.transform(message)
print('predict ham:', classifier.predict(message_counts))

predict ham: ['ham']
