In [24]:
from zipfile import ZipFile
import pandas as pd
from sklearn import cross_validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re
from sklearn.pipeline import Pipeline

In [2]:
with ZipFile('./smsspamcollection.zip', 'r') as zipped:
    with zipped.open('SMSSpamCollection', 'r') as fh:
        data = fh.readlines()
results = []
for idx in range(len(data)):

    if re.match(b'ham\t', data[idx]):
        results.append(0)
        data[idx] = data[idx][4:]
    else:
        results.append(1)
        data[idx] = data[idx][5:]


In [13]:
token_pattern = r'(?u)\b[\w.,:()[\w.,:()]+\b'  # let it take weird punctuation

In [14]:
vect = CountVectorizer(token_pattern=token_pattern, analyzer='word')
nb = MultinomialNB()

In [15]:
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, \
                                         results, test_size=.4, random_state=144)

In [16]:
pipe = Pipeline([('vect', vect),('nb', nb)])
analyzer = vect.build_analyzer()
analyzer('he-l_lo I a:m, 4a, 33,5 ju:gger.naut') # testing the token_pattern

['he', 'l_lo', 'i', 'a:m', '4a', '33,5', 'ju:gger.naut']

In [17]:
pipe.fit(X_train, Y_train)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b[\\w.,:()[\\w.,:()]+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [18]:
pipe.score(X_test, Y_test)

0.98116591928251118

In [29]:
predictions = pipe.predict(data)
print('Pipeline results(all data):', sum(predictions), len(predictions) - sum(predictions))
print('Actual values:', sum(results), len(results) - sum(results))
print('Success level: ',100 - sum(abs(results - predictions)) / len(results))

Pipeline results(all data): 715 4859
Actual values: 747 4827
Success level:  99.9881593111


In [30]:
predictions = pipe.predict(X_test)
print('Pipeline results(test values):', sum(predictions), len(predictions) - sum(predictions))
print('Actual values:', sum(Y_test), len(results) - sum(Y_test))
print('Success level: ',100 - sum(abs(Y_test - predictions))  / len(Y_test))

Pipeline results(test values): 266 1964
Actual values: 288 5286
Success level:  99.9811659193
