In [18]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

## Read Data In

In [19]:
data_file = "smsspamcollection/SMSSpamCollection"

In [23]:
texts = pd.read_csv(data_file, names=["spam", "text"], sep="\t") # read the data file

In [24]:
texts.head() # looks correct

Unnamed: 0,spam,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
#texts.loc[texts["spam"] == "spam", "spam"] = 1 # these would be necessary if not using CountVectorizer

In [13]:
#texts.loc[texts["spam"] == "ham", "spam"] = 0

In [25]:
#texts.head()

##Create the Spam Predictor

In [26]:
spam_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('bayes', MultinomialNB())])
spam_pipe

Pipeline(steps=[('bag_of_words', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
     ...ear_tf=False, use_idf=True)), ('bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [44]:
len(texts)

5572

In [27]:
txt_array = np.asarray(texts)  # dividing up data into training and testing sets
a = txt_array[:,:-1]
b = txt_array[:,-1]
x_train, x_test, y_train, y_test = train_test_split(a, b, test_size=0.4) # I might have had the x & y backwards

In [28]:
len(y_test) # looks like the number of records we should expect in the test set

2229

In [38]:
np.ravel(x_test) # this is needed to make the spam scores into a row array

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

In [40]:
spam_pipe.fit(y_train, np.ravel(x_train))

Pipeline(steps=[('bag_of_words', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
     ...ear_tf=False, use_idf=True)), ('bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

##Test Effectivness of the Spam Filter

In [42]:
spam_pipe.score(y_train, np.ravel(x_train))

0.96739455578821421

In [43]:
spam_pipe.score(y_test, np.ravel(x_test))

0.9546882009869897

The combination of tools used in this assigment were surprisingly effective at identifying spam.

In [45]:
spam_pipe.predict(["hi mom!"]) # it seems to let this one pass

array(['ham'], 
      dtype='<U4')

In [48]:
spam_pipe.predict(["URGENT! King of Nigeria NEEDS your help for prescription pills"]) # didn't get this one

array(['ham'], 
      dtype='<U4')

In [71]:
spam_texts =   texts[texts["spam"] == "spam"]["text"]
normal_texts = texts[texts["spam"] == "ham"]["text"]

In [59]:
len(spam_texts)

747

In [68]:
pd.DataFrame(spam_pipe.predict(spam_texts))[0].value_counts()

spam    537
ham     210
dtype: int64

This isn't quite as good as I expected, particularly for the 0.95 correlation coefficient.

In [72]:
pd.DataFrame(spam_pipe.predict(normal_texts))[0].value_counts()

ham    4825
dtype: int64

This is an interesting component to the picture. We see that not a single normal text was put into the spam folder. This might be useful for practical purposes, erroring on the side of classifying texts as non-spam. This might also be the main factor that accounts for the high R^2 correlation scores.