In [29]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

try:
    # Current home of train_test_split (scikit-learn 0.18 and later).
    from sklearn.model_selection import train_test_split
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; keep it only
    # as a fallback for very old installations.
    from sklearn.cross_validation import train_test_split

In [38]:
# The file is read as tab-separated with no header row, so column names are
# supplied explicitly.
header_names = ['Spam', 'Message']
info = pd.read_csv(
    "smsspamcollection/SMSSpamCollection",
    sep='\t',
    header=None,
    names=header_names,
)

In [39]:
# Peek at the first rows to check that the label and text columns loaded.
info.head()

Unnamed: 0,Spam,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [40]:
# Independent (deep) copy of the loaded frame, so later cells can reshape X
# without touching `info`.
X = info.copy(deep=True)

In [41]:
# Derive labels and features from `info` rather than popping from X in place:
# X.pop('Spam') raises KeyError if this cell is run a second time, because the
# column has already been removed from X.
y = info['Spam'].copy()
X = info.drop('Spam', axis=1)

In [53]:
# Hold out 33% of the messages for evaluation. Pinning random_state makes the
# shuffle, and therefore every score and prediction computed from these splits,
# repeatable across kernel restarts.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [57]:
# Show a few held-out rows; only the 'Message' column is left in the features.
test_X.head()

Unnamed: 0,Message
2428,Do you think i can move &lt;#&gt; in a week
426,Ok. She'll be ok. I guess
277,Tell rob to mack his gf in the theater
3514,Staff of placement training in Amrita college.
2459,"Cool, I'll text you when I'm on the way"


In [58]:
# Learn the bag-of-words vocabulary from the training messages only.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line keeps the estimator repr as the cell output.
vectorizer = CountVectorizer().fit(train_X['Message'])
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [59]:
# Encode both splits with the already-fitted vectorizer (training split first,
# then the held-out split); labels are carried over unchanged.
X_train, X_test = (
    vectorizer.transform(split['Message']) for split in (train_X, test_X)
)
y_train, y_test = train_y, test_y

In [101]:
# Display the vectorized training matrix; its repr reports shape and storage format.
X_train

<3733x6996 sparse matrix of type '<class 'numpy.int64'>'
	with 49381 stored elements in Compressed Sparse Row format>

In [70]:
# Inspect the last ten labels of the held-out split.
y_test.tail(10)

5459     ham
3126    spam
1619     ham
5052     ham
937      ham
1262     ham
582      ham
3270     ham
2587     ham
3812     ham
Name: Spam, dtype: object

In [63]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
# list() keeps the displayed value a plain list either way.
list(
    (vectorizer.get_feature_names_out()
     if hasattr(vectorizer, 'get_feature_names_out')
     else vectorizer.get_feature_names())[2000:2005]
)

['degree', 'degrees', 'dehydration', 'del', 'delay']

In [64]:
# Train a multinomial naive Bayes model on the token-count matrix.
# fit() returns the estimator, so it is chained onto the constructor; the bare
# name on the last line keeps the estimator repr as the cell output.
classifier = MultinomialNB().fit(X_train, y_train)
classifier

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [65]:
# Accuracy on the same data the classifier was fitted on.
classifier.score(X_train, y_train)

0.994910259844629

In [66]:
# Accuracy on the held-out split.
classifier.score(X_test, y_test)

0.98368678629690054

In [120]:
# Predict the label of the first held-out message (a single-row slice of X_test).
classifier.predict(X_test[0])

array(['ham'], 
      dtype='<U4')

### With Pipeline

In [121]:
# Chain vectorization and classification so raw text can be passed straight to
# fit / predict / score.
spam_pipe = Pipeline(steps=[
    ('bag_of_words', CountVectorizer()),
    ('bayes', MultinomialNB()),
])

In [122]:
# The pipeline vectorizes text itself, so it is given the raw message strings
# rather than a count matrix.
trainX = train_X['Message']

In [123]:
# Fit both pipeline steps (vectorizer, then classifier) on the raw training messages.
spam_pipe.fit(trainX, train_y)

Pipeline(steps=[('bag_of_words', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [124]:
# NOTE(review): this is accuracy on the training split only. A held-out figure
# for the pipeline would be spam_pipe.score(test_X['Message'], test_y).
spam_pipe.score(trainX, train_y)

0.994910259844629

In [125]:
# Classify an ad-hoc message; the pipeline takes a list of raw strings.
spam_pipe.predict(['Hello there'])

array(['ham'], 
      dtype='<U4')

In [126]:
# Try a second ad-hoc message made of promotional-sounding words.
spam_pipe.predict(['money free winner'])

array(['spam'], 
      dtype='<U4')