##Goal:  Train a Naive Bayes model to classify future SMS messages as either spam or ham.

Steps:

1.  Convert the words ham and spam to a binary indicator variable (0/1)

2.  Convert the txt to a sparse matrix of TF-IDF vectors

3.  Fit a Naive Bayes Classifier

4.  Measure your success using roc_auc_score



In [65]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn import naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

try:
    # scikit-learn >= 0.18; sklearn.cross_validation was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [66]:
##Load the tab-separated SMS corpus. Column names are supplied here, so the
##first line of the file is read as data: 'spam' holds the label, 'txt' the message.
df = pd.read_csv(
    "SMSSpamCollection",
    sep='\t',
    header=None,
    names=['spam', 'txt'],
)

In [67]:
##Preview the first five rows to confirm the label and text columns parsed as expected
df.head(n=5)

Unnamed: 0,spam,txt
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [68]:
##Train a classifier to predict whether a message is spam or ham based on its text

In [69]:
#TFIDF Vectorizer
##Build the TF-IDF vectorizer used to turn message text into features,
##dropping NLTK's English stop words.
stopset = set(stopwords.words('english'))
##Pass the stop words as a sorted list rather than the set itself: newer
##scikit-learn releases validate stop_words as 'english', a list, or None and
##reject a set, while older releases accept a list too. Sorting only makes the
##order deterministic.
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=sorted(stopset))

In [70]:
##Convert the labels to numeric values (spam -> 1, ham -> 0) so that spam is the positive class when scoring predicted probabilities

In [71]:
##Encode the 'spam' label as 1.
##Assign the result back to the column instead of calling
##replace(..., inplace=True) on df.spam: an in-place call on an attribute-accessed
##column is chained assignment and does not update df under pandas Copy-on-Write.
df['spam'] = df['spam'].replace('spam', 1)

In [72]:
##Encode the 'ham' label as 0.
##Assign the result back to the column instead of calling
##replace(..., inplace=True) on df.spam: an in-place call on an attribute-accessed
##column is chained assignment and does not update df under pandas Copy-on-Write.
df['spam'] = df['spam'].replace('ham', 0)

In [73]:
##Preview the first five rows again to confirm the labels are now 0/1
df.head(n=5)

Unnamed: 0,spam,txt
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [74]:
##Show the frame's dimensions as (rows, columns)
df.shape

(5572, 2)

In [75]:
##Target vector: the binary spam indicator column (1 = spam, 0 = ham)
y = df['spam']

In [76]:
##Learn the vocabulary and IDF weights from the message text and build the
##TF-IDF feature matrix in one step.
##NOTE(review): the vectorizer is fit on the full corpus, before the train/test
##split further down, so the IDF weights also see the held-out messages.
X = vectorizer.fit_transform(df['txt'])

In [77]:
##Confirm the target and the feature matrix have the same number of rows.
##The call form of print works on both Python 2 and Python 3; the statement
##form is a syntax error on Python 3.
print(y.shape)
print(X.shape)

(5572L,)
(5572, 8605)


In [78]:
##Hold out a test set; random_state pins the shuffle so the split is reproducible
split = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split

In [79]:
##Fit a multinomial Naive Bayes classifier on the training split.
##Fitting is fast (a single pass over the training data).
##The TF-IDF matrix is sparse: each message contains only a few of the 8605
##vocabulary terms, which the classifier handles without issue.
clf = naive_bayes.MultinomialNB()
clf.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [80]:
##Display the held-out labels (index = original row number, value = 0/1 spam flag)
y_test

3245    0
944     0
1044    0
2484    0
812     0
2973    0
2991    0
2942    0
230     0
1181    0
1912    0
1992    0
5435    0
4805    0
401     1
1859    0
1344    0
2952    0
501     0
3337    0
1945    0
3142    0
2422    0
381     0
5567    1
4937    0
79      0
5240    0
2554    0
5345    0
       ..
3007    0
5080    0
257     0
507     0
3105    0
99      0
2940    0
2004    0
5398    0
4519    0
2209    1
1740    0
2722    0
4127    1
4028    0
4562    0
366     0
5065    0
5419    0
4318    0
3132    1
949     0
2450    0
19      1
1971    0
668     0
218     0
5536    0
1657    0
3875    0
Name: spam, dtype: int64

In [81]:
##Check for null values in spam

In [82]:
##Select any rows whose label is missing; an empty result means every row is labelled
missing_label = df['spam'].isnull()
df[missing_label]

Unnamed: 0,spam,txt


In [83]:
##There are no null values

In [84]:
##Predicted probabilities for the test messages, taking the second column of
##predict_proba (presumably the probability of class 1, i.e. spam -- confirm via clf.classes_)
clf.predict_proba(X_test)[:,1]

array([ 0.05351863,  0.04740787,  0.01124889, ...,  0.002405  ,
        0.01745264,  0.08290319])

In [85]:
##Score the model with ROC AUC on the held-out test set. This measures how well
##the predicted probabilities rank spam above ham; it is not classification accuracy.
spam_scores = clf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, spam_scores)

0.98558587451336732

In [86]:
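#With this model, the ROC AUC on the held-out test set is ~0.9856 (this measures how well spam is ranked above ham, not the fraction of messages classified correctly)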