In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 

# Building a SMS spam detector

In [19]:
# load the dataset
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/sms.tsv.txt'
sms = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

In [20]:
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [33]:
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [21]:
# feature matrix/target vector
X = sms.message
y = sms.label

In [22]:
# train/test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [23]:
# initialize the vectorizer (with default parameters)
vect = CountVectorizer()

In [24]:
# learn training vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [26]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)

In [27]:
# import and initialize a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [28]:
# train the model using X_train_dtm 
nb.fit(X_train_dtm, y_train)

MultinomialNB()

In [30]:
# make class predictions for X_test_dtm
y_test_pred = nb.predict(X_test_dtm)

In [46]:
# evaluate the model
from sklearn.metrics import accuracy_score, confusion_matrix

In [56]:
# accuracy
accuracy_score(list(y_test), y_test_pred)

0.9856424982053122

In [48]:
# confusion matrix
confusion_matrix(y_test, y_test_pred)

array([[1210,    4],
       [  16,  163]], dtype=int64)

In [70]:
# print messages text for the false positives (ham incorrectly classified as spam) 
X_test[(y_test=='ham') & (y_test_pred=='spam')]

574                                Waiting for your call.
5046    We have sent JD for Customer Service cum Accou...
326                      No calls..messages..missed calls
4382               Mathews or tait or edwards or anderson
Name: message, dtype: object

In [71]:
# example of false positive
X_test[574]

'Waiting for your call.'

In [68]:
# print messages text for the false negatives (span incorrectly classified as ham)  
X_test[(y_test=='spam') & (y_test_pred=='ham')]

4514    Money i have won wining number 946 wot do i do...
4821    Check Out Choose Your Babe Videos @ sms.shsex....
5037    You won't believe it but it's true. It's Incre...
2575    Your next amazing xxx PICSFREE1 video will be ...
3419    LIFE has never been this much fun and great un...
3864    Oh my god! I've found your number again! I'm s...
5370    dating:i have had two of these. Only started a...
5449    Latest News! Police station toilet stolen, cop...
3574    You won't believe it but it's true. It's Incre...
4256    Block Breaker now comes in deluxe format with ...
2430    Guess who am I?This is the first time I create...
2774    How come it takes so little time for a child w...
2558    This message is brought to you by GMW Ltd. and...
1940    More people are dogging in your area now. Call...
4213    Missed call alert. These numbers called but le...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
Name: message, dtype: object

In [75]:
# example of false negatives
X_test[2430]

"Guess who am I?This is the first time I created a web page WWW.ASJESUS.COM read all I wrote. I'm waiting for your opinions. I want to be your friend 1/1"

## Compare Naive Bayes to Logistic Regression

In [76]:
from sklearn.linear_model import LogisticRegression

In [85]:
log_clf = LogisticRegression()

In [86]:
log_clf.fit(X_train_dtm,y_train)

LogisticRegression()

In [87]:
y_test_pred = log_clf.predict(X_test_dtm)

In [88]:
accuracy_score(y_test,y_test_pred)

0.9770279971284996

In [89]:
confusion_matrix(y_test,y_test_pred)

array([[1212,    2],
       [  30,  149]], dtype=int64)