In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 

# Building a SMS spam detector

In [2]:
# load the dataset
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/sms.tsv.txt'
sms = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

In [3]:
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# spam example
print(sms[sms.label=='spam'].message.iloc[100])

To review and KEEP the fantastic Nokia N-Gage game deck with Club Nokia, go 2 www.cnupdates.com/newsletter. unsubscribe from alerts reply with the word OUT


In [5]:
# ham example
print(sms[sms.label=='ham'].message.iloc[100])

Hmm...my uncle just informed me that he's paying the school directly. So pls buy food.


In [6]:
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
# feature matrix/target vector
X = sms.message
y = sms.label

In [8]:
# train/test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [9]:
# initialize the vectorizer (with default parameters)
vect = CountVectorizer()

In [10]:
# learn training vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [11]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)

In [12]:
# import and initialize a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [13]:
# train the model using X_train_dtm 
nb.fit(X_train_dtm, y_train)

MultinomialNB()

In [14]:
# make class predictions for X_test_dtm
y_test_pred = nb.predict(X_test_dtm)

In [15]:
# evaluate the model
from sklearn.metrics import accuracy_score, confusion_matrix

In [16]:
# accuracy
accuracy_score(list(y_test), y_test_pred)

0.9806173725771715

In [17]:
# confusion matrix
confusion_matrix(y_test, y_test_pred)

array([[1201,    9],
       [  18,  165]], dtype=int64)

In [18]:
# print messages text for the false positives (ham incorrectly classified as spam) 
X_test[(y_test=='ham') & (y_test_pred=='spam')]

4557                              Gettin rdy to ship comp
574                                Waiting for your call.
45                       No calls..messages..missed calls
228                        Hey company elama po mudyadhu.
4702                               I liked the new mobile
1988                     No calls..messages..missed calls
3375                              Also andros ice etc etc
4703                                           Anytime...
5046    We have sent JD for Customer Service cum Accou...
Name: message, dtype: object

In [34]:
# example of false positive
X_test[4557]

'Gettin rdy to ship comp'

In [20]:
# print messages text for the false negatives (span incorrectly classified as ham)  
X_test[(y_test=='spam') & (y_test_pred=='ham')]

4144    In The Simpsons Movie released in July 2007 na...
1469    Hi its LUCY Hubby at meetins all day Fri & I w...
4213    Missed call alert. These numbers called but le...
2352    Download as many ringtones as u like no restri...
1875    Would you like to see my XXX pics they are so ...
1500    SMS. ac JSco: Energy is high, but u may not kn...
4968    You can donate £2.50 to UNICEF's Asian Tsunami...
684     Hi I'm sue. I am 20 years old and work as a la...
1638    0A$NETWORKS allow companies to bill for SMS, s...
2248    Back 2 work 2morro half term over! Can U C me ...
3742                                        2/2 146tf150p
4256    Block Breaker now comes in deluxe format with ...
4069    TBS/PERSOLVO. been chasing us since Sept for£3...
5449    Latest News! Police station toilet stolen, cop...
3132    LookAtMe!: Thanks for your purchase of a video...
1458    CLAIRE here am havin borin time & am now alone...
4676    Hi babe its Chloe, how r u? I was smashed on s...
2575    Your n

In [35]:
# example of false negatives
X_test[4256]

'Block Breaker now comes in deluxe format with new features and great graphics from T-Mobile. Buy for just £5 by replying GET BBDELUXE and take the challenge'

## Compare Naive Bayes to Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
log_clf = LogisticRegression()

In [29]:
log_clf.fit(X_train_dtm,y_train)

LogisticRegression()

In [30]:
y_test_pred = log_clf.predict(X_test_dtm)

In [31]:
accuracy_score(y_test,y_test_pred)

0.9791816223977028

In [32]:
confusion_matrix(y_test,y_test_pred)

array([[1208,    2],
       [  27,  156]], dtype=int64)