In [1]:
import pandas as pd

# read the tab-separated SMS file into pandas from an absolute local path
# (raw string, so the Windows backslashes are never parsed as escape sequences)
path = r"D:\ML Internship\sms.tsv"
# the file has no header row, so the two column names are supplied explicitly
sms = pd.read_table(path, header=None, names=['label', 'message'])

In [2]:
sms.shape

(5572, 2)

In [3]:
sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
# examine the class distribution
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [5]:
# encode the string labels as integers (ham -> 0, spam -> 1), because some
# classification algorithms expect numeric class values
label_codes = {'ham': 0, 'spam': 1}
sms['label'] = sms['label'].map(label_codes)

In [6]:
# check that the conversion worked
sms.head(10)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [7]:
# Some preprocessing needs to be done before we start extracting features.

# Lower-case every message. Splitting on whitespace and re-joining with a
# single space also collapses any run of whitespace inside a message.
def _to_lowercase(text):
    """Return `text` with each whitespace-separated token lower-cased and re-joined by single spaces."""
    return " ".join(token.lower() for token in text.split())

sms['message'] = sms['message'].apply(_to_lowercase)

In [8]:
sms.head()

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


In [9]:
# Removing Punctuation

# Delete every character that is neither a word character nor whitespace.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
# The raw string keeps \w and \s from being read as (invalid) string escapes.
sms['message'] = sms['message'].str.replace(r'[^\w\s]', '', regex=True)
sms['message'].head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: message, dtype: object

In [10]:
import nltk

In [11]:
from nltk.corpus import stopwords

In [12]:
# Removing stopwords

stop = stopwords.words('english')
# Membership is tested once per token of every message, so look words up in a
# set (constant time) rather than scanning the list each time.
stop_lookup = set(stop)
# NOTE(review): punctuation has already been stripped at this point, so
# contractions such as "dont" are kept (see the displayed output) -- confirm
# that this is intended.
sms['message'] = sms['message'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)
sms['message'].head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                  u dun say early hor u c already say
4          nah dont think goes usf lives around though
Name: message, dtype: object

In [13]:
# For correcting spelling mistakes

from textblob import TextBlob

def _correct_spelling(text):
    """Return `text` after TextBlob's spelling correction, converted back to a plain string."""
    return str(TextBlob(text).correct())

# The corrected Series is only displayed here; it is not assigned back to sms['message'].
sms['message'].apply(_correct_spelling)

0       go during point crazy available boris n great ...
1                                   ok war joking if u on
2       free entry 2 wily come win a cup final this mu...
3                     u dun say early for u c already say
4               ah dont think goes us lives around though
5       freemen hey darling 3 weeks word back id like ...
6          even brother like speak treat like aids patent
7       per request selle selle or minnaminunginte nur...
8       winner valued network customer selected receiv...
9       mobile 11 months u r entitled update latest co...
10      in donna home soon dont want talk stuff anymor...
11      six chances win cash 100 20000 pounds txt csh1...
12      urgent 1 week free membership 100000 prize jac...
13      give searching right words thank breathe promi...
14                                            date sunday
15      xxxmobilemovieclub use credit click was link n...
16                                        oh him watching
17      eh u r

In [13]:
# Stemming
from nltk.stem import PorterStemmer
st = PorterStemmer()

def _stem_message(text):
    """Return `text` with every whitespace-separated word replaced by its Porter stem."""
    stemmed_words = [st.stem(word) for word in text.split()]
    return " ".join(stemmed_words)

# The stemmed Series is only displayed here; sms['message'] itself is left unchanged.
sms['message'].apply(_stem_message)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri 2 wkli comp win fa cup final tkt 21...
3                     u dun say earli hor u c alreadi say
4               nah dont think goe usf live around though
5       freemsg hey darl 3 week word back id like fun ...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea 900...
9       mobil 11 month u r entitl updat latest colour ...
10      im gonna home soon dont want talk stuff anymor...
11      six chanc win cash 100 20000 pound txt csh11 s...
12      urgent 1 week free membership 100000 prize jac...
13      ive search right word thank breather promis wo...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                           oh kim watch
17       eh u 

In [15]:
# Rare words removal

# Build one flat list of every token in the corpus, count the occurrences of
# each distinct token, and keep the last 10 rows of the frequency table.
all_tokens = ' '.join(sms['message']).split()
token_counts = pd.Series(all_tokens).value_counts()
freq = token_counts.tail(10)
freq

weathers       1
updat          1
isare          1
honesty        1
7ws            1
canlove        1
stoners        1
senor          1
elaborating    1
barcelona      1
dtype: int64

In [16]:
# NOTE: `freq` is rebound from a Series to a plain list of words here, so this
# cell can only be re-run after the cell that builds the frequency table.
freq = list(freq.index)

def _drop_rare_words(text):
    """Return `text` without any word that is listed in `freq`."""
    kept_words = [word for word in text.split() if word not in freq]
    return " ".join(kept_words)

sms['message'] = sms['message'].apply(_drop_rare_words)
sms['message'].head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                  u dun say early hor u c already say
4          nah dont think goes usf lives around though
Name: message, dtype: object

In [17]:
# features: the message text; target: the label column
X = sms['message']
y = sms['label']

In [18]:
# split X and y into training and testing sets
# (no test_size is passed, so scikit-learn's default split proportion applies;
# random_state=1 makes the split reproducible)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# show the shape of each of the four pieces, in the order they were returned
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
# word-level TF-IDF settings; min_df and max_features bound the vocabulary size
tfidf_settings = {'analyzer': 'word', 'min_df': 5, 'max_features': 1000}
tfidf = TfidfVectorizer(**tfidf_settings)
# fit the vectoriser on the training messages only, and vectorise them in the same step
train_vect = tfidf.fit_transform(X_train)

In [57]:
train_vect

<4179x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 23833 stored elements in Compressed Sparse Row format>

In [58]:
test_vect = tfidf.transform(X_test)

In [59]:
test_vect

<1393x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 7923 stored elements in Compressed Sparse Row format>

In [60]:
# The text has now been processed into numeric features that classification or regression algorithms can consume

# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [61]:
# train the model using train_vect (timing it with an IPython "magic command")
%time nb.fit(train_vect, y_train)

Wall time: 1.99 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [62]:
# make class predictions for test_vect
y_pred_class = nb.predict(test_vect)

In [63]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.9813352476669059

In [64]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[1208,    0],
       [  26,  159]], dtype=int64)

In [65]:
# print message text for the false positives (ham incorrectly classified as spam)
is_false_positive = (y_pred_class == 1) & (y_test == 0)
X_test[is_false_positive]
# an equivalent, shorter form is X_test[y_pred_class > y_test]

# There are no false positives in this case.

Series([], Name: message, dtype: object)

In [66]:
# print message text for the false negatives (spam incorrectly classified as ham)
is_false_negative = (y_pred_class == 0) & (y_test == 1)
X_test[is_false_negative]
# an equivalent, shorter form is:
#X_test[y_pred_class < y_test]

1217              1 new voicemail please call 08719181513
3132    lookatme thanks purchase video clip lookatme y...
2295                1 new message please call 08718738034
420     send logo 2 ur lover 2 names joined heart txt ...
5110                1 new message please call 08715205273
4965    dear voucher holder next meal us use following...
943     getting touch folks waiting company txt back n...
3530    xmas new years eve tickets sale club day 10am ...
684     hi im sue 20 years old work lapdancer love sex...
4073    loans purpose even bad credit tenants welcome ...
1875         would like see xxx pics hot nearly banned uk
1328    ur balance 500 ur next question sang uptown gi...
3755    bloomberg message center 447797706009 wait app...
1893    call 09090900040 listen extreme dirty live cha...
4298    thesmszonecom lets send free anonymous masked ...
4949    hi amy sending free phone number couple days g...
1172    got takes 2 take part wrc rally oz u lucozade ...
761     romant

In [67]:
# Comparing models
#We will compare multinomial Naive Bayes with logistic regression:

# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [68]:
# train the model using train_vect (timing it with an IPython "magic command")
%time logreg.fit(train_vect, y_train)

Wall time: 9.97 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [69]:
# make class predictions for test_vect
y_pred_class = logreg.predict(test_vect)

In [70]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.9676956209619526

In [71]:
# Examining a model for further insight
# We will examine our trained Naive Bayes model to calculate the approximate "spamminess" of each token.

In [72]:
# store the vocabulary learned from X_train as a list of tokens
# get_feature_names() was removed in scikit-learn 1.2; use its replacement,
# get_feature_names_out(), when available and fall back on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    # the newer method returns a NumPy array; convert it so the result stays a plain list
    X_train_tokens = tfidf.get_feature_names_out().tolist()
else:
    X_train_tokens = tfidf.get_feature_names()
len(X_train_tokens)

1000

In [73]:
# examine the first 50 tokens
print(X_train_tokens[0:50])

['0800', '08000839402', '08000930705', '08712460324', '10', '100', '1000', '10000', '10p', '10pmin', '11mths', '12', '12hrs', '150', '150p', '150pmsg', '150ppm', '16', '18', '1st', '200', '2000', '2003', '250', '2day', '2nd', '2nite', '300', '3030', '350', '4u', '500', '5000', '750', '800', '8007', '86688', '87066', '900', 'abiola', 'able', 'abt', 'accept', 'account', 'across', 'actually', 'address', 'admirer', 'aft', 'afternoon']


In [74]:
# examine the last 50 tokens
print(X_train_tokens[-50:])

['wil', 'win', 'wine', 'winner', 'wish', 'wishes', 'wit', 'within', 'without', 'wk', 'wkly', 'woke', 'wonder', 'wonderful', 'wont', 'word', 'words', 'work', 'working', 'world', 'worried', 'worry', 'worth', 'wot', 'would', 'wow', 'write', 'wrong', 'wwwgetzedcouk', 'xmas', 'xx', 'xxx', 'xy', 'ya', 'yar', 'yeah', 'year', 'years', 'yep', 'yes', 'yest', 'yesterday', 'yet', 'yo', 'youd', 'youll', 'youre', 'youve', 'yr', 'yup']


In [75]:
# Naive Bayes accumulates each feature's total per class; the features here are TF-IDF weights, so these are summed weights rather than integer counts
nb.feature_count_

array([[ 0.        ,  0.        ,  0.        , ...,  3.83878372,
         0.72139445, 16.87361079],
       [ 4.13269036,  2.96920949,  3.21125201, ...,  1.18531009,
         2.93756384,  0.        ]])

In [76]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape

(2, 1000)

In [77]:
# total TF-IDF weight of each token across all HAM training messages
ham_token_count = nb.feature_count_[0, :]
ham_token_count

array([ 0.        ,  0.        ,  0.        ,  0.        ,  3.00068097,
        0.63882256,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.69925916,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.708755  ,  0.        ,  3.09129975,
        0.        ,  0.        ,  0.        ,  0.        ,  1.36838677,
        3.28540141,  2.31926667,  0.        ,  0.        ,  0.        ,
        1.91153396,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.29387818,  3.88428253,
        7.21166771, 10.5756323 ,  2.85656087,  4.54525695,  2.57928739,
        8.64140604,  6.99784206,  0.        ,  5.04847181,  8.05747954,
        1.69102484,  3.7873289 ,  9.10659826,  4.69883925, 14.43026468,
        4.00866309,  3.18426299,  3.80280439,  4.46541659, 23.17180929,
        8.36218177, 22.77160898, 15.17921022, 14.86709023,  3.19614852,
       10.89662847,  1.11058738,  4.2728777 ,  3.92547729, 25.84

In [78]:
# total TF-IDF weight of each token across all SPAM training messages
spam_token_count = nb.feature_count_[1, :]
spam_token_count

array([ 4.13269036,  2.96920949,  3.21125201,  2.53043317,  2.49184802,
        7.74944611,  9.22453557,  2.24867661,  4.16670263,  2.8381051 ,
        2.10974818,  3.33068381,  4.68884643,  7.42250148,  5.44712025,
        3.27239057,  8.08327311,  9.1214979 ,  8.90047905,  5.49214859,
        2.82037243,  8.29462221,  3.63001037,  5.76998147,  1.35961999,
        3.95706367,  1.43303527,  2.15751601,  2.61382896,  3.36734587,
        1.28081428,  7.7794004 ,  5.55755082,  4.2772141 ,  6.06138376,
        5.35071799,  4.20837176,  2.77176492,  1.63620984,  0.        ,
        0.        ,  0.        ,  0.        ,  4.19240013,  0.        ,
        0.        ,  1.02794737,  3.14612993,  0.        ,  0.        ,
        2.58440563,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.04219569,  0.        ,
        0.        ,  0.73880774,  0.        ,  0.        ,  0.        ,
        0.        ,  1.95705487,  1.69695436,  0.35945905,  0.  

In [79]:
# create a DataFrame, indexed by token, holding the separate ham and spam totals
token_columns = {'token': X_train_tokens, 'ham': ham_token_count, 'spam': spam_token_count}
tokens = pd.DataFrame(token_columns).set_index('token')
# fixed random_state so the same 5 rows are shown on every run
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
player,1.0,3.180541
smth,4.389889,0.0
voucher,0.321617,4.313726
crave,3.304882,0.0
nt,6.311213,0.252694


In [80]:
# Naive Bayes counts the number of observations in each class
nb.class_count_

array([3617.,  562.])

In [81]:
# add 1 to the ham and spam totals so that no token has a zero value, which
# avoids mathematical errors such as dividing by zero
# NOTE: the columns are overwritten in place, so re-running this cell adds 1 again
for column in ('ham', 'spam'):
    tokens[column] = tokens[column] + 1
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
player,2.0,4.180541
smth,5.389889,1.0
voucher,1.321617,5.313726
crave,4.304882,1.0
nt,7.311213,1.252694


In [82]:
# convert the ham and spam totals into frequencies by dividing each column by
# the number of training observations in its class
ham_observations = nb.class_count_[0]
spam_observations = nb.class_count_[1]
tokens['ham'] = tokens['ham'] / ham_observations
tokens['spam'] = tokens['spam'] / spam_observations
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
player,0.000553,0.007439
smth,0.00149,0.001779
voucher,0.000365,0.009455
crave,0.00119,0.001779
nt,0.002021,0.002229


In [83]:
# calculate the spam-to-ham ratio for each token (values above 1 lean towards spam)
spam_frequency = tokens['spam']
ham_frequency = tokens['ham']
tokens['spam_ratio'] = spam_frequency / ham_frequency
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
player,0.000553,0.007439,13.452863
smth,0.00149,0.001779,1.194077
voucher,0.000365,0.009455,25.876517
crave,0.00119,0.001779,1.495033
nt,0.002021,0.002229,1.102726


In [84]:
# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('spam_ratio', ascending=False)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
claim,0.000276,0.039144,141.584108
prize,0.000276,0.036308,131.326071
guaranteed,0.000276,0.022066,79.814504
tone,0.000276,0.020725,74.962220
1000,0.000276,0.018193,65.804529
urgent,0.000377,0.024760,65.755513
awarded,0.000276,0.017740,64.165923
18,0.000276,0.017617,63.718919
2000,0.000276,0.016538,59.819659
150ppm,0.000276,0.016162,58.459429


In [85]:
# look up the spam_ratio for a given token
tokens.loc['prize', 'spam_ratio']

131.32607149588594

In [88]:
# look up the spam_ratio for a given token
tokens.loc['later', 'spam_ratio']

0.14353410235210196