In [13]:
import pandas as pd

# read file into pandas using a relative path
path = "D:\ML Internship\sms.tsv"
sms = pd.read_table(path, header=None, names=['label', 'message'])

In [14]:
sms.shape

(5572, 2)

In [15]:
sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [16]:
# examine the class distribution
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [17]:
# convert label to a numerical variable because some classification algorithms want their classes to be of numeric values as well.
sms['label'] = sms.label.map({'ham':0, 'spam':1})

In [18]:
# check that the conversion worked
sms.head(10)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [29]:
#Some preprocessing needs to be done before we start extracting features.

# Making everything lower case 
sms['message'] = sms['message'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [30]:
sms.head()

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


In [31]:
# Removing Punctuation

sms['message'] = sms['message'].str.replace('[^\w\s]','')
sms['message'].head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: message, dtype: object

In [33]:
# Removal of Stop Words

stop = set([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fify", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"])

sms['message'] = sms['message'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
sms['message'].head()

0    jurong point crazy available bugis n great wor...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                          u dun say early hor u c say
4                        nah dont think goes usf lives
Name: message, dtype: object

In [38]:
# how to define X and y (from the SMS data) for use with TF-IDFVectorizer
X = sms.message
y = sms.label
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [39]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000)
train_vect = tfidf.fit_transform(X_train)

train_vect

<4179x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 21189 stored elements in Compressed Sparse Row format>

In [47]:
print(train_vect)

  (0, 373)	0.25020960544618226
  (0, 665)	0.2695693380407228
  (0, 611)	0.26567277329291067
  (0, 483)	0.24648267528252585
  (0, 699)	0.3013485003213593
  (0, 460)	0.2604213876908336
  (0, 144)	0.2588048684308026
  (0, 634)	0.293539180057629
  (0, 317)	0.3625461439462388
  (0, 633)	0.20315310067107908
  (0, 11)	0.31090633587104227
  (0, 893)	0.2839813445079461
  (0, 818)	0.2675767612595955
  (1, 228)	1.0
  (2, 403)	0.43834297968522384
  (2, 577)	0.4093640668127523
  (2, 825)	0.3856928503254032
  (2, 796)	0.4067671147588081
  (2, 892)	0.5710149144478316
  (3, 483)	0.2622839095204448
  (3, 825)	0.20411230960428572
  (3, 796)	0.21526500995401895
  (3, 386)	0.3123570598002142
  (3, 157)	0.2598196042111001
  (3, 667)	0.2992449293468607
  :	:
  (4174, 968)	0.7771364514694908
  (4175, 643)	0.3516168032267615
  (4175, 153)	0.3772610513020591
  (4175, 597)	0.5599213515780264
  (4175, 477)	0.34308839313340744
  (4175, 225)	0.3589622780836287
  (4175, 568)	0.4170902064496864
  (4176, 574)	0.55601

In [48]:
test_vect = tfidf.fit_transform(X_test)

test_vect

<1393x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 7525 stored elements in Compressed Sparse Row format>

In [49]:
print(test_vect)

  (0, 678)	1.0
  (1, 991)	0.39243275625415797
  (1, 681)	0.4796340307539851
  (1, 348)	0.3612353857446583
  (1, 501)	0.4051597970741605
  (1, 546)	0.5668353052538123
  (3, 375)	1.0
  (4, 317)	0.27284344432422053
  (4, 69)	0.23910483730031493
  (4, 797)	0.5823024247271341
  (4, 854)	0.2841909219885142
  (4, 138)	0.2841909219885142
  (4, 205)	0.2249424107031195
  (4, 677)	0.3094589804029136
  (4, 453)	0.2781616442202074
  (4, 932)	0.20017328014646027
  (4, 774)	0.32244854854627325
  (5, 614)	0.33663743673746244
  (5, 860)	0.48187871585210523
  (5, 352)	0.36238368597491805
  (5, 930)	0.4527143672875936
  (5, 174)	0.359099664828686
  (5, 936)	0.43502107524361167
  (6, 932)	0.4657330018134129
  (6, 892)	0.6965603537726783
  :	:
  (1390, 604)	0.11115140797720427
  (1390, 847)	0.10358366160023497
  (1390, 853)	0.11707060859101708
  (1390, 747)	0.1212873227046971
  (1390, 329)	0.15851023632934425
  (1390, 484)	0.13731835902602335
  (1390, 542)	0.9510614179760655
  (1391, 548)	0.234249605307061

In [50]:
# Now we have processed the text to make it useful for running further classification or Regression algorithms

# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [54]:
# train the model using train_vect (timing it with an IPython "magic command")
%time nb.fit(train_vect, y_train)

Wall time: 3.97 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [55]:
# make class predictions for test_vect
y_pred_class = nb.predict(test_vect)

In [56]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.8657573582196698

In [57]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[1127,   81],
       [ 106,   79]], dtype=int64)

In [58]:
# print message text for the false positives (ham incorrectly classified as spam)
X_test[(y_pred_class==1) & (y_test==0)]
#or more elegant way is to use X_test[y_pred_class > y_test]

4674    forgot 2 ask ü smth theres card da present lei...
3224                                           pain catch
3214                                         whats ur pin
1272    havent collected dough pls let know place sent...
3242           ok ive sent u da latest version da project
2849    sad story man week bday wife didnt wish parent...
2262                           did wot did say u c 4 dust
407     slightly disastrous class pm fav darlings hope...
1074    lul im gettin juicy gossip hospital nurses tal...
4804                                          plan manage
2198                                            bring got
4950                                      bus way calicut
4201                                     come tomorrow di
1016    dearregret cudnt pick calldrove frm ctla cochi...
5490                                               k sent
996                                    change e escalator
4419                                                 free
4511          

In [59]:
# print message text for the false negatives (spam incorrectly classified as ham)
X_test[(y_pred_class==0) & (y_test==1)]
#or more elegant way is to use 
#X_test[y_pred_class < y_test]

147     freemsg havent replied text im randy sexy fema...
4517    congrats 2 mobile 3g videophones r 09061744553...
3316    free message activate 500 free text messages r...
1745    conacted dating service entered phone fancy yo...
1064    new local dates area lots new people registere...
1687    todays vodafone numbers ending 0089my digits s...
3642    stop club tones replying stop mix mytonecomenj...
4460    welcome ukmobiledate msg free giving free call...
2680    new tones week include 1mcflyall ab 2 sara jor...
1217                          1 new voicemail 08719181513
3766    u know asked dating service 2 contact guess 09...
763     urgent ur 500 guaranteed award unclaimed 09066...
3780    claim 200 shopping spree just 08717895698 won ...
881     reminder downloaded content paid goto httpdoit...
4376    ur tonexs subscription renewed charged 450 cho...
3132    lookatme thanks purchase video clip lookatme y...
3819    74355 xmas iscoming ur awarded 500 cd gift vou...
2954    urgent

In [61]:
# Comparing models
#We will compare multinomial Naive Bayes with logistic regression:

# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [62]:
# train the model using X_train_dtm
%time logreg.fit(train_vect, y_train)

Wall time: 976 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [63]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(test_vect)

In [64]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.8700646087580761

In [65]:
# Examining a model for further insight
#We will examine the our trained Naive Bayes model to calculate the approximate "spamminess" of each token.

In [66]:
# store the vocabulary of X_train
X_train_tokens = tfidf.get_feature_names()
len(X_train_tokens)

1000

In [67]:
# examine the first 50 tokens
print(X_train_tokens[0:50])

['020603', '0800', '08000839402', '08000930705', '08718720201', '09050090044', '10', '100', '1000', '11', '150', '150p', '150pmin', '150pmsg', '150ppm', '16', '18', '1st', '200', '2000', '2004', '250', '2nd', '300', '350', '50', '500', '5000', '530', '750', '83600', '85023', '86688', '87066', '87077', '87575', '9ja', 'abi', 'abiola', 'able', 'abt', 'access', 'account', 'actually', 'ad', 'add', 'address', 'affection', 'aft', 'afternoon']


In [68]:
# examine the last 50 tokens
print(X_train_tokens[-50:])

['whats', 'whos', 'wid', 'wif', 'wife', 'wil', 'win', 'winner', 'wish', 'wishing', 'wit', 'wiv', 'wk', 'wkly', 'wnt', 'won', 'wonderful', 'wondering', 'wont', 'word', 'words', 'work', 'workin', 'working', 'world', 'worries', 'worry', 'worth', 'wot', 'wow', 'wrong', 'wun', 'xmas', 'xx', 'xxx', 'ya', 'yahoo', 'yar', 'yeah', 'year', 'years', 'yes', 'yesterday', 'yo', 'yogasana', 'youll', 'youre', 'yr', 'yun', 'yup']


In [69]:
# Naive Bayes counts the number of times each token appears in each class
nb.feature_count_

array([[ 0.        ,  0.        ,  0.        , ...,  1.28488782,
        17.13269784,  0.        ],
       [ 4.09705418,  3.01174689,  3.25113176, ...,  1.10779222,
         0.        ,  1.6755781 ]])

In [70]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape

(2, 1000)

In [71]:
# number of times each token appears across all HAM messages
ham_token_count = nb.feature_count_[0, :]
ham_token_count

array([  0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   3.01694982,   0.63882256,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         1.73397805,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.61245323,   0.        ,
         3.2604561 ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   1.38325204,   3.30694568,
         2.31926667,   0.        ,   0.        ,   0.        ,
         0.        ,   0.93232535,   0.        ,   0.        ,
         1.81581153,   2.06629077,   1.74515351,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         1.29387818,   3.55126821,   7.98259779,  11.14594291,
         3.26544184,   4.80360426,   9.03084616,   7.03343885,
         0.        ,   4.98446104,   8.28276902,   1.72908164,
         3.84509478,   9.31394708,   4.82196578,  15.66

In [72]:
# number of times each token appears across all SPAM messages
spam_token_count = nb.feature_count_[1, :]
spam_token_count

array([ 4.09705418,  3.01174689,  3.25113176,  2.07467781,  2.56477936,
        2.59625352,  7.90440729,  9.28381092,  2.27642246,  4.22306018,
        2.81134806,  2.13058597,  3.45226584,  4.70301592,  7.7477488 ,
        5.7488597 ,  3.337602  ,  8.07537499,  9.41257991,  9.10607985,
        5.51048768,  3.00354357,  8.21356426,  3.6665166 ,  5.92006528,
        2.07072364,  1.50821818,  3.89003729,  1.50737513,  2.2163558 ,
        2.6370755 ,  3.5039463 ,  1.52406548,  1.53802172,  1.87686297,
        2.11511878,  0.        ,  1.30488058,  1.30103813,  7.67533735,
        5.46056338,  2.07467781,  2.56209584,  4.50082487,  6.06878318,
        4.91630454,  3.96551476,  2.82782797,  1.63823391,  0.        ,
        0.        ,  0.        ,  0.        ,  4.22448396,  0.        ,
        1.086267  ,  3.18562342,  0.        ,  0.        ,  2.87460694,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [76]:
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'ham':ham_token_count, 'spam':spam_token_count}).set_index('token')
tokens.sample(5,random_state=3)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
pay,16.873459,1.268002
shirt,15.40147,0.381428
urgent,0.0,3.507483
credit,1.225454,2.445105
neva,4.193493,0.0


In [77]:
# Naive Bayes counts the number of observations in each class
nb.class_count_

array([3617.,  562.])

In [81]:
# add 1 to ham and spam counts to avoid mathematical errors like dividing by zero.
tokens['ham'] = tokens.ham + 1
tokens['spam'] = tokens.spam + 1
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pay,1.004942,1.004036,0.816671
shirt,1.004535,1.002458,0.542073
urgent,1.000276,1.00802,29.009902
credit,1.000615,1.00613,9.963136
neva,1.001436,1.001779,1.239232


In [82]:
# convert the ham and spam counts into frequencies
tokens['ham'] = tokens.ham / nb.class_count_[0]
tokens['spam'] = tokens.spam / nb.class_count_[1]
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pay,0.000278,0.001787,0.816671
shirt,0.000278,0.001784,0.542073
urgent,0.000277,0.001794,29.009902
credit,0.000277,0.00179,9.963136
neva,0.000277,0.001783,1.239232


In [83]:
# calculate the ratio of spam-to-ham for each token
tokens['spam_ratio'] = tokens.spam / tokens.ham
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pay,0.000278,0.001787,6.430141
shirt,0.000278,0.001784,6.422639
urgent,0.000277,0.001794,6.485769
credit,0.000277,0.00179,6.471414
neva,0.000277,0.001783,6.438151


In [84]:
# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('spam_ratio', ascending=False)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
forgot,0.000278,0.001885,6.787591
tonite,0.000277,0.001859,6.713140
sms,0.000277,0.001856,6.691298
cleaning,0.000277,0.001850,6.689475
mobile,0.000277,0.001851,6.687063
subscription,0.000278,0.001855,6.677039
poly,0.000277,0.001843,6.665893
reach,0.000277,0.001844,6.652078
wish,0.000277,0.001829,6.612664
sending,0.000277,0.001827,6.603201


In [85]:
# look up the spam_ratio for a given token
tokens.loc['prize', 'spam_ratio']

6.475553514607247

In [89]:
# look up the spam_ratio for a given token
tokens.loc['doing', 'spam_ratio']

6.422157706264464