In [1]:
import pandas as pd

# read the tab-separated file into pandas
# NOTE: this is an absolute, machine-specific Windows path (not a relative one); adjust it to wherever sms.tsv lives.
# The raw string keeps the backslashes literal: "\M" and "\s" are invalid escape sequences in a
# normal string literal and trigger warnings on recent Python versions.
path = r"D:\ML Internship\sms.tsv"
# the file has no header row, so the two column names are supplied here
sms = pd.read_table(path, header=None, names=['label', 'message'])

In [2]:
# (rows, columns) of the loaded frame
sms.shape

(5572, 2)

In [3]:
# preview the first 10 rows of the raw data
sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
# how many messages fall into each class
sms['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [5]:
# Encode the text label as an integer (ham -> 0, spam -> 1), since some classifiers expect numeric classes.
# NOTE: this overwrites the 'label' column in place, so the cell is meant to be run once per load.
label_codes = {'ham':0, 'spam':1}
sms['label'] = sms['label'].map(label_codes)

In [6]:
# check that the conversion worked: 'label' should now hold 0/1 instead of ham/spam
sms.head(10)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [7]:
# Preprocessing: the text is normalised before any features are extracted.

# Step 1 - lower-case every word. Splitting on whitespace and re-joining with single
# spaces also collapses any run of whitespace inside a message.
def _lowercase_words(text):
    return " ".join(word.lower() for word in text.split())

sms['message'] = sms['message'].apply(_lowercase_words)

In [8]:
# preview the lower-cased messages
sms.head()

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


In [9]:
# Removing Punctuation

# Strip every character that is neither a word character (\w) nor whitespace (\s).
# - The raw string stops Python from parsing \w and \s as (invalid) string escapes.
# - regex=True is passed explicitly: pandas 2.0 changed the default of str.replace to
#   literal matching, under which this pattern would match nothing and remove nothing.
sms['message'] = sms['message'].str.replace(r'[^\w\s]', '', regex=True)
sms['message'].head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: message, dtype: object

In [10]:
# Removal of Stop Words

# Hand-maintained set of words to drop from every message. Entries are lower-case and
# carry no apostrophes ("cant", "couldnt", "hasnt"), matching the text after the
# lower-casing and punctuation-stripping steps above.
stop = set("""
    a about above across after afterwards again against
    all almost alone along already also although always
    am among amongst amoungst amount an and another
    any anyhow anyone anything anyway anywhere are
    around as at back be became because become
    becomes becoming been before beforehand behind being
    below beside besides between beyond bill both
    bottom but by call can cannot cant co con
    could couldnt cry de describe detail do done
    down due during each eg eight either eleven else
    elsewhere empty enough etc even ever every everyone
    everything everywhere except few fifteen fify fill
    find fire first five for former formerly forty
    found four from front full further get give go
    had has hasnt have he hence her here hereafter
    hereby herein hereupon hers herself him himself his
    how however hundred i ie if in inc indeed
    interest into is it its itself keep last latter
    latterly least less ltd made many may me
    meanwhile might mill mine more moreover most mostly
    move much must my myself name namely neither
    never nevertheless next nine no nobody none noone
    nor not nothing now nowhere of off often on
    once one only onto or other others otherwise our
    ours ourselves out over own part per perhaps
    please put rather re same see seem seemed
    seeming seems serious several she should show side
    since sincere six sixty so some somehow someone
    something sometime sometimes somewhere still such
    system take ten than that the their them
    themselves then thence there thereafter thereby
    therefore therein thereupon these they thick thin
    third this those though three through throughout
    thru thus to together too top toward towards
    twelve twenty two un under until up upon us
    very via was we well were what whatever when
    whence whenever where whereafter whereas whereby
    wherein whereupon wherever whether which while whither
    who whoever whole whom whose why will with
    within without would yet you your yours yourself
    yourselves
""".split())

# Keep only the whitespace-separated tokens that are not in the stop set,
# re-joining the survivors with single spaces.
def _drop_stop_words(text):
    return " ".join(word for word in text.split() if word not in stop)

sms['message'] = sms['message'].apply(_drop_stop_words)
sms['message'].head()

0    jurong point crazy available bugis n great wor...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                          u dun say early hor u c say
4                        nah dont think goes usf lives
Name: message, dtype: object

In [11]:
# Define X and y (from the SMS data) for use with TfidfVectorizer:
# X is the cleaned message text, y is the numeric label.
X = sms['message']
y = sms['label']
print(X.shape, y.shape, sep="\n")

(5572,)
(5572,)


In [28]:
# Split X and y into training and testing sets.
# random_state=1 fixes the shuffle so the split is reproducible; no test_size is
# given, so the library default proportion is used.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the vectoriser on the training messages only and vectorise them in the same call;
# the test messages are transformed separately with this fitted object.
tfidf = TfidfVectorizer()
train_vect = tfidf.fit_transform(X_train)

In [41]:
# inspect the training matrix (one row per training message, one column per vocabulary token)
train_vect

<4179x7835 sparse matrix of type '<class 'numpy.float64'>'
	with 32128 stored elements in Compressed Sparse Row format>

In [42]:
# vectorise the test messages with the already-fitted vectoriser (transform only, no refit)
test_vect = tfidf.transform(X_test)

In [43]:
# inspect the test matrix; it has the same number of columns as train_vect
test_vect

<1393x7835 sparse matrix of type '<class 'numpy.float64'>'
	with 9414 stored elements in Compressed Sparse Row format>

In [44]:
# The text is now in numeric (TF-IDF) form, so it can be fed to a classification algorithm.

# import and instantiate a Multinomial Naive Bayes model (default hyperparameters)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [45]:
# train the model using train_vect; %time is an IPython line magic that reports how long the statement took
%time nb.fit(train_vect, y_train)

Wall time: 0 ns


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [47]:
# make class predictions (0 = ham, 1 = spam) for test_vect with the Naive Bayes model
y_pred_class = nb.predict(test_vect)

In [48]:
# calculate accuracy of the Naive Bayes class predictions on the held-out test set
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.9662598707824839

In [49]:
# print the confusion matrix: rows are the true classes, columns are the predicted classes
# (so the top-right cell counts false positives and the bottom-left cell false negatives)
metrics.confusion_matrix(y_test, y_pred_class)

array([[1208,    0],
       [  47,  138]], dtype=int64)

In [52]:
# print message text for the false positives (ham incorrectly classified as spam)
X_test[(y_pred_class==1) & (y_test==0)]
# an equivalent, shorter selection is X_test[y_pred_class > y_test]

# There are no false positives in this run: the selection above is empty.

Series([], Name: message, dtype: object)

In [53]:
# print message text for the false negatives (spam incorrectly classified as ham)
X_test[(y_pred_class==0) & (y_test==1)]
# an equivalent, shorter selection is
#X_test[y_pred_class < y_test]

147     freemsg havent replied text im randy sexy fema...
1064    new local dates area lots new people registere...
4460    welcome ukmobiledate msg free giving free call...
2680    new tones week include 1mcflyall ab 2 sara jor...
1217                          1 new voicemail 08719181513
881     reminder downloaded content paid goto httpdoit...
4376    ur tonexs subscription renewed charged 450 cho...
3132    lookatme thanks purchase video clip lookatme y...
2295                            1 new message 08718738034
420     send logo 2 ur lover 2 names joined heart txt ...
5110                            1 new message 08715205273
1045    know know fancies 09058097218 pobox 6 ls15hb 150p
4965    dear voucher holder meal use following link pc...
3443    save money wedding lingerie wwwbridalpetticoat...
2583    3 free tarot texts love life try 3 free text c...
943     getting touch folks waiting company just txt a...
5       freemsg hey darling 3 weeks word id like fun t...
3856    free m

In [54]:
# Comparing models
# We will compare multinomial Naive Bayes with logistic regression on the same TF-IDF features.

# import and instantiate a logistic regression model (default hyperparameters)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [55]:
# train the model using train_vect (the same TF-IDF matrix the Naive Bayes model was fitted on)
%time logreg.fit(train_vect, y_train)

Wall time: 434 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [56]:
# make class predictions for test_vect
# NOTE: this overwrites y_pred_class, which until now held the Naive Bayes predictions
y_pred_class = logreg.predict(test_vect)

In [57]:
# calculate accuracy of the logistic regression predictions on the held-out test set
metrics.accuracy_score(y_test, y_pred_class)

0.9633883704235463

In [58]:
# Examining a model for further insight
# We will examine our trained Naive Bayes model to calculate the approximate "spamminess" of each token.

In [59]:
# store the vocabulary of X_train
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(), so use whichever method the installed version provides.
# list(...) keeps X_train_tokens a plain Python list in both cases.
if hasattr(tfidf, 'get_feature_names_out'):
    X_train_tokens = list(tfidf.get_feature_names_out())
else:
    X_train_tokens = tfidf.get_feature_names()
len(X_train_tokens)

7835

In [60]:
# peek at the first 50 vocabulary entries
print(X_train_tokens[:50])

['008704050406', '0121', '01223585236', '01223585334', '0125698789', '02', '020603', '0207', '02070836089', '02072069400', '02073162414', '02085076972', '020903', '021', '050703', '0578', '06', '061104', '07008009200', '07090201529', '07090298926', '071104', '07123456789', '07732584351', '07734396839', '07742676969', '0776xxxxxxx', '07786200117', '078', '07801543489', '07808', '07808247860', '07808726822', '07815296484', '07821230901', '0789xxxxxxx', '0794674629107880867867', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '0800', '08000407165', '08000776320', '08000839402', '08000930705', '08000938767', '08001950382', '08002888812', '08002986030', '08002986906']


In [61]:
# examine the last 50 tokens of the vocabulary
print(X_train_tokens[-50:])

['yijue', 'ym', 'ymca', 'yo', 'yoga', 'yogasana', 'yohere', 'yor', 'yorge', 'youcarlos', 'youclean', 'youd', 'youdearwith', 'youdoing', 'youhow', 'youi', 'youkwhere', 'youll', 'youmy', 'youphone', 'youre', 'yourjob', 'youve', 'youwanna', 'youwhen', 'yowifes', 'yoyyooo', 'yr', 'yrs', 'yummmm', 'yummy', 'yun', 'yunny', 'yuo', 'yuou', 'yup', 'ywhere', 'zac', 'zahers', 'zealand', 'zebra', 'zed', 'zeros', 'zhong', 'zindgi', 'zoe', 'zoom', 'zouk', 'zyada', 'üll']


In [62]:
# Naive Bayes accumulates, per class, the total feature value of each token.
# Because the model was fitted on TF-IDF features, these are summed TF-IDF weights
# (fractional values), not integer occurrence counts.
nb.feature_count_

array([[0.        , 0.        , 0.        , ..., 0.        , 0.16482465,
        0.92584638],
       [0.5706699 , 0.36224523, 0.34893353, ..., 0.29885538, 0.        ,
        0.        ]])

In [63]:
# rows represent classes (ham, spam), columns represent vocabulary tokens
nb.feature_count_.shape

(2, 7835)

In [64]:
# per-token total across all HAM messages: row 0 of feature_count_ (label 0 = ham).
# With TF-IDF input these totals are summed weights rather than raw occurrence counts.
ham_token_count = nb.feature_count_[0]
ham_token_count

array([0.        , 0.        , 0.        , ..., 0.        , 0.16482465,
       0.92584638])

In [65]:
# per-token total across all SPAM messages: row 1 of feature_count_ (label 1 = spam).
# With TF-IDF input these totals are summed weights rather than raw occurrence counts.
spam_token_count = nb.feature_count_[1]
spam_token_count

array([0.5706699 , 0.36224523, 0.34893353, ..., 0.29885538, 0.        ,
       0.        ])

In [66]:
# Build a token-indexed DataFrame holding the separate ham and spam totals,
# then show a fixed random sample of 5 rows.
tokens = pd.DataFrame(
    {'ham': ham_token_count, 'spam': spam_token_count},
    index=pd.Index(X_train_tokens, name='token'),
)
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
600,0.0,0.588408
outbid,0.0,0.270828
worms,0.462322,0.0
wknd,0.425854,0.0
181104,0.0,0.30426


In [67]:
# Naive Bayes counts the number of training observations (messages) in each class
nb.class_count_

array([3617.,  562.])

In [68]:
# add 1 to ham and spam counts to avoid mathematical errors like dividing by zero.
# The columns are rebuilt from the raw per-class arrays (same row order the DataFrame
# was created with) instead of from themselves, so re-running this cell cannot stack
# a second +1 on top of the first.
tokens['ham'] = ham_token_count + 1
tokens['spam'] = spam_token_count + 1
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
600,1.0,1.588408
outbid,1.0,1.270828
worms,1.462322,1.0
wknd,1.425854,1.0
181104,1.0,1.30426


In [69]:
# Normalise each column by the number of training messages in its class,
# turning the smoothed totals into per-message frequencies.
# NOTE: this updates the columns in place, so run it once after the +1 step.
ham_docs = nb.class_count_[0]
spam_docs = nb.class_count_[1]
tokens['ham'] = tokens['ham'] / ham_docs
tokens['spam'] = tokens['spam'] / spam_docs
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
600,0.000276,0.002826
outbid,0.000276,0.002261
worms,0.000404,0.001779
wknd,0.000394,0.001779
181104,0.000276,0.002321


In [70]:
# spam_ratio = spam frequency / ham frequency for each token
# (values above 1 lean spam, values below 1 lean ham)
tokens['spam_ratio'] = tokens['spam'].div(tokens['ham'])
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
600,0.000276,0.002826,10.222905
outbid,0.000276,0.002261,8.178978
worms,0.000404,0.001779,4.40118
wknd,0.000394,0.001779,4.513745
181104,0.000276,0.002321,8.394141


In [71]:
# rank the tokens from most to least spam-indicative
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values(by='spam_ratio', ascending=False)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
claim,0.000276,0.031274,113.118150
prize,0.000276,0.029137,105.388365
won,0.000276,0.022302,80.666015
guaranteed,0.000276,0.018118,65.531128
urgent,0.000336,0.020627,61.336383
1000,0.000276,0.015183,54.915120
tone,0.000276,0.015131,54.729172
2000,0.000276,0.014180,51.290601
18,0.000276,0.013824,50.001915
awarded,0.000276,0.013617,49.251116


In [72]:
# scalar lookup of the spam_ratio for the token 'prize'
tokens.at['prize', 'spam_ratio']

105.38836461753262

In [73]:
# scalar lookup of the spam_ratio for the token 'doing'
tokens.at['doing', 'spam_ratio']

0.2921303768365546