In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Path to the tab-separated SMS Spam Collection file. Set the SMS_SPAM_PATH
# environment variable to point at your own copy; when it is unset, the
# original author's local path is used so existing runs keep working.
DATA_PATH = os.environ.get(
    'SMS_SPAM_PATH',
    r'C:\Users\vimalson\Downloads\Datasets\SmsSpamCollection',
)

# names= is given without header=, so every row of the file is read as data
# and the two columns are labelled 'label' and 'message'.
# NOTE(review): default quote handling is in effect; if the raw file contains
# unbalanced double quotes, adjacent rows can be merged -- verify the row
# count against the dataset's documentation (consider quoting=csv.QUOTE_NONE).
df = pd.read_csv(DATA_PATH, sep='\t', names=['label', 'message'])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label      5572 non-null object
message    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
df.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [6]:
# Encode the target as integers: ham -> 0, spam -> 1.
LABEL_TO_NUMBER = {'ham': 0, 'spam': 1}
df['label_numbers'] = df['label'].map(LABEL_TO_NUMBER)

# Series.map() produces NaN for any value missing from the mapping; fail here
# with a clear message instead of passing NaN targets on to the models.
assert df['label_numbers'].notna().all(), 'unexpected value in label column'

In [7]:
df.head()

Unnamed: 0,label,message,label_numbers
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Model input: the raw message text.
X = df.loc[:, 'message']

In [10]:
# Target: the numeric label (0 = ham, 1 = spam).
y = df.loc[:, 'label_numbers']

In [11]:
X.shape,y.shape

((5572,), (5572,))

In [12]:
# Hold out 30% of the messages for evaluation; random_state fixes the shuffle
# so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=20,
)

In [13]:
X_train.shape,X_test.shape

((3900,), (1672,))

In [14]:
# Convert the messages to token-count vectors with CountVectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# Bag-of-words vectorizer with scikit-learn's default settings; it is fitted
# on the training messages only in the next cell.
vector = CountVectorizer()

In [17]:
# Learn the vocabulary from the training messages and build their
# document-term matrix in one step.
X_train_dtmatrix = vector.fit_transform(raw_documents=X_train)

In [18]:
print(X_train_dtmatrix.shape)

(3900, 7246)


In [19]:
X_train_dtmatrix

<3900x7246 sparse matrix of type '<class 'numpy.int64'>'
	with 51979 stored elements in Compressed Sparse Row format>

In [20]:
# Encode the test messages with the vocabulary learned from the training set
# (transform only -- the vectorizer is not re-fitted on test data).
X_test_dtmatrix = vector.transform(raw_documents=X_test)

In [21]:
X_test_dtmatrix 

<1672x7246 sparse matrix of type '<class 'numpy.int64'>'
	with 20552 stored elements in Compressed Sparse Row format>

In [22]:
from sklearn.naive_bayes import MultinomialNB

In [23]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
model = MultinomialNB()

In [24]:
# Train Naive Bayes on the training document-term matrix.
model.fit(X=X_train_dtmatrix, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
# Predicted class (0 = ham, 1 = spam) for every held-out message.
prediction = model.predict(X=X_test_dtmatrix)

In [26]:
from sklearn import metrics

In [27]:
# Fraction of held-out messages that Naive Bayes classified correctly.
metrics.accuracy_score(y_true=y_test, y_pred=prediction)

0.9784688995215312

In [28]:
# Confusion matrix for Naive Bayes: rows are the true class, columns the
# predicted class, both ordered ham (0) then spam (1).
print(metrics.confusion_matrix(y_true=y_test, y_pred=prediction))

[[1420   16]
 [  20  216]]


In [29]:
# Train a LogisticRegression model on the same features for comparison

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
# Pin the solver explicitly. Leaving it unset makes the fitted model depend on
# the installed scikit-learn version (the library default moved from
# 'liblinear' to 'lbfgs' in 0.22) and triggers a FutureWarning on older
# releases. 'liblinear' is what the default resolved to when this notebook was
# originally run (the recorded repr shows solver='warn').
model2 = LogisticRegression(solver='liblinear')

In [32]:
# Train logistic regression on the same training matrix used for Naive Bayes.
model2.fit(X=X_train_dtmatrix, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
# Logistic-regression predictions (0 = ham, 1 = spam) for the held-out messages.
prediction2 = model2.predict(X=X_test_dtmatrix)

In [34]:
# Held-out accuracy of the logistic-regression model.
print(metrics.accuracy_score(y_true=y_test, y_pred=prediction2))

0.9802631578947368


In [35]:
# Confusion matrix for logistic regression: rows are the true class, columns
# the predicted class, both ordered ham (0) then spam (1).
print(metrics.confusion_matrix(y_true=y_test, y_pred=prediction2))

[[1432    4]
 [  29  207]]


In [36]:
# False positives: ham (0) messages that logistic regression predicted as
# spam (1), i.e. rows where the true label is smaller than the prediction.
false_positive_mask = y_test < prediction2
X_test[false_positive_mask]

1260    We have sent JD for Customer Service cum Accou...
700              K..u also dont msg or reply to his msg..
5046    We have sent JD for Customer Service cum Accou...
4729    I (Career Tel) have added u as a contact on IN...
Name: message, dtype: object

In [37]:
# False negatives: spam (1) messages that logistic regression predicted as
# ham (0), i.e. rows where the true label is larger than the prediction.
false_negative_mask = y_test > prediction2
X_test[false_negative_mask]

1274    network operator. The service is free. For T &...
5466    http//tms. widelive.com/index. wml?id=820554ad...
731     Email AlertFrom: Jeri StewartSize: 2KBSubject:...
3302             RCT' THNQ Adrian for U text. Rgds Vatian
227     Will u meet ur dream partner soon? Is ur caree...
2558    This message is brought to you by GMW Ltd. and...
4256    Block Breaker now comes in deluxe format with ...
4394    RECPT 1/3. You have ordered a Ringtone. Your o...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
3991    (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
4949    Hi this is Amy, we will be sending you a free ...
2823    ROMCAPspam Everyone around should be respondin...
4473      3. You have received your mobile content. Enjoy
3750    Dear Voucher Holder 2 claim your 1st class air...
2965    Do you ever notice that when you're driving, a...
5       FreeMsg Hey there darling it's been 3 week's n...
2248    Back 2 work 2morro half term over! Can U C me ...
2663    Hello 

In [47]:
# Find the most common spam and ham words

In [39]:
# Vocabulary learned by the fitted CountVectorizer, one entry per matrix column.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this version provides.
# list() keeps `words` a plain Python list on both code paths.
if hasattr(vector, 'get_feature_names_out'):
    words = list(vector.get_feature_names_out())
else:
    words = vector.get_feature_names()

In [40]:
len(words)

7246

In [43]:
words[:10]

['00',
 '000',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '0125698789',
 '02',
 '0207',
 '02072069400']

In [48]:
# Measure how many times each word appears in each class

In [45]:
# feature_count_ holds the per-class token counts accumulated by MultinomialNB during fitting

In [46]:
model.feature_count_

array([[ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 8., 17.,  1., ...,  0.,  1.,  1.]])

In [50]:
model.feature_count_.shape

(2, 7246)

In [49]:
# Row 0 of feature_count_ belongs to class 0 (ham): per-token counts
# accumulated over the ham training messages.
ham_word_count = model.feature_count_[0]

In [51]:
# Row 1 of feature_count_ belongs to class 1 (spam): per-token counts
# accumulated over the spam training messages.
spam_word_count = model.feature_count_[1]

In [52]:
spam_word_count

array([ 8., 17.,  1., ...,  0.,  1.,  1.])

In [54]:
# Create a DataFrame of per-token ham and spam counts for comparison

In [56]:
# One row per vocabulary token with its raw count in each class.
tokens = pd.DataFrame(
    {
        'token': words,
        'ham': ham_word_count,
        'spam': spam_word_count,
    }
)

In [57]:
tokens

Unnamed: 0,token,ham,spam
0,00,0.0,8.0
1,000,0.0,17.0
2,008704050406,0.0,1.0
3,0089,0.0,1.0
4,0121,0.0,1.0
...,...,...,...
7241,zoe,0.0,1.0
7242,zogtorius,1.0,0.0
7243,zoom,1.0,0.0
7244,zouk,0.0,1.0


In [58]:
# Use the token text as the row index so rows can be looked up by word.
tokens = tokens.set_index('token')

In [59]:
tokens

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
00,0.0,8.0
000,0.0,17.0
008704050406,0.0,1.0
0089,0.0,1.0
0121,0.0,1.0
...,...,...
zoe,0.0,1.0
zogtorius,1.0,0.0
zoom,1.0,0.0
zouk,0.0,1.0


In [60]:
# Add a constant offset of 10 to every ham count. The notebook does not
# explain the value; presumably it is additive smoothing so the ham/spam
# ratios computed later never divide by zero -- TODO confirm.
tokens['ham'] = tokens['ham'].add(10)

In [61]:
# Same constant offset of 10 for the spam counts, keeping both columns on
# the same footing before they are normalised and divided.
tokens['spam'] = tokens['spam'].add(10)

In [62]:
tokens

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
00,10.0,18.0
000,10.0,27.0
008704050406,10.0,11.0
0089,10.0,11.0
0121,10.0,11.0
...,...,...
zoe,10.0,11.0
zogtorius,11.0,10.0
zoom,11.0,10.0
zouk,10.0,11.0


In [63]:
from __future__ import division

In [65]:
# Normalise by class_count_[0], the number of ham training messages, turning
# the offset count into a per-message rate for the ham class.
tokens['ham'] = tokens['ham'].div(model.class_count_[0])

In [66]:
# Normalise by class_count_[1], the number of spam training messages, so the
# spam column is comparable with the ham column despite class imbalance.
tokens['spam'] = tokens['spam'].div(model.class_count_[1])

In [68]:
tokens.head()

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.002951,0.035225
0,0.002951,0.052838
8704050406,0.002951,0.021526
89,0.002951,0.021526
121,0.002951,0.021526


In [69]:
# How much more often a token occurs in ham than in spam (values > 1 lean ham).
tokens['ham_to_spam_ratio'] = tokens['ham'].div(tokens['spam'])

In [70]:
tokens.head()

Unnamed: 0_level_0,ham,spam,ham_to_spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.002951,0.035225,0.083768
0,0.002951,0.052838,0.055845
8704050406,0.002951,0.021526,0.137074
89,0.002951,0.021526,0.137074
121,0.002951,0.021526,0.137074


In [71]:
# Order tokens from most ham-leaning to most spam-leaning.
tokens = tokens.sort_values(by='ham_to_spam_ratio', ascending=False)

In [72]:
tokens

Unnamed: 0_level_0,ham,spam,ham_to_spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
my,0.163765,0.035225,4.649110
gt,0.071998,0.019569,3.679079
lt,0.070817,0.019569,3.618767
but,0.095603,0.031311,3.053334
me,0.164355,0.056751,2.896053
...,...,...,...
nokia,0.003246,0.125245,0.025916
txt,0.006197,0.246575,0.025130
www,0.003246,0.142857,0.022721
prize,0.002951,0.156556,0.018848


In [73]:
# Inverse ratio: how much more often a token occurs in spam than in ham
# (values > 1 lean spam).
tokens['spam_to_ham_ratio'] = tokens['spam'].div(tokens['ham'])

In [74]:
# Order tokens from most spam-leaning to most ham-leaning.
tokens = tokens.sort_values(by='spam_to_ham_ratio', ascending=False)

In [75]:
tokens

Unnamed: 0_level_0,ham,spam,ham_to_spam_ratio,spam_to_ham_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claim,0.002951,0.176125,0.016754,59.688845
prize,0.002951,0.156556,0.018848,53.056751
www,0.003246,0.142857,0.022721,44.012987
txt,0.006197,0.246575,0.025130,39.792564
nokia,0.003246,0.125245,0.025916,38.586728
...,...,...,...,...
me,0.164355,0.056751,2.896053,0.345298
but,0.095603,0.031311,3.053334,0.327511
lt,0.070817,0.019569,3.618767,0.276337
gt,0.071998,0.019569,3.679079,0.271807
