In [20]:
import pandas as pd
import numpy as np

## Spam Detector

In [11]:
path = 'sms.csv'
# The file is tab-delimited and has no header row, so the two column
# names are supplied explicitly.
data = pd.read_csv(
    path,
    sep='\t',
    header=None,
    names=['label', 'message'],
)
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [43]:
# Number of messages in each class.
# Counts are looked up by label rather than unpacked positionally:
# value_counts() orders its result by frequency, so positional unpacking
# would silently swap the two values if spam ever outnumbered ham, and
# would raise if a third label appeared.
label_counts = data.label.value_counts()
hamCount = label_counts['ham']
spamCount = label_counts['spam']

4825

In [58]:
# Pick a random position among the spam rows and show that message.
spam_messages = data.message[data.label == 'spam']
random = np.random.randint(spamCount)
spam_messages.iloc[random]

'Would you like to see my XXX pics they are so hot they were nearly banned in the uk!'

In [102]:
# Pick a random position among the ham rows and show that message.
ham_messages = data.message[data.label == 'ham']
random = np.random.randint(hamCount)
ham_messages.iloc[random]

'i thought we were doing a king of the hill thing there.'

## Naive Bayes
- Based on Bayes' theorem: p(label | x) ∝ p(x | label) · p(label)

### Setting
- labels: 1, 2, ..., k. In this case 1 = ham, 2 = spam
- data: n documents
- vocabulary: w1, w2, ..., wd
- For each document: x = [n1, n2, n3, ..., nd], the 'count vector'
- nj = number of times word wj appears in the document

In [103]:
# Class prior p(label=ham): the share of all messages labelled ham.
n_messages = hamCount + spamCount
hamCount / n_messages

0.8659368269921034

In [104]:
# Class prior p(label=spam): the share of all messages labelled spam.
n_messages = hamCount + spamCount
spamCount / n_messages

0.13406317300789664

### Prediction rule:
- predict the label with the largest posterior probability given the count vector x:
- p(label=1 | x)
- p(label=2 | x)
- ...
- p(label=k | x)

In [107]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

In [109]:
# Two-stage text classifier: bag-of-words counts feed a multinomial
# Naive Bayes model. The step names 'vect' and 'clf' are used later to
# pull the fitted pieces back out of the pipeline.
vectorizer = CountVectorizer(max_features=1000)  # vocabulary capped at 1000 terms
classifier = MultinomialNB()  # can take alpha parameter
pipe = Pipeline(steps=[('vect', vectorizer), ('clf', classifier)])

In [111]:
# Features are the raw message text; the target is the ham/spam label.
X, y = data['message'], data['label']

In [113]:
# split data
from sklearn.model_selection import train_test_split
# Fix random_state so the split, and therefore the confusion matrix and
# the word tables derived from the fitted model, come out the same on
# every Restart & Run All. The default test fraction is left unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [114]:
# Fit the vectorizer and the classifier together, using the training split only.
pipe.fit(X_train, y=y_train)


In [115]:
# Evaluate the pipeline: predict a label for every held-out test message.
# The predictions are kept so they can be compared against y_test.
y_test_pred = pipe.predict(X_test)

In [116]:
# Rows are the true labels, columns are the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[1206,   16],
       [  10,  161]], dtype=int64)

## How does Naive Bayes choose between ham and spam?

In [145]:
# Keep the fitted vocabulary: one entry per count-vector column.
fitted_vectorizer = pipe['vect']
words = fitted_vectorizer.get_feature_names_out()

In [146]:
# Class labels in the order the classifier stores them; this is the row
# order of its per-class arrays.
nb_model = pipe['clf']
nb_model.classes_

array(['ham', 'spam'], dtype='<U4')

In [147]:
# Naive Bayes tallies how often each vocabulary word occurs in each
# class: one row per class, one column per word.
nb_model = pipe['clf']
nb_model.feature_count_

array([[ 0.,  0.,  0., ..., 18.,  3., 32.],
       [10., 23., 13., ...,  1.,  7.,  0.]])

In [148]:
# Split the per-class word counts into one vector per class.
# Row 0 is ham and row 1 is spam, matching the order of classes_.
class_word_counts = pipe['clf'].feature_count_
ham_count = class_word_counts[0]
spam_count = class_word_counts[1]

In [151]:
# Table of per-word counts, indexed by word, with one column per class.
# Every count is shifted up by 1 so that a word never seen in one class
# cannot cause a division by zero when the ratios are computed.
df = pd.DataFrame(
    {'ham': ham_count + 1, 'spam': spam_count + 1},
    index=pd.Index(words, name='words'),
)

In [152]:
# Inspect the last 50 rows of the word-count table.
df.iloc[-50:]

Unnamed: 0_level_0,ham,spam
words,Unnamed: 1_level_1,Unnamed: 2_level_1
wife,20.0,1.0
wil,15.0,1.0
will,250.0,34.0
win,9.0,41.0
wine,10.0,1.0
winner,1.0,13.0
wish,30.0,3.0
wit,12.0,1.0
with,197.0,87.0
within,7.0,6.0


In [153]:
# Turn the counts into within-class relative frequencies: each column is
# divided by its own total, so each column sums to 1.
spam_total = df['spam'].sum()
df['spam'] = df['spam'] / spam_total
ham_total = df['ham'].sum()
df['ham'] = df['ham'] / ham_total

In [158]:
# For each word, how much more frequent it is in one class than in the
# other. The two ratios are reciprocals of each other.
ham_freq, spam_freq = df['ham'], df['spam']
df['ham_ratio'] = ham_freq / spam_freq
df['spam_ratio'] = spam_freq / ham_freq

In [159]:
# top 20 spam words: rank by spam_ratio, largest first
spam_ranked = df.sort_values(by='spam_ratio', ascending=False)
spam_ranked.iloc[:20]

Unnamed: 0_level_0,ham,spam,ham_ratio,spam_ratio
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claim,2.6e-05,0.008221,0.003152,317.268877
prize,2.6e-05,0.005987,0.004328,231.054508
150p,2.6e-05,0.004647,0.005576,179.325887
tone,2.6e-05,0.00411,0.006304,158.634438
16,2.6e-05,0.003842,0.006744,148.288714
www,5.2e-05,0.007238,0.00716,139.667277
18,2.6e-05,0.003574,0.007249,137.94299
500,2.6e-05,0.003396,0.007631,131.04584
guaranteed,2.6e-05,0.003128,0.008285,120.700116
1000,2.6e-05,0.002949,0.008787,113.802967


In [160]:
# top 20 ham words: rank by ham_ratio, largest first
ham_ranked = df.sort_values(by='ham_ratio', ascending=False)
ham_ranked.iloc[:20]

Unnamed: 0_level_0,ham,spam,ham_ratio,spam_ratio
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gt,0.006089,8.9e-05,68.144093,0.014675
lt,0.006063,8.9e-05,67.854119,0.014737
he,0.004638,8.9e-05,51.905501,0.019266
she,0.003446,8.9e-05,38.566657,0.025929
lor,0.003368,8.9e-05,37.696733,0.026527
da,0.003161,8.9e-05,35.376934,0.028267
later,0.002669,8.9e-05,29.867411,0.033481
much,0.002384,8.9e-05,26.677688,0.037485
come,0.004457,0.000179,24.937838,0.0401
too,0.002228,8.9e-05,24.937838,0.0401
