# Building a Spam Filter with Naive Bayes

In [1]:
import math
import re

import pandas as pd

# Load the tab-separated SMS Spam Collection file. `header=None` tells pandas
# the file has no header row, so the two columns are named explicitly.
sms_spam = pd.read_csv('SMSSpamCollection', sep='\t', header=None, 
                      names=['Label', 'SMS'])

print(sms_spam.shape)
print(sms_spam.head())
# Percentage of rows labelled 'spam' (displayed as the cell's result).
len(sms_spam[sms_spam.Label == 'spam']) / len(sms_spam) *100

(5572, 2)
  Label                                                SMS
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


13.406317300789663

# Training and Test Set

In [2]:
# Shuffle every row (frac=1 samples 100% of the frame); the fixed
# random_state makes the shuffle, and therefore the split below, reproducible.
data_randomizing = sms_spam.sample(frac=1, random_state=1)
data_randomizing.head()

Unnamed: 0,Label,SMS
1078,ham,"Yep, by the pretty sculpture"
4028,ham,"Yes, princess. Are you going to make me moan?"
958,ham,Welp apparently he retired
4642,ham,Havent.
4674,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [3]:
# Split the shuffled rows 80% / 20%: everything before the cut-off position
# becomes the training set, everything from it onwards the test set.
n_shuffled_rows = len(data_randomizing)
training_set_index = round(n_shuffled_rows * 0.8)

rows_before_cut = data_randomizing[:training_set_index]
rows_after_cut = data_randomizing[training_set_index:]

# Both halves get a fresh 0..n-1 index; the shuffled one is discarded.
training_set = rows_before_cut.reset_index(drop=True)
test_set = rows_after_cut.reset_index(drop=True)

for split in (training_set, test_set):
    print(split.shape)

(4458, 2)
(1114, 2)


In [4]:
# Relative frequency of each label in the training and test splits
# (normalize=True returns proportions instead of raw counts).
print(training_set['Label'].value_counts(normalize=True))
print(test_set['Label'].value_counts(normalize=True))

ham     0.86541
spam    0.13459
Name: Label, dtype: float64
ham     0.868043
spam    0.131957
Name: Label, dtype: float64


トレーニングセットとテストセットのspamとhamの比率は元のデータセットとほとんど等しい。

# Data Cleaning

# Letter Case and Punctuation

`SMS`列に含まれる英数字・アンダースコア以外の文字（句読点や記号など）をすべて空白に置き換え、文字をすべて小文字に変換する。

In [5]:
# Replace every non-word character (anything that is not a letter, digit or
# underscore) with a space, then lowercase the text.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave all punctuation in
# place. The raw string avoids the invalid "\W" escape-sequence warning.
training_set['SMS'] = (
    training_set['SMS']
    .str.replace(r'\W', ' ', regex=True)
    .str.lower()
)
training_set.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [6]:
# Apply the same cleaning as for the training messages: non-word characters
# become spaces and the text is lowercased.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave all punctuation in
# place. The raw string avoids the invalid "\W" escape-sequence warning.
test_set['SMS'] = (
    test_set['SMS']
    .str.replace(r'\W', ' ', regex=True)
    .str.lower()
)
test_set.head()

Unnamed: 0,Label,SMS
0,ham,later i guess i needa do mcat study too
1,ham,but i haf enuff space got like 4 mb
2,spam,had your mobile 10 mths update to latest oran...
3,ham,all sounds good fingers makes it difficult ...
4,ham,all done all handed in don t know if mega sh...


# Creating the Vocabulary

In [7]:
# Tokenise each cleaned message into a list of whitespace-separated words.
# This overwrites the 'SMS' column in place.
training_set['SMS'] = training_set['SMS'].str.split()

# Collect every distinct word that occurs in the training messages.
# Sorting the set gives the vocabulary (and the word-count columns built from
# it) the same order on every run; plain set iteration order over strings can
# change from one Python process to the next.
vocabulary = sorted({word for row in training_set['SMS'] for word in row})

In [8]:
# Number of distinct words found in the training messages.
len(vocabulary)

7783

In [9]:
# Build one counter list per vocabulary word, with one slot per training
# message, all starting at zero. Each word gets its own list object.
n_training_messages = len(training_set['SMS'])
word_counts_per_sms = {}
for unique_word in vocabulary:
    word_counts_per_sms[unique_word] = [0] * n_training_messages

# Walk through the tokenised messages and bump the slot of the message in
# which each token occurs.
for row_position, tokens in enumerate(training_set['SMS']):
    for token in tokens:
        word_counts_per_sms[token][row_position] += 1

In [10]:
# Turn the dict of per-word counter lists into a frame: one column per
# vocabulary word, one row per training message.
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,0,00,000,000pes,008704050406,0089,01223585334,02,0207,02072069400,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [11]:
# Put the word-count columns next to the Label/SMS columns (axis=1 joins the
# two frames side by side on their row index).
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,0,00,000,000pes,008704050406,0089,01223585334,02,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


# Calculating Constants First

In [12]:
# Class priors: the share of training messages carrying each label.
label_shares = training_set_clean['Label'].value_counts(normalize=True)
p_spam = label_shares['spam']
p_ham = label_shares['ham']

# Show both priors and their sum (which should come out as 1).
for shown_value in (p_spam, p_ham, p_spam + p_ham):
    print(shown_value)

0.13458950201884254
0.8654104979811574
1.0


In [13]:
# Size of the vocabulary and the Laplace smoothing constant.
n_vocabulary = len(vocabulary)
alpha = 1

# Row masks selecting the messages of each class.
is_spam_row = training_set_clean['Label'] == 'spam'
is_ham_row = training_set_clean['Label'] == 'ham'

# Total number of words (tokens) across all spam messages.
n_word_per_spam_message = training_set_clean['SMS'][is_spam_row].apply(len)
n_spam = n_word_per_spam_message.sum()

# Total number of words (tokens) across all ham messages.
n_word_per_not_spam_message = training_set_clean['SMS'][is_ham_row].apply(len)
n_ham = n_word_per_not_spam_message.sum()

for shown_value in (n_vocabulary, n_spam, n_ham):
    print(shown_value)

7783
15190
57237


# Calculating Parameters

In [14]:
# One entry per vocabulary word, initialised to 0; the values are filled in
# by the parameter-calculation loop.
spam_parameters = dict.fromkeys(vocabulary, 0)
ham_parameters = dict.fromkeys(vocabulary, 0)

# Split the training rows by class.
training_labels = training_set_clean['Label']
spam_messages = training_set_clean[training_labels == 'spam']
ham_messages = training_set_clean[training_labels == 'ham']

In [23]:
# The smoothed denominators do not depend on the word, so compute them once.
spam_denominator = n_spam + alpha*n_vocabulary
ham_denominator = n_ham + alpha*n_vocabulary

# P(word | class) with additive smoothing:
#   (occurrences of word in class + alpha) / (words in class + alpha * |vocabulary|)
for word in vocabulary:
    spam_occurrences = spam_messages[word].sum()
    ham_occurrences = ham_messages[word].sum()

    spam_parameters[word] = (spam_occurrences + alpha) / spam_denominator
    ham_parameters[word] = (ham_occurrences + alpha) / ham_denominator

# Classifying A New Message

In [32]:
import re

def classify(message, spam_params=None, ham_params=None,
             prior_spam=None, prior_ham=None):
    """Print the naive Bayes scores and the resulting label for one message.

    The message is cleaned the same way as the training data (non-word
    characters become spaces, text is lowercased, then split on whitespace).
    Words that are missing from a parameter table are ignored for that class.

    The printed scores are the plain products prior * P(w1|class) * ... and
    can underflow to 0.0 for long messages. The label is therefore decided on
    the sums of logarithms, which do not underflow, so a long message is not
    reported as a tie merely because both products collapsed to zero.

    Parameters
    ----------
    message : str
        Raw message text.
    spam_params, ham_params : dict, optional
        Mapping word -> P(word | class). Default to the notebook-level
        ``spam_parameters`` / ``ham_parameters``.
    prior_spam, prior_ham : float, optional
        Class priors. Default to the notebook-level ``p_spam`` / ``p_ham``.

    Returns
    -------
    None
        The scores and the label are printed.
    """
    # Fall back to the notebook-level model when none is passed explicitly.
    if spam_params is None:
        spam_params = spam_parameters
    if ham_params is None:
        ham_params = ham_parameters
    if prior_spam is None:
        prior_spam = p_spam
    if prior_ham is None:
        prior_ham = p_ham

    def _log(probability):
        # A zero probability maps to -inf so that class loses every comparison
        # (and two zero-probability classes still tie).
        return math.log(probability) if probability > 0 else float('-inf')

    words = re.sub(r'\W', ' ', message).lower().split()

    # Plain products are kept only for display; the logs drive the decision.
    p_spam_given_message = prior_spam
    p_ham_given_message = prior_ham
    log_spam_score = _log(prior_spam)
    log_ham_score = _log(prior_ham)

    for word in words:
        if word in spam_params:
            p_spam_given_message *= spam_params[word]
            log_spam_score += _log(spam_params[word])
        if word in ham_params:
            p_ham_given_message *= ham_params[word]
            log_ham_score += _log(ham_params[word])

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if log_ham_score > log_spam_score:
        print('Label: Ham')
    elif log_ham_score < log_spam_score:
        print('Label: Spam')
    else:
        print('Equal probabilities, have a human classify this!')

In [33]:
# Sanity check: a message that is expected to be labelled as spam.
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam


In [34]:
# Sanity check: an everyday message that is expected to be labelled as ham.
classify("Sounds good, Tom, then see u there")

P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham


# Measuring the Spam Filter's Accuracy

In [39]:
def classify_test_set(message, spam_params=None, ham_params=None,
                      prior_spam=None, prior_ham=None):
    """Return the naive Bayes label for one message.

    The message is cleaned the same way as the training data (non-word
    characters become spaces, text is lowercased, then split on whitespace).
    Words that are missing from a parameter table are ignored for that class.

    Scores are accumulated as sums of logarithms rather than as products of
    probabilities. Products of many small probabilities underflow to 0.0 for
    long messages, which would make both classes compare equal and send the
    message to 'needs human classification' for no real reason.

    Parameters
    ----------
    message : str
        Message text.
    spam_params, ham_params : dict, optional
        Mapping word -> P(word | class). Default to the notebook-level
        ``spam_parameters`` / ``ham_parameters``.
    prior_spam, prior_ham : float, optional
        Class priors. Default to the notebook-level ``p_spam`` / ``p_ham``.

    Returns
    -------
    str
        'ham', 'spam', or 'needs human classification' when the two scores
        are exactly equal.
    """
    # Fall back to the notebook-level model when none is passed explicitly.
    if spam_params is None:
        spam_params = spam_parameters
    if ham_params is None:
        ham_params = ham_parameters
    if prior_spam is None:
        prior_spam = p_spam
    if prior_ham is None:
        prior_ham = p_ham

    def _log(probability):
        # A zero probability maps to -inf so that class loses every comparison
        # (and two zero-probability classes still tie).
        return math.log(probability) if probability > 0 else float('-inf')

    words = re.sub(r'\W', ' ', message).lower().split()

    log_spam_score = _log(prior_spam)
    log_ham_score = _log(prior_ham)

    for word in words:
        if word in spam_params:
            log_spam_score += _log(spam_params[word])
        if word in ham_params:
            log_ham_score += _log(ham_params[word])

    if log_ham_score > log_spam_score:
        return 'ham'
    if log_spam_score > log_ham_score:
        return 'spam'
    return 'needs human classification'

In [40]:
# Counter for correct predictions (filled in by a later cell) and the number
# of test messages.
correct = 0
total = len(test_set)

# Predict a label for every test message and store it next to the true label.
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)
test_set.head()

Unnamed: 0,Label,SMS,predicted
0,ham,later i guess i needa do mcat study too,ham
1,ham,but i haf enuff space got like 4 mb,ham
2,spam,had your mobile 10 mths update to latest oran...,spam
3,ham,all sounds good fingers makes it difficult ...,ham
4,ham,all done all handed in don t know if mega sh...,ham


In [42]:
# Count the test messages whose predicted label equals the true label.
# The count is recomputed from scratch with a vectorised comparison, so
# re-running this cell always yields the same number instead of adding to a
# counter left over from a previous run.
correct = int((test_set['Label'] == test_set['predicted']).sum())

In [48]:
# Evaluation summary: hits, misses, and accuracy as a percentage of the
# test set.
print('Correct:',correct)
print('Incorrect:',total - correct)
print('Accuracy:', correct/total * 100)

Correct: 1100
Incorrect: 14
Accuracy: 98.74326750448833


精度はおよそ98.74％であり、非常に高い。1114個のテストメッセージのうち、1100個を正しく分類することができた。