Building a Spam Filter with Naive Bayes

In [1]:
import pandas as pd

sms = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])

sms.shape

(5572, 2)

In [2]:
sms

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
nr_ham = sms[sms['Label'] == 'ham'].shape[0]
nr_messages = sms.shape[0]

pct_ham = nr_ham/nr_messages*100
pct_spam = 100 - pct_ham
print('% Ham  : ', pct_ham)
print('% Spam : ', pct_spam)

% Ham  :  86.59368269921033
% Spam :  13.406317300789667


In [4]:
sms_random = sms.sample(frac=1,random_state=1)
eighty_pct = round(0.8*len(sms_random))

training_set = sms_random[:eighty_pct]
test_set = sms_random[eighty_pct:]

training_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)

In [5]:
nr_ham_training = training_set[training_set['Label'] == 'ham'].shape[0]
nr_ham_test = test_set[test_set['Label'] == 'ham'].shape[0]

nr_messages_training = training_set.shape[0]
nr_messages_test = test_set.shape[0]

pct_ham_training = nr_ham_training/nr_messages_training*100
pct_spam_training = 100 - pct_ham_training

pct_ham_test = nr_ham_test/nr_messages_test*100
pct_spam_test = 100 - pct_ham_test
    
print('Training set ham % : ' + str(pct_ham_training))
print('Test set ham %     : ' + str(pct_ham_test))

Training set ham % : 86.54104979811575
Test set ham %     : 86.80430879712748


In [6]:
pd.options.mode.chained_assignment = None

training_set['SMS'] = training_set['SMS'].str.replace('\W',' ').str.lower().str.split().copy()

pd.options.mode.chained_assignment = 'warn'

In [7]:
vocab = set()

for message in training_set['SMS']:
    for word in message:
        if word not in vocab:
            vocab.add(word)
            
vocab = list(vocab)

In [8]:
word_counts_per_sms = {word: [0] * len(training_set['SMS']) for word in vocab}
    
for index, message in enumerate(training_set['SMS']):
    for word in message:
        word_counts_per_sms[word][index] += 1
        
word_counts_per_sms = pd.DataFrame(word_counts_per_sms)

In [9]:
training_set_word_count = pd.concat([training_set, word_counts_per_sms], axis=1)

In [10]:
training_ham = training_set_word_count[training_set_word_count['Label'] == 'ham']
training_spam = training_set_word_count[training_set_word_count['Label'] == 'spam']

p_ham = pct_ham_training/100
p_spam = 1 - p_ham

n_ham = training_ham['SMS'].apply(len).sum()
n_spam = training_spam['SMS'].apply(len).sum()
        
n_vocab = len(vocab)

alpha = 1

In [13]:
p_wi_ham = {word:0 for word in vocab}
p_wi_spam = {word:0 for word in vocab}

for key in p_wi_ham:
    n_wi_ham = training_ham[key].sum()
    p_wi_ham[key] = (n_wi_ham + alpha)/(n_ham + alpha * n_vocab)
    
for key in p_wi_spam:
    n_wi_spam = training_spam[key].sum()
    p_wi_spam[key] = (n_wi_spam + alpha)/(n_spam + alpha * n_vocab)

In [14]:
import re

def classify(message):
    
    re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_ham_given_message = p_ham
    p_spam_given_message = p_spam
    
    for word in message:
        if word in p_wi_ham:
            p_ham_given_message *= p_wi_ham[word]
        if word in p_wi_spam:
            p_spam_given_message *= p_wi_spam[word]
    
    print('P(Ham|message):', p_ham_given_message)
    print('P(Spam|message):', p_spam_given_message)
    
    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham!')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam!')
    else:
        print('equal probabilities, have a human classify this!')

In [15]:
test_message_1 = 'WINNER!! This is the secret code to unlock the money: C3421.' #Spam message
test_message_2 = 'Sounds good, Tom, then see u there' #Ham message

classify(test_message_1)
classify(test_message_2)

P(Ham|message): 1.8195638182330268e-19
P(Spam|message): 1.016409798170896e-18
Label: Spam!
P(Ham|message): 2.808901827397699e-14
P(Spam|message): 5.359472501724848e-18
Label: Ham!
