In [1]:
'''
Author: Waheed Zarif
Date: 04/21/2020

This is a simple spam filter using multinomial Naive Bayes algorithm
with a dataset of 5,572 SMS messages that are already labeled by humans.
Dataset: https://archive.ics.uci.edu/ml/datasets/sms+spam+collection
Description: http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/#composition
'''



In [1]:
#some prep work...
import math
import re

import pandas as pd

# Load the tab-separated SMS corpus. The file has no header row, so the
# two columns are named explicitly.
data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
)

# Basic size statistics.
shape = data.shape
total_rows = len(data)

# Share of each class in the whole dataset, in percent.
pct_ham = (len(data.loc[data['Label'] == 'ham']) / total_rows) * 100
pct_spam = (len(data.loc[data['Label'] == 'spam']) / total_rows) * 100


In [2]:
# Show the dataset dimensions followed by its first five rows.
print('Shape = {}'.format(shape), '\n\n', data.head())

Shape = (5572, 2) 

   Label                                                SMS
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
# Shuffle every row; the fixed random_state keeps the shuffle reproducible.
rand_df = data.sample(frac=1, random_state=1)

# Split the shuffled frame into a training set (80%) and a test set (20%).
eighty_pct = round(len(rand_df) * .8)
# Number of rows left for the test set once the training slice is taken.
# (Previously computed as `1 - eighty_pct`, which is a negative number.)
twenty_pct = len(rand_df) - eighty_pct
train_df = rand_df[:eighty_pct].reset_index(drop=True)
test_df = rand_df[eighty_pct:].reset_index(drop=True)

# Fraction (0-1) of ham and spam in the training dataframe;
# multiply by 100 to display as a percentage.
train_pct = train_df['Label'].value_counts(normalize=True)
# Fraction (0-1) of ham and spam in the test dataframe.
test_pct = test_df['Label'].value_counts(normalize=True)


In [4]:
# Class balance of the training split, shown as percentages.
print('Precent of ham and spam SMS in training set \n{}'.format(train_pct.mul(100)))

Precent of ham and spam SMS in training set 
ham     86.54105
spam    13.45895
Name: Label, dtype: float64


In [5]:
# Class balance of the test split, shown as percentages.
print('Precent of ham and spam SMS in test set \n{}'.format(test_pct.mul(100)))

Precent of ham and spam SMS in test set 
ham     86.804309
spam    13.195691
Name: Label, dtype: float64


In [6]:
# Data cleaning
# Replace every non-word character (anything matching \W, which includes
# punctuation) in the SMS column with a space.
regex_pat = re.compile(r'\W')
# regex=True is passed explicitly: pandas >= 2.0 defaults to literal
# matching (regex=False) and raises ValueError when given a compiled
# pattern in that mode.
train_df['SMS'] = train_df['SMS'].str.replace(regex_pat, ' ', regex=True)
test_df['SMS'] = test_df['SMS'].str.replace(regex_pat, ' ', regex=True)

# Lower-case every message
train_df['SMS'] = train_df['SMS'].str.lower()
test_df['SMS'] = test_df['SMS'].str.lower()


In [8]:
# Tokenise each training message into a list of words, then collect the
# vocabulary: a Python list holding every distinct word (as a string)
# that occurs anywhere in the training messages.
train_df['SMS'] = train_df['SMS'].str.split()
training_vocab = list({token for tokens in train_df['SMS'] for token in tokens})


In [9]:
# Peek at the first ten vocabulary entries.
training_vocab[:10]

['know',
 'jaykwon',
 '83222',
 'forgotten',
 'even',
 'sexual',
 'cm2',
 'garden',
 'geeeee',
 'sms']

In [86]:
# Count how often each vocabulary word occurs in each training SMS.
# Start from one zero-filled list per word (one slot per message) ...
n_messages = len(train_df['SMS'])
word_counts_per_sms = {}
for unique_word in training_vocab:
    word_counts_per_sms[unique_word] = [0] * n_messages

# ... then bump the slot of every word occurrence, message by message.
for row_idx, tokens in enumerate(train_df['SMS']):
    for token in tokens:
        word_counts_per_sms[token][row_idx] += 1

# One column per vocabulary word, placed next to the Label/SMS columns.
word_counts_per_sms = pd.DataFrame(word_counts_per_sms)
train_df_clean = pd.concat([train_df, word_counts_per_sms], axis=1)

In [12]:
# First five rows of the training frame with its per-word count columns.
train_df_clean.head()

Unnamed: 0,Label,SMS,0,00,000,000pes,008704050406,0089,01223585334,02,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [14]:
# The quantities printed below (and used by the following cells) were not
# defined anywhere in the notebook, so a fresh kernel failed with NameError.
# They are computed here from the training word-count table.

# Split the count table by class.
train_spam = train_df_clean[train_df_clean['Label'] == 'spam']
train_ham = train_df_clean[train_df_clean['Label'] == 'ham']

# Class priors: share of training messages in each class.
p_spam = len(train_spam) / len(train_df_clean)
p_ham = len(train_ham) / len(train_df_clean)

# Total number of word occurrences in each class
# (the SMS column holds one token list per message).
n_spam = train_spam['SMS'].apply(len).sum()
n_ham = train_ham['SMS'].apply(len).sum()

# Vocabulary size.
n_vocab = len(training_vocab)

# Additive smoothing strength. NOTE(review): 1 (Laplace smoothing) is an
# assumption; the cell that originally set alpha is missing -- confirm.
alpha = 1

print(u'P(spam) = {} \nP(Ham) = {} \nNo. spam words = {} \nNo. ham words = {} \nNo. vocabulary words = {}'.format(p_spam, p_ham, n_spam, n_ham, n_vocab))

P(spam) = 0.13458950201884254 
P(Ham) = 0.8654104979811574 
No. spam words = 15190 
No. ham words = 57237 
No. vocabulary words = 7783


In [15]:
# Per-word likelihood tables, one entry per vocabulary word, initialised
# to zero and then filled with the smoothed estimate
#     (count of word in class + alpha) / (words in class + alpha * vocab size)
p_w_spam = dict.fromkeys(training_vocab, 0)
p_w_ham = dict.fromkeys(training_vocab, 0)

# The denominators do not depend on the word, so compute them once.
spam_denominator = n_spam + alpha * n_vocab
ham_denominator = n_ham + alpha * n_vocab

for word in training_vocab:
    # for spam
    p_word_given_spam = (train_spam[word].sum() + alpha) / spam_denominator
    p_w_spam[word] = p_word_given_spam

    # for ham
    p_word_given_ham = (train_ham[word].sum() + alpha) / ham_denominator
    p_w_ham[word] = p_word_given_ham
 
    


In [20]:
def classify(message):
    """Classify one SMS as ham or spam and print the verdict and both scores.

    The message is cleaned the same way as the training data (non-word
    characters replaced by spaces, lower-cased, split on whitespace). Each
    class score is the class prior multiplied by the likelihood of every
    token found in the corresponding table; tokens missing from a table are
    ignored for that class.

    Scores are accumulated as sums of logarithms. Multiplying many small
    probabilities directly underflows to 0.0 on long messages, which made
    both scores compare equal and produced a false tie.

    Parameters
    ----------
    message : str
        Raw text of the message to classify.

    Returns
    -------
    None
        The label and the two scores are printed, not returned. The printed
        scores can still show 0.0 for very long messages; the label is
        decided in log space and is unaffected.

    Notes
    -----
    Reads the module-level ``p_spam``, ``p_ham``, ``p_w_spam`` and
    ``p_w_ham``.
    """
    def _log(probability):
        # log(0) is undefined; -inf keeps a zero probability "impossible",
        # matching what a product with a zero factor would give.
        return math.log(probability) if probability > 0 else float('-inf')

    # Raw string: '\W' in a normal string literal is an invalid escape.
    tokens = re.sub(r'\W', ' ', message).lower().split()

    log_spam = _log(p_spam)
    log_ham = _log(p_ham)
    for token in tokens:
        if token in p_w_spam:
            log_spam += _log(p_w_spam[token])
        if token in p_w_ham:
            log_ham += _log(p_w_ham[token])

    if log_ham > log_spam:
        print('Label: Ham')
    elif log_ham < log_spam:
        print('Label: Spam')
    else:
        print('Equal probabilities, have human classify this')

    # Convert back to the probability scale for display only.
    p_spam_given_message = math.exp(log_spam)
    p_ham_given_message = math.exp(log_ham)
    print('P(Spam|Message) = {} \nP(Ham|Message) = {}'.format(p_spam_given_message, p_ham_given_message))
    
    
    
    

In [25]:
# Run the classifier on a hand-written example message.
classify('WINNER!! This is the secret code to unlock the money: C3421.')

Label: Spam
P(Spam|Message) = 1.3481290211300841e-25 
P(Ham|Message) = 1.9368049028589875e-27


In [26]:
# Run the classifier on a second hand-written example message.
classify('Thank you John. I will be there')

Label: Ham
P(Spam|Message) = 2.386543025203764e-23 
P(Ham|Message) = 2.4747123467757343e-18


In [29]:
# How well does this spam filter work?
# First, a variant of the classifier that returns only the label.
def classify_test(message):
    """Return the predicted label for one SMS.

    The message is cleaned (non-word characters replaced by spaces,
    lower-cased, split on whitespace) and scored for each class as the
    class prior times the likelihood of every token found in that class's
    table; tokens missing from a table are ignored for that class.

    Scores are accumulated as sums of logarithms. Multiplying many small
    probabilities directly underflows to 0.0 on long messages, which made
    both scores compare equal and mislabelled the message as a tie.

    Parameters
    ----------
    message : str
        Text of the message to classify.

    Returns
    -------
    str
        ``'ham'``, ``'spam'``, or ``'needs human classifiers'`` when both
        scores are exactly equal.

    Notes
    -----
    Reads the module-level ``p_spam``, ``p_ham``, ``p_w_spam`` and
    ``p_w_ham``.
    """
    def _log(probability):
        # log(0) is undefined; -inf keeps a zero probability "impossible",
        # matching what a product with a zero factor would give.
        return math.log(probability) if probability > 0 else float('-inf')

    # Raw string: '\W' in a normal string literal is an invalid escape.
    tokens = re.sub(r'\W', ' ', message).lower().split()

    log_spam = _log(p_spam)
    log_ham = _log(p_ham)
    for token in tokens:
        if token in p_w_spam:
            log_spam += _log(p_w_spam[token])
        if token in p_w_ham:
            log_ham += _log(p_w_ham[token])

    if log_ham > log_spam:
        return 'ham'
    elif log_ham < log_spam:
        return 'spam'
    else:
        return 'needs human classifiers'
        

In [30]:
# First five rows of the cleaned test set.
test_df.head()

Unnamed: 0,Label,SMS
0,ham,later i guess i needa do mcat study too
1,ham,but i haf enuff space got like 4 mb
2,spam,had your mobile 10 mths update to latest oran...
3,ham,all sounds good fingers makes it difficult ...
4,ham,all done all handed in don t know if mega sh...


In [32]:
# Label every test message with classify_test and store the result in a
# new 'predicted' column, then show the first five rows.
test_df['predicted'] = test_df['SMS'].map(classify_test)
test_df.head()

Unnamed: 0,Label,SMS,predicted
0,ham,later i guess i needa do mcat study too,ham
1,ham,but i haf enuff space got like 4 mb,ham
2,spam,had your mobile 10 mths update to latest oran...,spam
3,ham,all sounds good fingers makes it difficult ...,ham
4,ham,all done all handed in don t know if mega sh...,ham


In [85]:
# Accuracy = correctly classified messages / all classified messages.
total = len(test_df['SMS'])

# Row-wise comparison of the true and predicted labels; summing the boolean
# result counts the matches. int() keeps `correct` a plain Python integer.
is_correct = test_df['Label'] == test_df['predicted']
correct = int(is_correct.sum())

spam_filter_accuracy = (correct / total) * 100
print('Spam filter accuracy is {}%'.format(spam_filter_accuracy))

Spam filter accuracy is 98.74326750448833%
