# Building a Spam Filter with Naive Bayes

The goal of this project is to build a Naive Bayes spam filter for SMS messages, using the SMS Spam Collection dataset provided by Tiago A. Almeida and Jose Maria Gomez. The algorithm is implemented from scratch for educational purposes.

In [1]:
import math
import re

import pandas as pd

In [2]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named here.
spam_dataset = pd.read_csv(
    "SMSSpamCollection",
    delimiter="\t",
    header=None,
    names=["Label", "SMS"],
)

# 1. Reading and Exploring the Dataset

In [3]:
# Preview the first rows to check that the Label and SMS columns were parsed.
spam_dataset.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Number of non-null entries in each column.
spam_dataset.count()

Label    5572
SMS      5572
dtype: int64

### Getting the percentage of spam and ham (non-spam) messages

In [5]:
# Share of each label, expressed as a percentage of all messages.
temp = spam_dataset.groupby("Label").count()
temp = temp.div(temp.sum()).mul(100)
temp.round(2).head()

Unnamed: 0_level_0,SMS
Label,Unnamed: 1_level_1
ham,86.59
spam,13.41


# 2. Getting the Train and Test dataset
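The plan:
1. Shuffle the dataset (with a fixed random seed, so the split is reproducible).
2. Compute the split index at 80% of the dataset length (`round(len(dataset) * 0.8)`).
3. Split the shuffled dataset 80:20 at that index: the first 80% becomes the training set and the remaining 20% the test set.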

In [6]:
# Shuffle all rows; the fixed seed keeps the split reproducible.
sample = spam_dataset.sample(frac=1, random_state=1)

# Row position separating the first 80% (training) from the last 20% (test).
training_test_index = round(0.8 * len(sample))

train_ds = sample.iloc[:training_test_index].reset_index(drop=True)
test_ds = sample.iloc[training_test_index:].reset_index(drop=True)

# Both subsets should show roughly the same spam/ham proportions.
for subset in (train_ds, test_ds):
    print(subset["Label"].value_counts(normalize=True))

ham     0.86541
spam    0.13459
Name: Label, dtype: float64
ham     0.868043
spam    0.131957
Name: Label, dtype: float64


# 3. Cleaning the Data

In [7]:
def remove_punct(sms):
    """Replace every non-word character in ``sms`` with a single space.

    A "word character" is whatever the regex class ``\\w`` matches (letters,
    digits and underscore), so punctuation and whitespace are each turned
    into one space. The length of the string is unchanged.

    Parameters
    ----------
    sms : str
        Raw message text.

    Returns
    -------
    str
        The message with all non-word characters replaced by spaces.
    """
    # Raw string: '\W' in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r'\W', ' ', sms)

In [8]:
# Strip punctuation from every training message, then lower-case it.
# ``apply`` performs the element-wise call; ``Series.agg`` with a callable that
# does not aggregate is deprecated in pandas 2.1+ and emits a FutureWarning.
train_ds["SMS"] = train_ds["SMS"].apply(remove_punct).str.lower()

# 4. Cleaning the data pt.2

In [9]:
# Tokenise: split each cleaned message on whitespace so that every SMS becomes a list of words
train_ds["SMS"] = train_ds["SMS"].str.split()

In [10]:
vocabulary = []


def add_to_voc(sms):
    """Append every word of one tokenised message to the global ``vocabulary``."""
    vocabulary.extend(sms)


# A plain loop replaces ``Series.agg``: the helper returns nothing, and ``agg``
# with a callable that does not aggregate is deprecated in pandas 2.1+.
for sms in train_ds["SMS"]:
    add_to_voc(sms)

print("With duplicates: ")
print(len(vocabulary))

# Deduplicate. Sorting gives the vocabulary (and every structure built from it)
# a reproducible order; the iteration order of a set of strings can differ
# between kernel runs.
vocabulary = sorted(set(vocabulary))
print("Without duplicates: ")
print(len(vocabulary))

With duplicates: 
72427
Without duplicates: 
7783


# 5. Turning each vocabulary word into a column of the training dataset

In [11]:
# One zero-filled count list per vocabulary word, with one slot per training message.
n_rows = len(train_ds['SMS'])
word_counts_per_sms = {}
for unique_word in vocabulary:
    word_counts_per_sms[unique_word] = [0] * n_rows

# Tally how often each word occurs in each message.
for row, tokens in enumerate(train_ds["SMS"]):
    for token in tokens:
        word_counts_per_sms[token][row] += 1

# Attach the word-count columns next to the original Label/SMS columns.
word_count_df = pd.DataFrame(word_counts_per_sms)
training_set = pd.concat([train_ds, word_count_df], axis=1)

# 6. Calculating the relevant parameters

In [12]:
# Partition the training set by label.
is_ham = training_set["Label"] == "ham"
nonspam = training_set[is_ham]
spam = training_set[~is_ham]

# Priors P(NONSPAM) and P(SPAM): share of training messages carrying each label.
n_training_messages = training_set["SMS"].count()
p_nonspam = nonspam["SMS"].count() / n_training_messages
p_spam = spam["SMS"].count() / n_training_messages

# N_spam and N_nonspam: total number of words (duplicates included) per class.
n_spam = spam["SMS"].map(len).sum()
n_nonspam = nonspam["SMS"].map(len).sum()

# Number of unique words seen in the training messages.
n_voc = len(vocabulary)

# Additive smoothing constant used when estimating the word likelihoods.
alpha = 1

In [13]:
# Summarise the estimated priors and the vocabulary size.
# p_nonspam and p_spam are plain scalars, so they are used directly
# (the previously commented-out version indexed them with ["SMS"]).
print("Probability Non-Spam: %", round(p_nonspam * 100, 2))
print("Probability Spam: %", round(p_spam * 100, 2))
print("Number of words in Vocabulary: ", n_voc)

# 7. Calculating P(w|Spam) and P(w|nonSpam)

In [14]:
# Rows of the word-count table belonging to each class.
spam_messages = training_set[training_set["Label"] != "ham"]
nonspam_messages = training_set[training_set["Label"] == "ham"]

# Total occurrences of every vocabulary word within each class, computed with a
# single column-wise sum instead of one Python-level sum per word.
n_word_given_spam = spam_messages[vocabulary].sum()
n_word_given_nonspam = nonspam_messages[vocabulary].sum()

# Smoothed likelihoods:
#   P(w|class) = (N_w|class + alpha) / (N_class + alpha * N_vocabulary)
# stored as {word: probability} dictionaries for fast lookup at prediction time.
spam_word_probability = (
    (n_word_given_spam + alpha) / (n_spam + (alpha * n_voc))
).to_dict()
nonspam_word_probability = (
    (n_word_given_nonspam + alpha) / (n_nonspam + (alpha * n_voc))
).to_dict()

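$$P(w_i|Spam) = \frac{N_{w_i|Spam} + \alpha}{N_{Spam} + \alpha \cdot N_{Vocabulary}}$$

$$P(w_i|Ham) = \frac{N_{w_i|Ham} + \alpha}{N_{Ham} + \alpha \cdot N_{Vocabulary}}$$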


$N_{w_i|Spam}$ is the number of times the word $w_i$ occurs in all the spam messages.

$N_{w_i|Ham}$ is the number of times the word $w_i$ occurs in all the ham messages.

# 8. Creating a function for Spam Filtering

In [15]:
import re

def classify(message):
    """Print the Naive Bayes scores and the predicted label for one raw SMS.

    The message is cleaned like the training data: non-word characters are
    replaced by spaces, the text is lower-cased and split on whitespace.
    Words that are not keys of the word-probability dictionaries are ignored.

    Reads the notebook-level ``p_spam``, ``p_nonspam``,
    ``spam_word_probability`` and ``nonspam_word_probability``.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    None
        The two scores and the label are printed, not returned.
    """
    def _log(p):
        # log(0) is undefined; a zero probability is mapped to -inf.
        return math.log(p) if p > 0 else float('-inf')

    message = re.sub(r'\W', ' ', message)
    message = message.lower()
    message = message.split()

    # The raw products are kept for display only. On long messages they can
    # underflow to 0.0 for both classes, so they are not used for the decision.
    p_spam_given_message = p_spam
    p_ham_given_message = p_nonspam

    # The decision is taken on sums of logs, which rank the classes the same
    # way as the products but do not underflow.
    log_spam_score = _log(p_spam)
    log_ham_score = _log(p_nonspam)

    for word in message:
        # Membership is tested on the dictionaries themselves (hash lookup)
        # instead of on freshly built key lists.
        if word in spam_word_probability:
            p_spam_given_message *= spam_word_probability[word]
            log_spam_score += _log(spam_word_probability[word])
        if word in nonspam_word_probability:
            p_ham_given_message *= nonspam_word_probability[word]
            log_ham_score += _log(nonspam_word_probability[word])

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if log_ham_score > log_spam_score:
        print('Label: Ham')
    elif log_ham_score < log_spam_score:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [16]:
# Sanity check on a message that looks like spam.
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam


In [17]:
# Sanity check on an ordinary conversational message.
classify("Sounds good, Tom, then see u there")

P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham


In [18]:
# Preview the held-out test messages (still raw, uncleaned text).
test_ds.head()

Unnamed: 0,Label,SMS
0,ham,Later i guess. I needa do mcat study too.
1,ham,But i haf enuff space got like 4 mb...
2,spam,Had your mobile 10 mths? Update to latest Oran...
3,ham,All sounds good. Fingers . Makes it difficult ...
4,ham,"All done, all handed in. Don't know if mega sh..."


# 9. Testing the model on the test dataset

In [24]:

def classify_test(message):
    """Return the Naive Bayes label for one raw SMS.

    The message is cleaned like the training data: non-word characters are
    replaced by spaces, the text is lower-cased and split on whitespace.
    Words that are not keys of the word-probability dictionaries are ignored.

    Scores are accumulated as sums of logs rather than as products of
    probabilities: on long messages the products underflow to 0.0 for both
    classes, which would turn a clear decision into a false tie.

    Reads the notebook-level ``p_spam``, ``p_nonspam``,
    ``spam_word_probability`` and ``nonspam_word_probability``.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    str
        ``"ham"``, ``"spam"``, or ``'have a human classify this!'`` when both
        classes score exactly the same.
    """
    def _log(p):
        # log(0) is undefined; a zero probability is mapped to -inf.
        return math.log(p) if p > 0 else float('-inf')

    message = re.sub(r'\W', ' ', message)
    message = message.lower()
    message = message.split()

    log_spam_score = _log(p_spam)
    log_ham_score = _log(p_nonspam)

    for word in message:
        # Membership is tested on the dictionaries themselves (hash lookup)
        # instead of on freshly built key lists.
        if word in spam_word_probability:
            log_spam_score += _log(spam_word_probability[word])
        if word in nonspam_word_probability:
            log_ham_score += _log(nonspam_word_probability[word])

    if log_ham_score > log_spam_score:
        return "ham"
    elif log_ham_score < log_spam_score:
        return "spam"
    else:
        return 'have a human classify this!'

In [25]:
# Label every test message with the classifier.
# ``apply`` performs the element-wise call; ``Series.agg`` with a callable that
# does not aggregate is deprecated in pandas 2.1+ and emits a FutureWarning.
test_ds["predicted"] = test_ds["SMS"].apply(classify_test)
test_ds.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [36]:
# Accuracy = correctly labelled test messages / all test messages.
is_correct = test_ds["Label"] == test_ds["predicted"]
correct = test_ds.loc[is_correct, "SMS"].count()
total = test_ds["SMS"].count()
accuracy = correct / total
accuracy

In [40]:
# Report the accuracy as a percentage.
print(accuracy*100)

98.74326750448833


The model we built from scratch gave us a 98% accuracy. This indicates that we have built a good 