# Building a Spam Filter with Naive Bayes

The goal of this project is to build a spam filter with the Naive Bayes algorithm, using the SMS dataset provided by Tiago A. Almeida and Jose Maria Gomez. The algorithm is implemented by hand for educational purposes.

In [1]:
import pandas as pd
import re

In [2]:
# Parse the file as tab-separated with no header row; column names are supplied here.
spam_dataset = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["Label", "SMS"],
)

# Reading and Exploring the Dataset

In [3]:
# Preview the first five rows (the default for head).
spam_dataset.head(n=5)

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Number of non-null entries in each column.
spam_dataset.count(axis=0)

Label    5572
SMS      5572
dtype: int64

### Getting the percentage of spam and ham (non-spam) messages

In [5]:
# Count messages per label, then express each count as a percentage of the total.
label_counts = spam_dataset.groupby("Label").count()
temp = label_counts.div(label_counts.sum()).mul(100)
temp.round(2).head()

Unnamed: 0_level_0,SMS
Label,Unnamed: 1_level_1
ham,86.59
spam,13.41


## Getting the Train and Test dataset
The plan:
1. Randomize the data set first
2. Compute the split index at the 80% mark (length of the dataset * 0.80)
3. Split the dataset 80:20 at that index

In [6]:
# frac=1 returns every row in shuffled order; the fixed seed makes the shuffle repeatable.
sample = spam_dataset.sample(frac=1, random_state=1)

In [7]:
# Show only the first rows of the shuffled frame instead of displaying all of it.
sample.head(10)

Unnamed: 0,Label,SMS
1078,ham,"Yep, by the pretty sculpture"
4028,ham,"Yes, princess. Are you going to make me moan?"
958,ham,Welp apparently he retired
4642,ham,Havent.
4674,ham,I forgot 2 ask ü all smth.. There's a card on ...
5461,ham,Ok i thk i got it. Then u wan me 2 come now or...
4210,ham,I want kfc its Tuesday. Only buy 2 meals ONLY ...
4216,ham,No dear i was sleeping :-P
1603,ham,Ok pa. Nothing problem:-)
1504,ham,Ill be there on &lt;#&gt; ok.


In [8]:
# Row position separating the first 80% of the shuffled data from the remaining 20%.
training_test_index = round(0.8 * len(sample))

In [9]:
# Positional split of the shuffled frame; each part gets a fresh 0..n-1 index.
train_ds = sample.iloc[:training_test_index].reset_index(drop=True)
test_ds = sample.iloc[training_test_index:].reset_index(drop=True)

In [10]:
# Relative frequency of each label in the training split.
train_ds["Label"].value_counts(normalize=True)

ham     0.86541
spam    0.13459
Name: Label, dtype: float64

In [11]:
# Relative frequency of each label in the test split.
test_ds["Label"].value_counts(normalize=True)

ham     0.868043
spam    0.131957
Name: Label, dtype: float64

In [12]:
# Preview the first five training rows.
train_ds.head(n=5)

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [13]:
def remove_punct(sms):
    """Replace every non-word character in ``sms`` with a single space.

    Word characters (letters, digits and underscore) are kept as they are;
    everything else, including existing whitespace, becomes one space, so the
    result has the same length as the input.

    Args:
        sms: The message text as a ``str``.

    Returns:
        The message with each non-word character replaced by ``' '``.
    """
    # Raw string: "\W" in a normal literal is an invalid escape sequence that
    # newer Python versions warn about.
    return re.sub(r'\W', ' ', sms)

In [14]:
# Strip punctuation and lower-case the messages of BOTH splits so that test
# tokens are normalised exactly like the training tokens they are compared to.
# Series.apply is used because remove_punct works on one message at a time;
# passing such an element-wise function to Series.agg is deprecated in recent
# pandas releases.
train_ds["SMS"] = train_ds["SMS"].apply(remove_punct).str.lower()
test_ds["SMS"] = test_ds["SMS"].apply(remove_punct).str.lower()

# Converting every word into a column of per-message counts

## Building the Vocabulary (its size is needed for the smoothing term)

In [15]:
# Turn each message string into a list of whitespace-separated tokens.
for frame in (train_ds, test_ds):
    frame["SMS"] = frame["SMS"].str.split()

In [16]:
vocabulary = []


def add_to_voc(sms):
    """Append every token of one tokenized message to the global ``vocabulary``.

    Args:
        sms: An iterable of word strings (one tokenized message).
    """
    vocabulary.extend(sms)


# A plain loop is used instead of Series.agg: passing an element-wise function
# to Series.agg is deprecated in recent pandas releases.
for sms in train_ds["SMS"]:
    add_to_voc(sms)
print("With duplicates: ")
print(len(vocabulary))
# Deduplicate; sorting gives the same word order on every kernel run, whereas
# the iteration order of a set of strings can change between runs.
vocabulary = sorted(set(vocabulary))
print("Without duplicates: ")
print(len(vocabulary))

With duplicates: 
72427
Without duplicates: 
7783


## We will use a dictionary because each key will become a column

In [17]:
# One zero-filled list per vocabulary word, with one slot for every training message.
n_messages = len(train_ds['SMS'])
word_counts_per_sms = {}
for unique_word in vocabulary:
    word_counts_per_sms[unique_word] = [0] * n_messages

In [18]:
# Preview the first five tokenized messages.
train_ds["SMS"].head(n=5)

0                    [yep, by, the, pretty, sculpture]
1    [yes, princess, are, you, going, to, make, me,...
2                      [welp, apparently, he, retired]
3                                             [havent]
4    [i, forgot, 2, ask, ü, all, smth, there, s, a,...
Name: SMS, dtype: object

In [19]:
# Tally, for each training message (by row position), how often each word occurs in it.
for row, tokens in enumerate(train_ds["SMS"]):
    for token in tokens:
        word_counts_per_sms[token][row] += 1

In [20]:
# Each dictionary key becomes a column; each list position becomes a row.
word_count_df = pd.DataFrame(data=word_counts_per_sms)

In [24]:
# Place the word-count columns next to Label/SMS; rows are matched on the index.
training_set = pd.concat([train_ds, word_count_df], axis="columns")

In [40]:
# Class shares of the training set, stored as percentages rounded to two
# decimals (i.e. 0-100 values, not 0-1 fractions).
total_messages = training_set["SMS"].count()
is_ham = training_set["Label"] == "ham"

p_nonspam = (training_set.loc[is_ham, "SMS"].count() / total_messages * 100).round(2)
p_spam = (training_set.loc[~is_ham, "SMS"].count() / total_messages * 100).round(2)

# Number of distinct words in the vocabulary.
n_voc = len(vocabulary)

# Presumably the additive smoothing constant used later -- confirm against its use.
alpha = 1

In [49]:
# Report the class percentages and the vocabulary size, one line each.
for caption, value in (
    ("Probability Non-Spam: %", p_nonspam),
    ("Probability Spam: %", p_spam),
    ("Number of words in Vocabulary: ", n_voc),
):
    print(caption, value)

Probability Non-Spam: % 86.54
Probability Spam: % 13.46
Number of words in Vocabulary:  7783


In [56]:
# Two independent per-word tallies, one per class, each starting at zero.
spam_counter = dict.fromkeys(vocabulary, 0)
nonspam_counter = dict.fromkeys(vocabulary, 0)

In [58]:
# Split the training set by label: rows labelled "ham" versus every other row.
is_ham = training_set["Label"] == "ham"
nonspam_df = training_set[is_ham]
spam_df = training_set[~is_ham]