# Spam Detection using Naive Bayes 

In [1]:
import numpy as np 

## Collecting the Dataset 

In [1]:
spam = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 
    "WINNER!! As a valued network customer you have been selected to receivea $900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.", 
    "URGENT! You have won a 1 week FREE membership in our $100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18.", 
    "Please call our new customer service representative on 0800 169 6031 you WON a prize", 
    "We are trying to contact you. Last weekends customer draw shows that you won a £1000 prize GUARANTEED. Calling years", 
]

In [2]:
# leave one sentence from spam for testing our model later 
spam_test = ["Please call our new customer service representative on 0800 169 6031 you WON a prize"]


In [3]:
non = [
    "I don't think he goes to usf, he lives around here though", 
    "New car and house for my parents. i have only new job in hand", 
    "Great escape. I fancy the bridge but needs her lager. See you tomorrow", 
    "Tired. I haven't slept well the past few nights.",
    "Too late. I said i have the website. I didn't i have or dont have the slippers", 
    "I might come by tonight then if my class lets out early", 
    "Jos ask if u wana meet up?", 
    "That would be great. We'll be at the Guild. We can try meeting with the customer on Bristol road or somewhere"
    ]

In [4]:
# another sentence from non for testing our model 
spam_test_2 = ["That would be great. We'll be at the Guild. We can try meeting with the customer on Bristol road or somewhere"]

## Basic Pre-Processing

In [None]:
# !pip install gensim

In [5]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.porter import PorterStemmer
from gensim.utils import tokenize

In [6]:
#test_sentence = non[4]
test_sentence = non[5]


print(test_sentence)

removed_stops = remove_stopwords(test_sentence)
print(removed_stops)

p = PorterStemmer()
stemmed = p.stem(removed_stops)
print(stemmed)

tokens = tokenize(stemmed)
print(list(tokens))

I might come by tonight then if my class lets out early
I come tonight class lets early
i come tonight class lets earli
['i', 'come', 'tonight', 'class', 'lets', 'earli']


## Create a dictionary of words 

In [7]:
def tokenize_sentence(sentence): 
    p = PorterStemmer()
    removed_stops = remove_stopwords(sentence)
    stemmed = p.stem(removed_stops)
    tokens = tokenize(stemmed)
    return list(tokens)

In [8]:
dictionary = set()     # will have unique values only 
spams_tokenized = [] 
nons_tokenized = [] 


for sentence in spam:      
    sentence_tokens = tokenize_sentence(sentence)
    spams_tokenized.append(sentence_tokens)
    dictionary  = dictionary.union(sentence_tokens)   # add sentence words to the dictionary  
    
    

for sentence in non:      
    sentence_tokens = tokenize_sentence(sentence)
    nons_tokenized.append(sentence_tokens)
    dictionary  = dictionary.union(sentence_tokens)


    
print("Tokenized spam: ", spams_tokenized)
print("Tokenized non:  ", nons_tokenized)
print("Dictionary:     ", dictionary)

Tokenized spam:  [['free', 'entry', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', 'st', 'may', 'text', 'fa', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 't', 'c', 's', 'apply', 'over'], ['winner', 'as', 'valued', 'network', 'customer', 'selected', 'receivea', 'prize', 'reward', 'to', 'claim', 'claim', 'code', 'kl', 'valid', 'hours', 'only'], ['urgent', 'you', 'won', 'week', 'free', 'membership', 'prize', 'jackpot', 'txt', 'word', 'claim', 'no', 't', 'c', 'www', 'dbuk', 'net', 'lccltd', 'pobox', 'ldnw', 'a', 'rw'], ['please', 'new', 'customer', 'service', 'representative', 'won', 'pr'], ['we', 'trying', 'contact', 'you', 'last', 'weekends', 'customer', 'draw', 'shows', 'won', 'prize', 'guaranteed', 'calling', 'year']]
Tokenized non:   [['i', 'don', 't', 'think', 'goes', 'usf', 'l'], ['new', 'car', 'house', 'parents', 'new', 'job', 'hand'], ['great', 'escape', 'i', 'fancy', 'bridge', 'needs', 'lager', 'see', 'tomorrow'], ['tired', 'i', 'haven', 't', 'slept', 'past', 'nig

## Basic Stats 

In [10]:
# These things do not depend on an individual word so let's calculate them separately once 

total_word_count = len(dictionary)
total_spam_messages = len(spams_tokenized) 
total_all_messages = len(spams_tokenized) + len(nons_tokenized)

print("Total Number of words: ", total_word_count)

Total Number of words:  118


In [11]:
# P(spam) does not depend on an individual word so let's calculate that separately once 

p_spam = total_spam_messages / total_all_messages

print("P(spam) = ", p_spam)

P(spam) =  0.38461538461538464


In [12]:
# Helper function to count occurances 

def count_word_in_messages(word, messages): 
    total_count = 0
    for msg in messages: 
        if word in msg:       # this ensured uniqueness automatically  
            total_count += 1 
            
    return total_count 

## The Actual Probability Computation 

In [13]:
final_prob = 1   # can't start from 0 


for test_sentence in spam_test: 
    test_sentence = tokenize_sentence(test_sentence)
    print(test_sentence)
    
    # let's run this for each word separately 
    for word in test_sentence: 
        print("----------------")
        print("Runnig for word:", word)
        
        # Find P( w | spam)
        spam_count = count_word_in_messages(word, spams_tokenized)
        p_w_spam = spam_count / total_spam_messages 
        print("P( w | spam)  = ", p_w_spam)
        
        # Find P( w )
        w_count = count_word_in_messages(word, spams_tokenized)
        w_count += count_word_in_messages(word, nons_tokenized)
        p_w = w_count / total_all_messages
        print("P( w )        = ", p_w)
        
        
        # Find P( spam | w )
        p_spam_w = (p_w_spam * p_spam) / p_w
        print("P( spam )     = ", p_spam)
        print("P( spam | w ) = ", p_spam_w)
        print("")
        final_prob *= p_spam_w
        
        
    print("P( spam | all_words ) = ", final_prob)

['please', 'new', 'customer', 'service', 'representative', 'won', 'pr']
----------------
Runnig for word: please
P( w | spam)  =  0.2
P( w )        =  0.07692307692307693
P( spam )     =  0.38461538461538464
P( spam | w ) =  1.0

----------------
Runnig for word: new
P( w | spam)  =  0.2
P( w )        =  0.15384615384615385
P( spam )     =  0.38461538461538464
P( spam | w ) =  0.5

----------------
Runnig for word: customer
P( w | spam)  =  0.6
P( w )        =  0.3076923076923077
P( spam )     =  0.38461538461538464
P( spam | w ) =  0.75

----------------
Runnig for word: service
P( w | spam)  =  0.2
P( w )        =  0.07692307692307693
P( spam )     =  0.38461538461538464
P( spam | w ) =  1.0

----------------
Runnig for word: representative
P( w | spam)  =  0.2
P( w )        =  0.07692307692307693
P( spam )     =  0.38461538461538464
P( spam | w ) =  1.0

----------------
Runnig for word: won
P( w | spam)  =  0.6
P( w )        =  0.23076923076923078
P( spam )     =  0.384615384615384