In [1]:
import re
from collections import Counter, defaultdict
from math import exp, log
from random import seed, shuffle
from typing import Dict, List, NamedTuple, Set, Tuple, TypeVar

import pandas as pd

In [2]:
class Message(NamedTuple):
    """A single labelled text message.

    Fields:
        text: the raw message body.
        is_spam: True when the message is labelled spam, False when it is ham.
    """
    text: str
    is_spam: bool

In [3]:
csv_path = "spam.csv"

# Read only the two columns we need, rename them to the Message field names and
# convert the spam/ham labels to booleans. Chaining (instead of inplace=True)
# keeps the cell idempotent: re-running it always starts from the CSV.
dataFrame = (
    pd.read_csv(csv_path, encoding="latin-1")[["label", "message"]]
    .rename(columns={"label": "is_spam", "message": "text"})
    .assign(is_spam=lambda frame: frame["is_spam"].map({"spam": True, "ham": False}))
)

# .map() turns every label other than "spam"/"ham" into NaN, and NaN is truthy,
# so such rows would silently be counted as spam further down. Fail loudly instead.
if dataFrame["is_spam"].isna().any():
    raise ValueError('Unexpected value in the "label" column; expected only "spam" or "ham".')

# Zipping the two columns avoids building a Series per row as iterrows() does.
messages: List[Message] = [
    Message(text=text, is_spam=bool(is_spam))
    for text, is_spam in zip(dataFrame["text"], dataFrame["is_spam"])
]

In [4]:
# Seed the global RNG before the first shuffle so that this shuffle and every
# later random split give the same result under Restart Kernel -> Run All.
SEED = 42
seed(SEED)

shuffle(messages)
len(messages)

5572

In [5]:
def token(text: str) -> Set[str]:
    """Return the distinct lower-cased tokens found in ``text``.

    A token is a run of ASCII letters, digits and apostrophes; runs shorter
    than two characters are discarded.
    """
    return {
        match.lower()
        for match in re.findall(r"[A-Za-z0-9\']+", text)
        if len(match) >= 2
    }

In [6]:
# Sanity check for the tokenizer: tokens are lower-cased and de-duplicated,
# apostrophes stay inside a token, and the one-letter word "a" is dropped.
assert token("This is a text that will be tokenized, what if it doesn't get tokenized?") == {'this','is','text', 'that', 'will', 'be', 'tokenized', 'what', 'if', 'it', "doesn't", 'get'}

In [7]:
# Show the token set of the first message as a quick visual check of the tokenizer.
token(messages[0].text)

{'and',
 'around',
 'be',
 'constantly',
 'do',
 'doing',
 "don't",
 'ear',
 'go',
 'going',
 "i'm",
 'in',
 'is',
 "it's",
 'julianaland',
 'know',
 'listen',
 'mad',
 'me',
 'not',
 'oblivious',
 'off',
 'on',
 'one',
 'other',
 'out',
 'problem',
 'same',
 'say',
 'surprised',
 'tell',
 'that',
 'the',
 'then',
 'they',
 'things',
 'to',
 'upset',
 'walk',
 'want',
 'what',
 'whatever',
 'when',
 'while',
 'why',
 'you'}

In [8]:
T = TypeVar("T")


def dataset_split(messages: List[T], pct: float = 0.8) -> Tuple[List[T], List[T]]:
    """Randomly split ``messages`` into a training part and a test part.

    Args:
        messages: items to split. The list itself is left untouched; the
            shuffle is applied to a copy.
        pct: fraction of the items placed in the training part, in [0, 1].

    Returns:
        ``(train, test)``: two new lists that together contain every input
        item exactly once.

    Raises:
        ValueError: if ``pct`` is not between 0 and 1.
    """
    if not 0 <= pct <= 1:
        raise ValueError(f"pct must be between 0 and 1, got {pct}")

    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(messages)
    shuffle(shuffled)

    cut = round(len(shuffled) * pct)
    return shuffled[:cut], shuffled[cut:]

In [9]:
# Split once and check that the two parts together account for every message.
# Calling dataset_split twice would add up lengths taken from two different
# shuffles, so it would not actually verify a single split.
split_train, split_test = dataset_split(messages)
assert len(split_train) + len(split_test) == len(messages)

In [10]:
class NaiveBayes:
    """Naive Bayes spam classifier working on the set of tokens of each message.

    Every word of the training vocabulary contributes to a prediction: a word
    present in the text contributes P(word | class), an absent word contributes
    1 - P(word | class). Word probabilities are smoothed with the pseudo-count
    ``k``: (k + messages of the class containing the word) / (2k + messages of
    the class).
    """

    def __init__(self, k: int = 1) -> None:
        """Create an untrained classifier.

        Args:
            k: smoothing pseudo-count added to every word count.
        """
        self._k: int = k
        # Number of training messages seen per class.
        self._num_spam_messages: int = 0
        self._num_ham_messages: int = 0
        # Per class: word -> number of training messages containing that word.
        self._num_words_spam: Dict[str, int] = defaultdict(int)
        self._num_words_ham: Dict[str, int] = defaultdict(int)
        # Words seen in spam messages, in ham messages, and in any message.
        self._spam_words: Set[str] = set()
        self._ham_words: Set[str] = set()
        self._words: Set[str] = set()

    def train(self, messages: List[Message]) -> None:
        """Update the counts with ``messages``; may be called more than once."""
        message: Message
        tok: str
        for message in messages:
            tokens: Set[str] = token(message.text)
            self._words.update(tokens)
            if message.is_spam:
                self._num_spam_messages += 1
                self._spam_words.update(tokens)
                for tok in tokens:
                    self._num_words_spam[tok] += 1
            else:
                self._num_ham_messages += 1
                self._ham_words.update(tokens)
                for tok in tokens:
                    self._num_words_ham[tok] += 1

    def _spam(self, word: str) -> float:
        """Smoothed probability that a spam message contains ``word``."""
        # .get() rather than [] so that looking up an unseen word does not
        # insert a zero-count entry into the defaultdict.
        count = self._num_words_spam.get(word, 0)
        return (self._k + count) / ((2 * self._k) + self._num_spam_messages)

    def _ham(self, word: str) -> float:
        """Smoothed probability that a ham message contains ``word``."""
        # .get() rather than [] for the same reason as in _spam.
        count = self._num_words_ham.get(word, 0)
        return (self._k + count) / ((2 * self._k) + self._num_ham_messages)

    def predict(self, text: str) -> float:
        """Return the estimated probability, in [0, 1], that ``text`` is spam.

        The result is P(words | spam) / (P(words | spam) + P(words | ham));
        the numbers of spam and ham training messages are not used as priors.
        Words of ``text`` that are not in the training vocabulary are ignored.
        """
        text_words: Set[str] = token(text)
        log_spam: float = 0.0
        log_ham: float = 0.0
        # Lower bound for probabilities so log() never receives 0 (possible when k == 0).
        epsilon = 1e-10

        # Sum logs instead of multiplying many small probabilities.
        for word in self._words:
            word_spam: float = self._spam(word)
            word_ham: float = self._ham(word)
            if word in text_words:
                log_spam += log(max(word_spam, epsilon))
                log_ham += log(max(word_ham, epsilon))
            else:
                log_spam += log(max(1 - word_spam, epsilon))
                log_ham += log(max(1 - word_ham, epsilon))

        word_if_spam: float = exp(log_spam)
        word_if_ham: float = exp(log_ham)
        total: float = word_if_spam + word_if_ham
        if total > 0.0:
            return word_if_spam / total

        # Both likelihoods underflowed to 0.0 (very negative log sums), which
        # would be a 0/0 division. Compute the same ratio from the difference
        # of the logs instead: 1 / (1 + exp(log_ham - log_spam)).
        log_ratio: float = log_ham - log_spam
        if log_ratio > 700.0:
            # exp() would overflow; the probability is indistinguishable from 0.
            return 0.0
        return 1.0 / (1.0 + exp(log_ratio))
        
    

In [11]:
def test_naive_bayes():
    """Train on three tiny messages and check the counts and one prediction by hand."""
    messages: List[Message] = [
        Message('Spam message', is_spam=True),
        Message('Ham message', is_spam=False),
        Message('Ham message about Spam', is_spam=False)]

    nb: NaiveBayes = NaiveBayes()
    nb.train(messages)

    assert nb._num_spam_messages == 1
    assert nb._num_ham_messages == 2
    assert nb._spam_words == {'spam', 'message'}
    assert nb._ham_words == {'ham', 'message', 'about', 'spam'}
    assert nb._num_words_spam == {'spam': 1, 'message': 1}
    assert nb._num_words_ham == {'ham': 2, 'message': 2, 'about': 1, 'spam': 1}
    assert nb._words == {'spam', 'message', 'ham', 'about'}

    # Tokenizes to {'spam', 'message'}: the one-letter word 'a' is dropped.
    text: str = 'A spam message'

    # Likelihood of the text given spam, with k = 1 and 1 spam message.
    word_if_spam: float = exp(sum([
        log(     (1 + 1) / ((2 * 1) + 1)),   # 'spam' (present)
        log(     (1 + 1) / ((2 * 1) + 1)),   # 'message' (present)
        log(1 - ((1 + 0) / ((2 * 1) + 1))),  # 'ham' (absent)
        log(1 - ((1 + 0) / ((2 * 1) + 1))),  # 'about' (absent)
    ]))

    # Likelihood of the text given ham, with k = 1 and 2 ham messages.
    word_if_ham: float = exp(sum([
        log(     (1 + 1)  / ((2 * 1) + 2)),   # 'spam' (present)
        log(     (1 + 2)  / ((2 * 1) + 2)),   # 'message' (present)
        log(1 - ((1 + 2)  / ((2 * 1) + 2))),  # 'ham' (absent)
        log(1 - ((1 + 1)  / ((2 * 1) + 2))),  # 'about' (absent)
    ]))

    word_spam: float = word_if_spam / (word_if_spam + word_if_ham)

    # predict() adds its logs in set-iteration order, which is not fixed for
    # strings, so the float sum can differ from the one above in the last
    # bits. Compare with a tolerance instead of ==.
    assert abs(word_spam - nb.predict(text)) < 1e-9

test_naive_bayes()

In [12]:
# Declare the element types, then draw a random train/test split of the
# messages (dataset_split's default pct=0.8 sends 80% of them to `train`).
train: List[Message]
test: List[Message]

train, test = dataset_split(messages)

In [13]:
# Fit a fresh classifier (default smoothing k=1) on the training part only.
nb: NaiveBayes = NaiveBayes()
nb.train(train)

# Report the class counts and the 25 words that occur in the most spam
# training messages (each message counts a word at most once).
print(f'Spam messages in training data: {nb._num_spam_messages}')
print(f'Ham messages in training data: {nb._num_ham_messages}')
print(f'Most spammy words: {Counter(nb._num_words_spam).most_common(25)}')

Spam messages in training data: 589
Ham messages in training data: 3869
Most spammy words: [('to', 357), ('call', 275), ('your', 192), ('you', 191), ('now', 146), ('or', 139), ('for', 138), ('free', 136), ('the', 119), ('is', 112), ('txt', 108), ('from', 105), ('have', 98), ('mobile', 94), ('on', 94), ('with', 90), ('claim', 85), ('and', 85), ('text', 84), ('ur', 80), ('stop', 79), ('www', 76), ('reply', 75), ('of', 68), ('only', 68)]


In [14]:
# Keep only the held-out messages labelled spam and preview the first five.
spam_messages: List[Message] = [msg for msg in test if msg.is_spam]
spam_messages[:5]

[Message(text='Sunshine Quiz Wkly Q! Win a top Sony DVD player if u know which country the Algarve is in? Txt ansr to 82277. ï¿½1.50 SP:Tyrone', is_spam=True),
 Message(text='Get your garden ready for summer with a FREE selection of summer bulbs and seeds worth ï¿½33:50 only with The Scotsman this Saturday. To stop go2 notxt.co.uk', is_spam=True),
 Message(text="Free entry in 2 a weekly comp for a chance to win an ipod. Txt POD to 80182 to get entry (std txt rate) T&C's apply 08452810073 for details 18+", is_spam=True),
 Message(text='We tried to contact you re your reply to our offer of a Video Phone 750 anytime any network mins Half Price Line Rental Camcorder Reply or call 08000930705', is_spam=True),
 Message(text="UpgrdCentre Orange customer, you may now claim your FREE CAMERA PHONE upgrade for your loyalty. Call now on 0207 153 9153. Offer ends 26th July. T&C's apply. Opt-out available", is_spam=True)]

In [21]:
# Score one held-out spam message: index 5 is the first one after the five
# previewed with spam_messages[:5].
# NOTE(review): assumes the test split holds at least six spam messages.
message: str = spam_messages[5].text

print(f'Predicting likelihood of "{message}" being spam.')
nb.predict(message)

Predicting likelihood of "network operator. The service is free. For T & C's visit 80488.biz" being spam.


0.9997157312280469

In [16]:
# Keep only the held-out messages labelled ham and preview the first five.
ham_messages: List[Message] = [msg for msg in test if not msg.is_spam]
ham_messages[:5]

[Message(text='And do you have any one that can teach me how to ship cars.', is_spam=False),
 Message(text='S.i think he is waste for rr..', is_spam=False),
 Message(text='Daddy will take good care of you :)', is_spam=False),
 Message(text='Can you do online transaction?', is_spam=False),
 Message(text='Hmmm:)how many players selected?', is_spam=False)]

In [20]:
# Score one held-out ham message: index 5 is the first one after the five
# previewed with ham_messages[:5].
# NOTE(review): assumes the test split holds at least six ham messages.
message: str = ham_messages[5].text

print(f'Predicting likelihood of "{message}" being spam.')
nb.predict(message)

Predicting likelihood of "Probably gonna swing by in a wee bit" being spam.


2.9994202226382373e-12