In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import string
import re

In [2]:
# Seed NumPy's global RNG up front; presumably this is what makes the later
# train_test_split (called without random_state) repeatable -- TODO confirm.
np.random.seed(777)

In [3]:
# Load the SMS CSV, keep only the label and text columns, and give them
# readable names in one chained expression.
df = (
    pd.read_csv('../data/sms_data_uci.csv', encoding='latin')
    [['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)

In [4]:
# Sanity-check the frame's dimensions (rows, columns).
df.shape

(5572, 2)

In [5]:
# Peek at the first rows to confirm the Label / Message columns.
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Count-based vectorizer configured with English stop words and a
# 500-feature cap (max_features=500).
cv = CountVectorizer(stop_words='english', max_features=500)

In [7]:
# Hold out 20% of the rows as the test set; no random_state is passed here.
df_train, df_test = train_test_split(df, test_size=0.2)

In [8]:
# Fit the vectorizer on the training messages only, then convert the result
# to a dense array.
train_X = cv.fit_transform(df_train['Message']).toarray()

In [9]:
# Vectorize the test messages with the already-fitted vectorizer
# (transform, not fit_transform).
test_X = cv.transform(df_test['Message']).toarray()

In [10]:
# Display both matrix shapes side by side.
train_X.shape, test_X.shape

((4457, 500), (1115, 500))

## custom BernoulliNB

In [11]:
# Collect the per-class counts needed by the Bernoulli Naive Bayes model.
#
#   spam_count / ham_count           -- number of training messages per class
#                                       (every label other than 'spam' counts
#                                       as ham)
#   tf_spam / tf_ham                 -- word_id -> number of messages of that
#                                       class in which the word occurs at
#                                       least once
#   spam_word_count / ham_word_count -- total (message, word) occurrences per
#                                       class, i.e. the sum of the dict values
#
# The number of features is taken from train_X itself rather than a literal,
# so this cell keeps working if the vectorizer's max_features is changed.
_is_spam = (df_train['Label'] == 'spam').to_numpy()
# Only presence matters for the Bernoulli model, not how often a word repeats.
_word_present = np.asarray(train_X) != 0

# Column sums over the rows of each class give per-word document frequencies.
_spam_doc_freq = _word_present[_is_spam].sum(axis=0)
_ham_doc_freq = _word_present[~_is_spam].sum(axis=0)

# Store plain Python ints so the dict values have the same type a manual
# counter would produce.
tf_spam = {word_id: int(freq) for word_id, freq in enumerate(_spam_doc_freq)}
tf_ham = {word_id: int(freq) for word_id, freq in enumerate(_ham_doc_freq)}

spam_word_count = int(_spam_doc_freq.sum())
ham_word_count = int(_ham_doc_freq.sum())

spam_count = int(_is_spam.sum())
ham_count = int((~_is_spam).sum())

In [12]:
# Class log priors: log(class_count) - log(total number of training messages),
# computed with the same expression for both classes.
prob_spam, prob_ham = (
    np.log(class_count) - np.log(spam_count + ham_count)
    for class_count in (spam_count, ham_count)
)

In [13]:
# Display the two class log priors as a (spam, ham) pair.
prob_spam, prob_ham

(-1.9986569750117402, -0.14562382832039766)

In [14]:
# Log of the add-one smoothed estimate for each word given spam:
# log(doc_freq + 1) - log(spam_count + 2).
tf_spam_prob = {
    word_id: np.log(doc_freq + 1) - np.log(spam_count + 2)
    for word_id, doc_freq in tf_spam.items()
}

In [15]:
# Log of the add-one smoothed estimate for each word given ham:
# log(doc_freq + 1) - log(ham_count + 2).
tf_ham_prob = {
    word_id: np.log(doc_freq + 1) - np.log(ham_count + 2)
    for word_id, doc_freq in tf_ham.items()
}

In [16]:
def predict(messages):
    """
    Classify count vectors as spam (1) or ham (0) with the Bernoulli NB model.

    Every word id contributes to both class scores: log P(word | class) when
    the word is present in the message (non-zero count) and
    log(1 - P(word | class)) when it is absent. The class log prior is added
    last and the higher-scoring class wins; ties are labelled ham.

    Reads the module-level ``tf_spam_prob``, ``tf_ham_prob``, ``prob_spam``
    and ``prob_ham``.

    Parameters
    ----------
    messages : iterable of sequences
        One count vector per message; position ``i`` holds the count for
        word id ``i``.

    Returns
    -------
    list of int
        1 for spam, 0 for ham, in the same order as ``messages``.

    Raises
    ------
    KeyError
        If a message has a position with no entry in the probability tables.

    source: https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
    """
    # The "word absent" terms depend only on the model, not on the message,
    # so compute them once per call instead of once per word per message.
    spam_absent = {
        word_id: np.log(1 - np.exp(log_p))
        for word_id, log_p in tf_spam_prob.items()
    }
    ham_absent = {
        word_id: np.log(1 - np.exp(log_p))
        for word_id, log_p in tf_ham_prob.items()
    }

    result = []
    for msg in messages:
        spam_prob = 0
        ham_prob = 0
        for word_id, count in enumerate(msg):
            if count:
                spam_prob += tf_spam_prob[word_id]
                ham_prob += tf_ham_prob[word_id]
            else:
                spam_prob += spam_absent[word_id]
                ham_prob += ham_absent[word_id]
        spam_prob += prob_spam
        ham_prob += prob_ham
        # Strict comparison: equal scores fall through to ham (0).
        result.append(1 if spam_prob > ham_prob else 0)
    return result

In [17]:
# Run the hand-written classifier on the vectorized test set.
res_2 = predict(test_X)

In [18]:
# accuracy_score's signature is (y_true, y_pred): pass the encoded ground-truth
# labels first and the custom classifier's 0/1 predictions second.
accuracy_score(df_test['Label'].map({'ham': 0, 'spam': 1}), res_2)

0.9811659192825112

## Comparing it with sklearn BernoulliNB

In [19]:
from sklearn.naive_bayes import BernoulliNB

In [20]:
# scikit-learn's BernoulliNB, constructed with no arguments (library defaults).
clf = BernoulliNB()

In [21]:
# Train on the same count matrix and string labels used for the custom model.
clf.fit(train_X, df_train['Label'])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [22]:
# accuracy_score's signature is (y_true, y_pred): ground-truth labels first,
# the fitted classifier's predictions second.
accuracy_score(df_test['Label'], clf.predict(test_X))

0.9811659192825112

## Compare learned parameters

### Class log priors

In [23]:
# Log priors learned by scikit-learn; in the recorded output the first entry
# matches prob_ham and the second matches prob_spam.
clf.class_log_prior_

array([-0.14562383, -1.99865698])

In [24]:
# Hand-computed log priors shown as (ham, spam) for side-by-side comparison.
prob_ham, prob_spam

(-0.14562382832039766, -1.9986569750117402)

### Per-word log-probabilities given ham (first 10 word ids)

In [25]:
# Print the first ten hand-computed ham log-probabilities, one per line.
print('\n'.join(str(tf_ham_prob[word_id]) for word_id in range(10)))

-8.25712628599743
-5.859231013199059
-7.563979105437484
-8.25712628599743
-8.25712628599743
-8.25712628599743
-8.25712628599743
-8.25712628599743
-8.25712628599743
-7.563979105437484


In [26]:
# First ten feature log-probabilities from row 0 of scikit-learn's table,
# selected with a single combined index.
clf.feature_log_prob_[0, :10]

array([-8.25712629, -5.85923101, -7.56397911, -8.25712629, -8.25712629,
       -8.25712629, -8.25712629, -8.25712629, -8.25712629, -7.56397911])

### Per-word log-probabilities given spam (first 10 word ids)

In [27]:
# Print the first ten hand-computed spam log-probabilities, one per line.
print('\n'.join(str(tf_spam_prob[word_id]) for word_id in range(10)))

-3.4111477125153233
-3.2288261557213684
-3.0056826044071587
-2.8515319245799007
-3.5165082281731497
-3.573666642013098
-3.573666642013098
-2.3815282953341645
-3.111043120064985
-2.5150596879586877


In [28]:
# First ten feature log-probabilities from row 1 of scikit-learn's table,
# selected with a single combined index.
clf.feature_log_prob_[1, :10]

array([-3.41114771, -3.22882616, -3.0056826 , -2.85153192, -3.51650823,
       -3.57366664, -3.57366664, -2.3815283 , -3.11104312, -2.51505969])