In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import string
import re

In [2]:
# Seed NumPy's global random number generator so that steps drawing from it
# give the same result on every Restart & Run All.
np.random.seed(777)

In [3]:
# Read the raw SMS corpus, keep only the label and text columns, and give
# them readable names.
df = (
    pd.read_csv('../data/sms_data_uci.csv', encoding='latin')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)

In [4]:
df.shape

(5572, 2)

In [5]:
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Bag-of-words encoder: English stop words are dropped and the vocabulary is
# capped at 500 terms, so every message becomes a vector of at most 500 counts.
cv = CountVectorizer(stop_words='english', max_features=500)

In [7]:
# Hold out 20% of the rows as a test set.
# NOTE(review): no random_state is passed, so the split presumably relies on
# the global NumPy seed set near the top of the notebook — confirm before
# reordering or re-running cells in isolation.
df_train, df_test = train_test_split(df, test_size=0.2)

In [8]:
# Fit the vocabulary on the training messages only, then encode them as a
# dense (n_messages, n_terms) count matrix.
train_X = cv.fit_transform(df_train['Message']).toarray()

In [9]:
# Encode the test messages with the vocabulary already fitted on the training
# set (transform only, no re-fitting), again as a dense count matrix.
test_X = cv.transform(df_test['Message']).toarray()

In [10]:
train_X.shape, test_X.shape

((4457, 500), (1115, 500))

## Custom MultinomialNB

In [11]:
train_X.shape

(4457, 500)

In [12]:
# Per-class training statistics for the hand-written multinomial Naive Bayes.
#
# Only raw (unsmoothed) counts are collected here; add-one smoothing is applied
# later, when the counts are converted to log-probabilities. The vocabulary
# size is taken from the width of train_X rather than a hard-coded literal.

# True for spam rows; every other label is treated as ham.
is_spam = (df_train['Label'] == 'spam').values

# Number of training messages per class (used for the class priors).
spam_count = int(is_spam.sum())
ham_count = int(len(is_spam) - spam_count)

# Column sums over each class's rows: total occurrences of every word id.
spam_totals = train_X[is_spam].sum(axis=0)
ham_totals = train_X[~is_spam].sum(axis=0)

# word_id -> occurrences in the class. Every word id gets an entry, including
# words that never appear in that class (count 0).
tf_spam = {word_id: int(total) for word_id, total in enumerate(spam_totals)}
tf_ham = {word_id: int(total) for word_id, total in enumerate(ham_totals)}

# Total word occurrences per class (denominator of the word likelihoods).
spam_word_count = int(spam_totals.sum())
ham_word_count = int(ham_totals.sum())

In [13]:
# Class priors: the share of training messages that belong to each class.
prob_spam, prob_ham = (
    class_docs / (spam_count + ham_count)
    for class_docs in (spam_count, ham_count)
)

In [14]:
# Add-one (Laplace) smoothed log-likelihoods, log P(word | spam):
#     log(count + 1) - log(total_spam_words + V)
# V is the number of word ids in the count table. It is read from the table
# itself rather than repeating the literal 500, so the smoothing denominator
# stays correct if the vocabulary size ever changes.
tf_spam_prob = {
    word_id: np.log(count + 1) - np.log(spam_word_count + len(tf_spam))
    for word_id, count in tf_spam.items()
}

In [15]:
# Add-one (Laplace) smoothed log-likelihoods, log P(word | ham):
#     log(count + 1) - log(total_ham_words + V)
# V is the number of word ids in the count table. It is read from the table
# itself rather than repeating the literal 500, so the smoothing denominator
# stays correct if the vocabulary size ever changes.
tf_ham_prob = {
    word_id: np.log(count + 1) - np.log(ham_word_count + len(tf_ham))
    for word_id, count in tf_ham.items()
}

In [16]:
prob_spam, prob_ham

(0.13551716401166705, 0.864482835988333)

In [17]:
def predict(messages, spam_log_probs=None, ham_log_probs=None,
            spam_prior=None, ham_prior=None):
    """Classify count-vectorised messages as spam (1) or ham (0).

    Each message is scored under both classes with the multinomial Naive
    Bayes rule

        score(c) = log P(c) + sum_w count(w) * log P(w | c)

    and labelled spam only when the spam score is strictly greater than the
    ham score, so ties go to ham.

    Parameters
    ----------
    messages : iterable of sequences of int
        One row per message; position ``word_id`` holds how many times that
        word occurs in the message.
    spam_log_probs, ham_log_probs : mapping or sequence, optional
        ``word_id -> log P(word | class)``. When omitted, the notebook-level
        ``tf_spam_prob`` / ``tf_ham_prob`` tables are used.
    spam_prior, ham_prior : float, optional
        Class prior probabilities (not logs). When omitted, the notebook-level
        ``prob_spam`` / ``prob_ham`` values are used.

    Returns
    -------
    list of int
        One entry per message: 1 for spam, 0 for ham.
    """
    # Fall back to the model fitted earlier in the notebook so that existing
    # calls of the form predict(test_X) keep working unchanged.
    if spam_log_probs is None:
        spam_log_probs = tf_spam_prob
    if ham_log_probs is None:
        ham_log_probs = tf_ham_prob
    if spam_prior is None:
        spam_prior = prob_spam
    if ham_prior is None:
        ham_prior = prob_ham

    # The log-priors are the same for every message; compute them once.
    log_spam_prior = np.log(spam_prior)
    log_ham_prior = np.log(ham_prior)

    result = []
    for msg in messages:
        spam_score = 0
        ham_score = 0
        for word_id, count in enumerate(msg):
            if count:
                # A word seen `count` times contributes its log-likelihood
                # `count` times; multiply instead of looping.
                spam_score += count * spam_log_probs[word_id]
                ham_score += count * ham_log_probs[word_id]
        spam_score += log_spam_prior
        ham_score += log_ham_prior
        result.append(1 if spam_score > ham_score else 0)
    return result

In [18]:
res = predict(test_X)

In [19]:
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second. The string labels are mapped to the same 0/1 encoding that predict()
# returns (ham -> 0, spam -> 1).
accuracy_score(df_test['Label'].map({'ham': 0, 'spam': 1}), res)

0.9802690582959641

## Comparing it with sklearn MultinomialNB

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
# Reference model from scikit-learn, constructed with its default settings
# (the repr printed after fitting reports alpha=1.0).
clf = MultinomialNB()

In [22]:
# Train on the same count matrix and labels that the hand-written model used.
clf.fit(train_X, df_train['Label'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
# Score the reference model on the held-out test set, for comparison with the
# accuracy of the hand-written classifier above.
clf.score(test_X, df_test['Label'])

0.9802690582959641

## Compare Priors

### class probability

In [24]:
clf.class_log_prior_

array([-0.14562383, -1.99865698])

In [25]:
np.log(prob_ham), np.log(prob_spam)

(-0.14562382832039822, -1.9986569750117409)

### Per-word log-likelihood for ham, log P(word_id | ham)

In [26]:
for i in range(10):
    print(tf_ham_prob[i])

-9.650271316965625
-7.165364667177625
-8.95712413640568
-9.650271316965625
-9.650271316965625
-9.650271316965625
-9.650271316965625
-9.650271316965625
-9.650271316965625
-8.95712413640568


In [27]:
clf.feature_log_prob_[0,:][0:10]

array([-9.65027132, -7.16536467, -8.95712414, -9.65027132, -9.65027132,
       -9.65027132, -9.65027132, -9.65027132, -9.65027132, -8.95712414])

### Per-word log-likelihood for spam, log P(word_id | spam)

In [28]:
for i in range(10):
    print(tf_spam_prob[i])

-5.549960023784207
-5.422126652274322
-5.239805095480367
-5.08565441565311
-5.750630719246359
-5.807789133086307
-5.807789133086307
-4.6156507864073735
-5.345165611138194
-4.749182179031896


In [29]:
clf.feature_log_prob_[1,:][0:10]

array([-5.54996002, -5.42212665, -5.2398051 , -5.08565442, -5.75063072,
       -5.80778913, -5.80778913, -4.61565079, -5.34516561, -4.74918218])