In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import string
import re

In [2]:
np.random.seed(777)

In [3]:
df = pd.read_csv('../data/sms_data_uci.csv', encoding='latin')
df = df[['v1', 'v2']]
df.columns = ['Label', 'Message']

In [4]:
df.shape

(5572, 2)

In [5]:
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
cv = CountVectorizer(stop_words='english', max_features=500)

In [7]:
df_train, df_test = train_test_split(df, test_size=0.2)

In [8]:
train_X = cv.fit_transform(df_train['Message']).toarray()

In [9]:
test_X = cv.transform(df_test['Message']).toarray()

In [10]:
train_X.shape, test_X.shape

((4457, 500), (1115, 500))

## custom BernoulliNB

In [11]:
tf_spam = dict()
tf_ham = dict()

spam_word_count = 1
ham_word_count = 1

spam_count = 2
ham_count = 2

for word_id in range(500):
    tf_spam[word_id] = 1
    tf_ham[word_id] = 1

for d_id, row  in enumerate(zip(train_X, df_train['Label'])):
    label = row[1]
    if label == 'spam':
        spam_count += 1
    else:
        ham_count += 1
    for word_id, count in enumerate(row[0]):
        if count:
            if label == 'spam':
                tf_spam[word_id] = tf_spam.get(word_id, 0) + 1
                spam_word_count += 1
            else:
                tf_ham[word_id] = tf_ham.get(word_id, 0) + 1
                ham_word_count += 1

In [12]:
prob_spam = spam_count/(spam_count + ham_count)
prob_ham = ham_count/(spam_count + ham_count)

In [13]:
tf_spam_prob = dict()
for word_id in tf_spam:
    tf_spam_prob[word_id] = tf_spam[word_id] / spam_count

In [14]:
tf_ham_prob = dict()
for word_id in tf_ham:
    tf_ham_prob[word_id] = tf_ham[word_id] / ham_count

In [15]:
prob_spam, prob_ham

(0.13584398117014124, 0.8641560188298588)

In [16]:
def predict(messages):
    result = []
    for msg in messages:
        spam_prob = 0
        ham_prob = 0
        for word_id, count in enumerate(msg):
            if count:
                spam_prob += np.log(tf_spam_prob[word_id])
                ham_prob += np.log(tf_ham_prob[word_id])
        spam_prob += np.log(prob_spam)
        ham_prob += np.log(prob_ham)
        if spam_prob > ham_prob:
            result.append(1)
        else:
            result.append(0)
    return result

In [17]:
res = predict(test_X)

In [18]:
accuracy_score(res, df_test['Label'].map({'ham': 0, 'spam': 1}))

0.9452914798206278

In [19]:
def predict_like_sklearn(messages):
    """
    source: https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
    """
    result = []
    for msg in messages:
        spam_prob = 0
        ham_prob = 0
        for word_id, count in enumerate(msg):
            if count:
                spam_prob += np.log(tf_spam_prob[word_id])
                ham_prob += np.log(tf_ham_prob[word_id])
            else:
                spam_prob += np.log(1 - tf_spam_prob[word_id])
                ham_prob += np.log(1 - tf_ham_prob[word_id])
        spam_prob += np.log(prob_spam)
        ham_prob += np.log(prob_ham)
        if spam_prob > ham_prob:
            result.append(1)
        else:
            result.append(0)
    return result

In [20]:
res_2 = predict_new(test_X)

NameError: name 'predict_new' is not defined

In [None]:
accuracy_score(res_2, df_test['Label'].map({'ham': 0, 'spam': 1}))

## Comparing it with sklearn BernoulliNB

In [None]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
clf = BernoulliNB()

In [None]:
clf.fit(train_X, df_train['Label'])

In [None]:
accuracy_score(clf.predict(test_X), df_test['Label'])

## Compare Priors

### class probability

In [None]:
clf.class_log_prior_

In [None]:
np.log(prob_ham), np.log(prob_spam)

### word_id ham prior probability

In [None]:
for i in range(10):
    print(np.log(tf_ham_prob[i]))

In [None]:
clf.feature_log_prob_[0,:][0:10]

### word_id spam prior probability

In [None]:
for i in range(10):
    print(np.log(tf_spam_prob[i]))

In [None]:
clf.feature_log_prob_[1,:][0:10]