# Notebook imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Constants

In [2]:
# Class labels: spam emails are encoded as 1, non-spam emails as 0.
SPAM = 1
NON_SPAM = 0

# Raw and grouped data files. These are not read by the cells visible in this
# notebook; presumably kept for parity with earlier notebooks — TODO confirm.
DATA_PATH = 'email-data.json'
TRAINING_DATA_FILE = 'data-grouped/train-data.txt'
TEST_DATA_FILE = 'data-grouped/test-data.txt'

# Per-token probability vectors, loaded below with np.loadtxt.
TOKEN_SPAM_PROB_FILE = 'token-data/prob-spam.txt'
TOKEN_NON_SPAM_PROB_FILE = 'token-data/prob-non-spam.txt'
TOKEN_ALL_PROB_FILE = 'token-data/prob-all-token.txt'

# Test-set feature matrix and target labels, loaded below with np.loadtxt.
TEST_FEATURE_MATRIX = 'test-features.txt'
TEST_TARGET_FILE = 'test-target.txt'

# Vocabulary file and size. 2500 matches the length of the token probability
# vectors printed after loading.
VOCAB_FILE = 'spam_word.csv'
VOCAB_SIZE = 2500

# Load the Data

In [27]:
# Test-set features and their true labels
X_test = np.loadtxt(TEST_FEATURE_MATRIX, delimiter=' ')
y_test = np.loadtxt(TEST_TARGET_FILE, delimiter=' ')

# Per-token probabilities: given spam, given non-spam, and overall
prob_token_spam = np.loadtxt(TOKEN_SPAM_PROB_FILE, delimiter=' ')
prob_token_non_spam = np.loadtxt(TOKEN_NON_SPAM_PROB_FILE, delimiter=' ')
prob_all_token = np.loadtxt(TOKEN_ALL_PROB_FILE, delimiter=' ')

# Sanity check: print the shape of each probability vector, one per line
print(
    prob_token_spam.shape,
    prob_token_non_spam.shape,
    prob_all_token.shape,
    sep='\n',
)

(2500,)
(2500,)
(2500,)


# Calculate the joint Probability

In [9]:
# Matrix-vector product: every row of X_test (1724 rows x 2500 columns) is
# dotted with the 2500-element probability vector, producing one number per
# row, i.e. a 1-D result of shape (1724,).
print('X_test shape :', X_test.shape)
print('shape of the dot product is :', np.dot(X_test, prob_token_spam).shape)

X_test shape : (1724, 2500)
shape of the dot product is : (1724,)


## Set the Prior
$$ P(Spam \, | \, X) = \frac{P(X \, | \, Spam \,) \, P(Spam)}{P(X)} $$

Taking the logarithm of both sides turns the product and quotient into a sum and a difference:

$$ \log P(Spam \, | \, X) = \left[ \log P(X \, | \, Spam) - \log P(X) \right] + \log P(Spam) $$

In [16]:
# Prior probability that an email is spam, P(Spam). Hard-coded here; presumably
# computed from the training data in an earlier notebook — TODO confirm.
PROB_SPAM = 0.3116

In [28]:
# np.log takes the natural logarithm element-wise, so this gives one
# log-probability per token.
np.log(prob_token_spam)

array([ -4.45593772,  -4.5278221 ,  -4.72646568, ...,  -9.94441621,
        -9.65673414, -10.19573064])

# Joint probability in log format

In [33]:
# Log-score of the spam class for every test email:
#   log P(Spam) + X_test . [log P(token | Spam) - log P(token)]
joint_log_spam = np.log(PROB_SPAM) + np.dot(
    X_test,
    np.log(prob_token_spam) - np.log(prob_all_token),
)
joint_log_spam[:5]

array([ 6.24745578, -0.52584466, 12.61619551, 13.3232053 , 17.04939639])

$$ P(NonSpam \, | \, X) = \frac{P(X \, | \, NonSpam \,) \, P(NonSpam)}{P(X)} $$

In [34]:
# Log-score of the non-spam class for every test email:
#   log P(NonSpam) + X_test . [log P(token | NonSpam) - log P(token)]
# where P(NonSpam) = 1 - P(Spam).
joint_log_non_spam = np.log(1 - PROB_SPAM) + np.dot(
    X_test,
    np.log(prob_token_non_spam) - np.log(prob_all_token),
)
joint_log_non_spam[:5]

array([-46.63338735, -10.78450467, -30.55966671, -69.33836163,
       -68.4458674 ])

# Making Predictions

### Checking for the higher joint probability

$$ P(Spam \, | \, X) > P(NonSpam \, | \, X)$$
<br>
<center><b>or</b></center>
<br>
$$ P(Spam \, | \, X) < P(NonSpam \, | \, X)$$

**Challenge:** Can you create the vector of predictions, our $\hat{y}$? Remember that spam emails should have the value 1 (true) and non-spam emails should have the value 0 (false). Store your result in a variable called `prediction`.

In [36]:
# An email is predicted as spam (True) when its spam log-score is strictly
# greater than its non-spam log-score; otherwise it is non-spam (False).
prediction = np.greater(joint_log_spam, joint_log_non_spam)
prediction

array([ True,  True,  True, ..., False, False, False])

In [37]:
y_test  # true labels (1. = spam, 0. = non-spam); visually they line up with the predictions above

array([1., 1., 1., ..., 0., 0., 0.])

## Simplification

$$ P(Spam \, | \, X) = \frac{P(X \, | \, Spam \,) \, P(Spam)}{P(X)} $$

$$ P(NonSpam \, | \, X) = \frac{P(X \, | \, NonSpam \,) \, P(NonSpam)}{P(X)} $$

Notice that both expressions are divided by the same term, $P(X)$. Because it is common to both sides of the comparison, we can drop it and still get the same prediction:

$$\frac{P(X \, | \, NonSpam \,) \, P(NonSpam)}{P(X)} <  \frac{P(X \, | \, Spam \,) \, P(Spam)}{P(X)} $$
or
$$\frac{P(X \, | \, NonSpam \,) \, P(NonSpam)}{P(X)} >  \frac{P(X \, | \, Spam \,) \, P(Spam)}{P(X)} $$

In [38]:
# Simplified scores: the log P(token) term is omitted because it is shared by
# both classes and cannot change which score is larger.
# NOTE: this rebinds joint_log_spam / joint_log_non_spam, replacing the values
# computed earlier in the notebook.
joint_log_spam = np.log(PROB_SPAM) + np.dot(X_test, np.log(prob_token_spam))
joint_log_non_spam = np.log(1 - PROB_SPAM) + np.dot(X_test, np.log(prob_token_non_spam))

In [39]:
# Predictions from the simplified scores (True = spam, False = non-spam)
new_prediction = np.greater(joint_log_spam, joint_log_non_spam)

In [44]:
# True if any email is classified differently by the two approaches
(prediction != new_prediction).any()

False