In [297]:
# Import statements
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords



In [298]:
# Configuration: vocabulary size and the names of the input files

# Test set inputs: the word matrix and its matching target labels
test_words_matrix = 'test-words.txt'
test_target_file = 'test-target.txt'

# Per-word probability inputs (overall, ham, spam)
word_all_prob_file = 'prob-all.txt'
word_ham_prob_file = 'prob-ham.txt'
word_spam_prob_file = 'prob-spam.txt'

# Number of top words (vocabulary size).
# Presumably the length of each probability vector -- confirm.
top_words_size = 4000



In [299]:
# Read every input from disk as a space-delimited numeric text file.

# Test set: the word matrix first, then the matching target labels
X_test = np.loadtxt(fname=test_words_matrix, delimiter=' ')
y_test = np.loadtxt(fname=test_target_file, delimiter=' ')

# Per-word probability vectors (spam, ham, and overall)
prob_word_spam = np.loadtxt(fname=word_spam_prob_file, delimiter=' ')
prob_word_ham = np.loadtxt(fname=word_ham_prob_file, delimiter=' ')
prob_word_all = np.loadtxt(fname=word_all_prob_file, delimiter=' ')

# Checking the shapes for the dot product
### The number of columns of the first operand must equal the number of rows (the length) of the second

In [301]:
prob_word_spam.shape


(4000,)

In [302]:
X_test.shape

(1724, 4000)

In [303]:
prob_word_ham.shape

(4000,)

In [304]:
prob_word_all.shape

(4000,)

In [305]:
# Sanity check: multiplying the test matrix by a per-word vector should
# collapse the word axis and leave one value per row of X_test.
X_test.dot(prob_word_spam).shape

(1724,)

In [306]:
# Prior probability of spam, P(spam): the guess made before any words are seen.
# NOTE(review): hard-coded value -- presumably the spam share of the training
# data; confirm where it comes from.
prob_spam = 0.3113

# Preview the log of the overall word probabilities.
# A cell only displays its last expression, so exactly one preview is kept
# here; any earlier bare expression would be computed and thrown away.
np.log(prob_word_all)


array([ -4.22165144,  -4.93380327,  -4.95538474, ..., -10.37650954,
       -10.0400373 , -11.47512183])

In [307]:
# Joint log-probability score of every test row under the spam class:
#   X_test . (log P(word | spam) - log P(word)) + log P(spam)
# Working in log space turns the product of probabilities into a sum.
full_prob_log_spam = np.dot(X_test, np.log(prob_word_spam) - np.log(prob_word_all)) + np.log(prob_spam)

In [308]:
full_prob_log_spam[:5]

array([  3.24123515,  16.5789707 ,  19.93657553, -15.1651965 ,
        29.0012179 ])

In [309]:
# Joint log-probability score of every test row under the ham class:
#   X_test . (log P(word | ham) - log P(word)) + log P(ham), with P(ham) = 1 - P(spam)
full_prob_log_ham = np.dot(X_test, np.log(prob_word_ham) - np.log(prob_word_all)) + np.log(1 - prob_spam)

In [310]:
full_prob_log_ham[:5]

array([-11.06804468, -42.9161444 , -38.97265488,  -4.18707704,
       -64.95202787])

In [311]:
full_prob_log_ham.shape

(1724,)

In [312]:
full_prob_log_spam.shape

(1724,)

# Predictions

In [313]:
# Classify a row as spam (True) when its spam score is strictly greater than
# its ham score; otherwise it is classified as ham (False).
prediction = np.greater(full_prob_log_spam, full_prob_log_ham)



In [314]:
prediction == y_test

array([ True,  True,  True, ...,  True,  True,  True])

In [315]:
# Number of test rows whose predicted class matches the target label
correct_predictions = np.sum(np.equal(prediction, y_test))

In [316]:
correct_predictions

1675

In [317]:
# Everything that was not predicted correctly counts as a miss
incorrect_predictions = np.size(y_test) - correct_predictions

In [318]:
incorrect_predictions

49

In [319]:
y_test.shape


(1724,)

# Evaluation

## Find the accuracy of the predictions

In [320]:
# Percentage of accurate predictions.
# Divide by the actual number of test labels rather than a hard-coded count,
# so the figure stays correct if the test set changes.
correct_predictions / y_test.size * 100

97.15777262180974

In [321]:
# NOTE(review): this repeats the prob_word_all shape check made right after
# loading the data; it looks like a leftover cell -- confirm it is still needed.
prob_word_all.shape

(4000,)

# Evaluation

## Find the true/false positives and negatives

### True Positive

In [322]:
# True positive: the label is spam (1) and the prediction is spam
true_positive = np.logical_and(y_test == 1, prediction == 1)

In [323]:
true_positive.sum()

551

### False Positive

In [324]:
# False positive: the label is ham (0) but the prediction is spam
false_positive = np.logical_and(y_test == 0, prediction == 1)

In [325]:
false_positive.sum()

11

### False Negatives

In [326]:
# False negative: the label is spam (1) but the prediction is ham
false_negative = np.logical_and(y_test == 1, prediction == 0)

In [327]:
false_negative.sum()

38

### True Negative

In [328]:
# True negative: the label is ham (0) and the prediction is ham
true_negative = np.logical_and(y_test == 0, prediction == 0)

In [329]:
true_negative.sum()

1124