# Notebook Imports

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import numpy as np

from os import walk
from os.path import join
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


from bs4 import BeautifulSoup
from wordcloud import WordCloud
from PIL import Image

from sklearn.model_selection import train_test_split

%matplotlib inline

# Constants

In [57]:
VOCABULARY_WORD_SIZE = 2500

TRAINING_DATA_FILE = "C:/Users/BediZ/DS-ML/Jupyter Notebook - ML Projects/resources/SpamData/02_Training/train-data.txt"
TEST_DATA_FILE = "C:/Users/BediZ/DS-ML/Jupyter Notebook - ML Projects/resources/SpamData/02_Training/test-data.txt"

TOKEN_SPAM_PROBABILITY_DATA_FILE = "C:/Users/BediZ/DS-ML/Jupyter Notebook - ML Projects/resources/SpamData/03_Testing/spam-probability.txt"
TOKEN_HAM_PROBABILITY_DATA_FILE = "C:/Users/BediZ/DS-ML/Jupyter Notebook - ML Projects/resources/SpamData/03_Testing/ham-probability.txt"
TOKEN_ALL_PROBABILITY_DATA_FILE = "C:/Users/BediZ/DS-ML/Jupyter Notebook - ML Projects/resources/SpamData/03_Testing/all-probability.txt"

TEST_FEATURE_MATRIX_DATA_FILE = "C:/Users/BediZ/DS-ML/Jupyter Notebook - ML Projects/resources/SpamData/03_Testing/test-features.txt"
TEST_TARGET_MATRIX_DATA_FILE = "C:/Users/BediZ/DS-ML/Jupyter Notebook - ML Projects/resources/SpamData/03_Testing/test-target.txt"



# Read and Load Fetures from .txt Files into NumPy Array

In [5]:
sparse_train_data = np.loadtxt(TRAINING_DATA_FILE, delimiter=' ', dtype=int)
sparse_test_data = np.loadtxt(TEST_DATA_FILE, delimiter=' ', dtype=int)

In [6]:
sparse_train_data[:5]
sparse_train_data[-5:]
sparse_test_data[:5]
sparse_test_data[-5:]

array([[5191, 2283,    0,    1],
       [5191, 2398,    0,    1],
       [5191, 2448,    0,    1],
       [5191, 2473,    0,    1],
       [5191, 2492,    0,    2]])

In [14]:
print('Number of rows in training file: ', sparse_train_data.shape[0])
print('Number of rows in test file: ', sparse_test_data.shape[0])

Number of rows in training file:  273836
Number of rows in test file:  113452


In [15]:
print('Number of unique emails in training file: ', np.unique(sparse_train_data[:, 0]).size)
print('Number of unique emails in test file: ', np.unique(sparse_test_data[:, 0]).size)


Number of unique emails in training file:  3554
Number of unique emails in test file:  1524


## Create a Empty DataFrame

In [16]:
column_names = ['DOC_ID'] +  ['CATEGORY'] + list(range(0, VOCABULARY_WORD_SIZE))
len(column_names)
column_names[:5]

['DOC_ID', 'CATEGORY', 0, 1, 2]

In [17]:
index_names = np.unique(sparse_train_data[:, 0])
index_names

array([   0,    1,    2, ..., 5184, 5186, 5189])

In [18]:
full_train_data = pd.DataFrame(index=index_names, columns=column_names)
full_train_data.head()
full_train_data.fillna(value=0, inplace = True)

## Create a Full Matrix from Sparce Matrix

In [19]:
def make_full_matrix(sparse_matrix, 
                     number_of_words, 
                     document_index = 0, 
                     word_index = 1, 
                     category_index = 2, 
                     word_occurance_index = 3):
    """
    Form a full matrix from a sparse matrix. Return a pandas DataFrame.
    sparse_matrix: numpy array
    number_of_words: size of the vocabulary. Total number of tokens.
    document_index: position of the document id in the sparse matrix. Default is the 1st column
    word_index: position of the word id in the sparse matrix. Default is the 2st column
    category_index: position of the category id in the sparse matrix. Default is the 3st column
    word_occurance_index: position of the occurance id in the sparse matrix. Default is the 4st column
    """
    
    column_names = ['DOC_ID'] +  ['CATEGORY'] + list(range(0, VOCABULARY_WORD_SIZE))
    doc_id_names = np.unique(sparse_matrix[:, 0])
    full_matrix = pd.DataFrame(index=doc_id_names, columns=column_names)
    full_matrix.fillna(value=0, inplace = True)
    
    for i in range(sparse_matrix.shape[0]):
        doc_number = sparse_matrix[i][document_index]
        word_id = sparse_matrix[i][word_index]
        label = sparse_matrix[i][category_index]
        occurrence = sparse_matrix[i][word_occurance_index]
        
        full_matrix.at[doc_number, 'DOC_ID'] = doc_number
        full_matrix.at[doc_number, 'CATEGORY'] = label
        full_matrix.at[doc_number, word_id] = occurrence
        
    full_matrix.set_index('DOC_ID', inplace = True)
    return full_matrix    

In [20]:
%%time
full_train_data = make_full_matrix(sparse_train_data, VOCABULARY_WORD_SIZE)

Wall time: 8.91 s


In [21]:
full_train_data.tail()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5181,0,2,2,7,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5183,0,2,1,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5184,0,0,2,0,1,4,3,4,6,0,...,0,0,0,0,0,0,0,0,0,0
5186,0,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5189,0,1,0,1,1,0,1,9,0,0,...,0,0,0,0,0,0,0,0,0,0


# Training the Naive Bayes Model

## Calculating the probability of Spam

In [22]:
full_train_data.CATEGORY.size

3554

In [23]:
full_train_data.CATEGORY.sum()

2586

In [24]:
prob_spam = full_train_data.CATEGORY.sum() / full_train_data.CATEGORY.size
print('Porbability of spam in email is: ', round(prob_spam, 4)*100, '%')

Porbability of spam in email is:  72.76 %


## Total Number od Words / Tokens

In [25]:
full_train_features = full_train_data.loc[:, full_train_data.columns != 'CATEGORY']
full_train_features

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2,1,3,1,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,2,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,2,0,13,4,0,0,4,1,3,...,0,0,0,0,0,0,0,0,0,0
4,5,0,2,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5181,2,2,7,0,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5183,2,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5184,0,2,0,1,4,3,4,6,0,3,...,0,0,0,0,0,0,0,0,0,0
5186,1,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [26]:
email_token_sums = full_train_features.sum(axis = 1)
email_token_sums

DOC_ID
0        87
1        54
2        42
3       187
4        44
       ... 
5181    135
5183     38
5184    551
5186     72
5189    209
Length: 3554, dtype: int64

In [27]:
total_word_count = email_token_sums.sum()
total_word_count

505499

## Number of Tokens in Spam and Ham Emails

In [28]:
spam_lengths = email_token_sums[full_train_data.CATEGORY == 1]
spam_lengths.shape

(2586,)

In [29]:
spam_word_count = spam_lengths.sum()
spam_word_count

403636

In [None]:
ham_lengths = email_token_sums[full_train_data.CATEGORY == 0]
ham_lengths.shape



In [31]:
ham_word_count = ham_lengths.sum()
ham_word_count

101863

In [32]:
## check if calculation is ok, must be 0
email_token_sums.shape[0] - ham_lengths.shape[0] - spam_lengths.shape[0]

0

In [33]:
print('Average number of words in spam emails {:.0f}' .format(spam_word_count / spam_lengths.shape[0]))
print('Average number of words in ham emails {:.0f}' .format(ham_word_count / ham_lengths.shape[0]))

Average number of words in spam emails 156
Average number of words in ham emails 105


## Summing the Tokens Occuring in Spam

In [41]:
spam_tokens_train = full_train_features.loc[full_train_data.CATEGORY == 1 ]
spam_tokens_train_sum = spam_tokens_train.sum(axis=0) + 1
spam_tokens_train_sum.head()

ham_tokens_train = full_train_features.loc[full_train_data.CATEGORY == 0 ]
ham_tokens_train_sum = ham_tokens_train.sum(axis=0) + 1
ham_tokens_train_sum.head()

0    2232
1     404
2    1040
3     107
4     628
dtype: int64

## P(Token | Spam) - Probability that a Token Occurs given the Email is Spam

In [42]:
spam_token_probability = spam_tokens_train_sum / (spam_word_count + VOCABULARY_WORD_SIZE)
spam_token_probability[:5]

0    0.010725
1    0.010698
2    0.006407
3    0.008889
4    0.006678
dtype: float64

In [45]:
spam_token_probability.sum()

0.9999999999999999

## P(Token | Ham) - Probability that a Token Occurs given the Email is Ham

In [46]:
ham_token_probability = ham_tokens_train_sum / (ham_word_count + VOCABULARY_WORD_SIZE)
ham_token_probability[:5]

0    0.021387
1    0.003871
2    0.009965
3    0.001025
4    0.006017
dtype: float64

In [47]:
ham_token_probability.sum()

1.0

## P(Token) - Probability that a Token Occurs

In [48]:
all_token_probability = full_train_features.sum(axis=0) / total_word_count
all_token_probability.sum()

1.0

## Save the Trained Model

In [50]:
np.savetxt(TOKEN_SPAM_PROBABILITY_DATA_FILE, spam_token_probability)
np.savetxt(TOKEN_HAM_PROBABILITY_DATA_FILE, ham_token_probability)
np.savetxt(TOKEN_ALL_PROBABILITY_DATA_FILE, all_token_probability)

## Prepare Test Data

In [52]:
sparse_test_data.shape

(113452, 4)

In [54]:
%%time
full_test_data =make_full_matrix(sparse_test_data, number_of_words=VOCABULARY_WORD_SIZE)

Wall time: 3.21 s


In [59]:
X_test = full_test_data.loc[:, full_test_data.columns != 'CATEGORY']
y_test = full_test_data.CATEGORY

In [61]:
np.savetxt(TEST_TARGET_MATRIX_DATA_FILE, y_test)
np.savetxt(TEST_FEATURE_MATRIX_DATA_FILE, X_test)
