# Notebook import

In [1]:
import pandas as pd
import numpy as np

# Constants

In [60]:
SPAM = 1
NON_SPAM = 0

DATA_PATH = 'email-data.json'
TRAINING_DATA_FILE = 'data-grouped/train-data.txt'
TEST_DATA_FILE = 'data-grouped/test-data.txt'

TOKEN_SPAM_PROB_FILE = 'token-data/prob-spam.txt'
TOKEN_NON_SPAM_PROB_FILE = 'token-data/prob-non-spam.txt'
TOKEN_ALL_PROB_FILE = 'token-data/prob-all-token.txt'

TEST_FEATURE_MATRIX = 'test-features.txt'
TEST_TARGET_FILE = 'test-target.txt'

VOCAB_FILE = 'spam_word.csv'
VOCAB_SIZE = 2500

# Read and Load Feature from .txt file into NumPy array

In [3]:
sparse_train_data = np.loadtxt(TRAINING_DATA_FILE, delimiter=' ', dtype=int)
sparse_train_data = pd.DataFrame(sparse_train_data, columns=['DOC_ID', 'WORD_ID', 'LABEL', 'OCCURENCE'])
sparse_test_data = np.loadtxt(TEST_DATA_FILE, delimiter=' ', dtype=int)
sparse_test_data = pd.DataFrame(sparse_test_data, columns=['DOC_ID', 'WORD_ID', 'LABEL', 'OCCURENCE'])
print(sparse_train_data)
print(sparse_test_data)

        DOC_ID  WORD_ID  LABEL  OCCURENCE
0            0        1      1          2
1            0        2      1          3
2            0        3      1          2
3            0        5      1          1
4            0        6      1          1
...        ...      ...    ...        ...
241948    5795     2077      0          1
241949    5795     2164      0          7
241950    5795     2271      0          1
241951    5795     2360      0          1
241952    5795     2421      0          1

[241953 rows x 4 columns]
        DOC_ID  WORD_ID  LABEL  OCCURENCE
0            8        1      1          4
1            8        2      1          4
2            8        4      1          5
3            8        5      1          1
4            8        6      1          2
...        ...      ...    ...        ...
110983    5793     1918      0          1
110984    5793     1982      0          3
110985    5793     2209      0          1
110986    5793     2252      0          1
110987 

In [4]:
print('Number of row in training file :', sparse_train_data.shape[0])
print('Number of row in test file :', sparse_test_data.shape[0])
print('Number of emails in training file :', len(set(sparse_train_data['DOC_ID'])))
print('Number of emails in test file :', len(set(sparse_test_data['DOC_ID'])))

Number of row in training file : 241953
Number of row in test file : 110988
Number of emails in training file : 4015
Number of emails in test file : 1724


In [5]:
spam_word = pd.read_csv(VOCAB_FILE)
spam_word.index.name = 'WORD_ID'
spam_word = spam_word.drop(['Unnamed: 0'], axis=1)
spam_word

Unnamed: 0_level_0,WORD
WORD_ID,Unnamed: 1_level_1
0,http
1,email
2,free
3,click
4,receiv
...,...
2495,ghetto
2496,indian
2497,hawaiian
2498,civilian


# Convert Sparse Matrix to Full Matrix

In [6]:
# this is example for createing full matrix
columns_name = ['DOC_ID', 'CATEGORY'] + list(range(0, VOCAB_SIZE))
index_name = set(sparse_train_data['DOC_ID'])
full_train_data = pd.DataFrame(index=index_name, columns=columns_name)
full_train_data = full_train_data.fillna(0)
full_train_data

Unnamed: 0,DOC_ID,CATEGORY,0,1,2,3,4,5,6,7,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5789,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5790,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5791,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5794,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
def get_full_matrix(sparse_matrix, spam_word, vocab_size=VOCAB_SIZE,
                   doc_id_idx=0, word_id_idx=1, cate_idx=2, occurence_idx=3):
    """
    Form the full matrix from a sparse matric. Return a pandas DataFrame.
    keyword arguments:
        sparse_matrix -- sparse matrix contain doc_id, word_id, category, occurence
        spam_word
        doc_id_idx -- position of document id in the sparse matrix. Default 1st column
        word_id_idx -- position of the word id in the sparse matrix. Default 2nd column
        cate_idx -- position of label (spam(1) or non-spam(0)). Defualt 3rd column
        occrence_idx -- position of occurence of word in sparse matrix. Default 4th column
    """
    columns_name = ['DOC_ID', 'CATEGORY'] + list(range(0, vocab_size))
    index_name = set(sparse_matrix['DOC_ID'])  # index come from doc_id
    full_train_data = pd.DataFrame(index=index_name, columns=columns_name)
    full_train_data = full_train_data.fillna(0)
    
    number_of_row = sparse_matrix.shape[0]
    for i in range(number_of_row):
        doc_id = sparse_matrix.at[i, 'DOC_ID']
        word_id = sparse_matrix.at[i, 'WORD_ID']
        label = sparse_matrix.at[i, 'LABEL']
        occurence = sparse_matrix.at[i, 'OCCURENCE']
        
        full_train_data.at[doc_id, 'DOC_ID'] = doc_id
        full_train_data.at[doc_id, 'CATEGORY'] = label
        full_train_data.at[doc_id, word_id] = occurence
        
    full_train_data.set_index('DOC_ID', inplace=True)
    
    return full_train_data

In [8]:
%%time
full_train_data = get_full_matrix(sparse_train_data, spam_word)

Wall time: 10.2 s


In [9]:
full_train_data

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,2,3,2,0,1,1,2,0,...,0,0,0,0,0,0,0,0,0,0
1,1,7,0,1,0,0,2,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,6,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,6,2,13,1,4,0,4,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,0,1,0,0,2,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5789,0,3,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5790,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5791,0,3,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5794,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Training the Naive Bayes Model
## Caculating the Probability of Spam
### Challenge: Calculate hte probability of spam-percenteage of spam messages in training dataset

In [10]:
prob_spam = full_train_data['CATEGORY'].value_counts().loc[1]/full_train_data['CATEGORY'].size
print('probability of spam email :', prob_spam)
print('probability of non-spam email :', 1-prob_spam)

probability of spam email : 0.31133250311332505
probability of non-spam email : 0.688667496886675


### Total Number of Words / Tokens

In [35]:
full_train_features = full_train_data.loc[:, full_train_data.columns != 'CATEGORY']
# [:]  select all the rows
# full_train_data != 'CATEGORY' >> don't inclde category column
email_lengths = full_train_features.sum(axis=1)  # sum along the row >>>>>
full_train_features

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2,3,2,0,1,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,1,0,0,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,1,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,2,13,1,4,0,4,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,1,0,0,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5789,3,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5790,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5791,3,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5794,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Number of Tokens in Spam & Non-Spam Emails

### Challenge: Create a subset of the 'email_lengths' series that only contain the spam message. Call the subset spam_lengths. Then, count the total number of word that occur in spam emails

### Challenge: Do the same for the non-spam emails. Create a subset called 'non_spam_lenghts'. Then, count the total number of words that occur in the non-spam emails

In [45]:
spam_lengths = email_lengths[full_train_data['CATEGORY']==1]
non_spam_lengths = email_lengths[full_train_data['CATEGORY']==0]

In [54]:
spam_word_count = spam_lengths.sum(axis=0)
non_spam_word_count = non_spam_lengths.sum(axis=0)
total_word_count = spam_word_count + non_spam_word_count
print('Total of spam word :', spam_word_count)
print('Total of non-spam word :', non_spam_word_count)
print('Average number of word in spam emails : {:.0f}'.format(spam_word_count/spam_lengths.shape[0]))
print('Average number of word in non-spam emails : {:.0f}'.format(non_spam_word_count/non_spam_lengths.shape[0]))

Total of spam word : 185020
Total of non-spam word : 219296
Average number of word in spam emails : 148
Average number of word in non-spam emails : 79


In [48]:
email_lengths.shape[0] - spam_lengths.shape[0] - non_spam_lengths.shape[0]  # check if we subset correctly

0

## Summing the Tokens Occuring in Spam

In [39]:
train_spam_tokens = full_train_features[full_train_data['CATEGORY']==1]
train_spam_tokens

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2,3,2,0,1,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,1,0,0,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,1,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,2,13,1,4,0,4,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,1,0,0,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1885,1,2,0,1,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1887,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1889,0,0,0,5,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1890,2,0,2,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
sum_spam_token = train_spam_tokens.sum(axis=0) + 1
# If we not add 1, we got the zero number devide with total number of token
sum_spam_token

0       2177
1       2026
2       1661
3       1452
4       1308
        ... 
2495       6
2496      15
2497       9
2498      12
2499       7
Length: 2500, dtype: int64

In [41]:
# Do the same with non-spam message
train_non_spam_tokens = full_train_features[full_train_data['CATEGORY']==0]
sum_non_spam_token = train_non_spam_tokens.sum(axis=0) + 1

In [66]:
sum_non_spam_token

0       5483
1        938
2        412
3         65
4        346
        ... 
2495       2
2496      78
2497       1
2498      13
2499      90
Length: 2500, dtype: int64

## P(Token | Spam) - Probability that a Tokren Occurs given the Email is Spam

In [52]:
prob_token_spam = sum_spam_token / (spam_word_count + VOCAB_SIZE)  # for each spam_word_count's row we add 1 
# so after add 1 there's 2500 rows >> equal to we add word up 2500, qual to VOCAB_SIZE
print(prob_token_spam)
print('Total probability :', prob_token_spam.sum())

0       0.011609
1       0.010804
2       0.008858
3       0.007743
4       0.006975
          ...   
2495    0.000032
2496    0.000080
2497    0.000048
2498    0.000064
2499    0.000037
Length: 2500, dtype: float64
Total probability : 1.0


## P(Token | Non-Spam) - Probability that a Tokren Occurs given the Email is Non-Spam

In [53]:
prob_token_non_spam = sum_non_spam_token / (non_spam_word_count + VOCAB_SIZE)
print(prob_token_non_spam)
print('Total probability :', prob_token_non_spam.sum())

0       0.024721
1       0.004229
2       0.001858
3       0.000293
4       0.001560
          ...   
2495    0.000009
2496    0.000352
2497    0.000005
2498    0.000059
2499    0.000406
Length: 2500, dtype: float64
Total probability : 1.0


## P(Token) - Probability that Token Occurs

In [67]:
prob_token_all = (full_train_features.sum(axis=0) + 1) / (total_word_count + VOCAB_SIZE)
prob_token_all

0       0.018827
1       0.007283
2       0.005093
3       0.003727
4       0.004063
          ...   
2495    0.000017
2496    0.000226
2497    0.000022
2498    0.000059
2499    0.000236
Length: 2500, dtype: float64

## Save the Trained Model

In [68]:
np.savetxt(TOKEN_SPAM_PROB_FILE, prob_token_spam)
np.savetxt(TOKEN_NON_SPAM_PROB_FILE, prob_token_non_spam)
np.savetxt(TOKEN_ALL_PROB_FILE, prob_token_all)

## Prepare Test Data

In [61]:
full_test_data = get_full_matrix(sparse_test_data, spam_word)
full_test_data

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,1,0,4,4,0,5,1,2,2,0,...,0,0,0,0,0,0,0,0,0,0
12,1,6,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
14,1,0,1,0,1,0,1,0,1,2,...,0,0,0,0,0,0,0,0,0,0
15,1,0,1,3,0,3,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
17,1,0,0,0,0,1,0,1,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5783,0,2,2,0,2,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5786,0,5,2,0,0,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5788,0,0,2,1,0,6,0,4,2,0,...,0,0,0,0,0,0,0,0,0,0
5792,0,2,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
X_test = full_test_data.loc[:, full_test_data.columns != 'CATEGORY']
y_test = full_test_data['CATEGORY']

In [63]:
np.savetxt(TEST_TARGET_FILE, y_test)
np.savetxt(TEST_FEATURE_MATRIX, X_test)