In [7]:
import pandas as pd
import numpy as np

In [8]:
# Vocabulary size; handed to make_full_matrix as the number of token columns.
VOCAB_SIZE = 2500

# Sparse input tables (read with np.loadtxt).
TRAINING_DATA_FILE, TEST_DATA_FILE = (
    "02_Training/train-data.txt",
    "02_Training/test-data.txt",
)

# Outputs: per-token probability vectors estimated from the training set.
TOKEN_SPAM_PROB_FILE, TOKEN_HAM_PROB_FILE, TOKEN_ALL_PROB_FILE = (
    "03_Testing/prob-spam.txt",
    "03_Testing/prob-ham.txt",
    "03_Testing/prob-all.txt",
)

# Outputs: dense test features and their labels.
TEST_FEATURE_MATRIX, TEST_TARGET_FILE = (
    "03_Testing/test-feature.txt",
    "03_Testing/test-target.txt",
)

In [9]:
# Both sparse tables share one format: space-delimited integer columns.
sparse_train_data, sparse_test_data = (
    np.loadtxt(data_file, delimiter=' ', dtype=int)
    for data_file in (TRAINING_DATA_FILE, TEST_DATA_FILE)
)

In [10]:
# Number of distinct document ids (column 0) in the sparse test table.
test_doc_ids = np.unique(sparse_test_data[:, 0])
len(test_doc_ids)

1724

In [11]:
def make_full_matrix(sparse_matrix, nr_words, doc_idx=0, word_idx=1, cat_idx=2, freq_idx=3):
    """Expand a sparse (doc, word, label, count) table into a dense DataFrame.

    Parameters
    ----------
    sparse_matrix : 2-D array-like
        One row per (document, token) pair.
    nr_words : int
        Vocabulary size; the result gets token columns ``0 .. nr_words - 1``.
    doc_idx, word_idx, cat_idx, freq_idx : int
        Positions of the document id, token id, category label and
        occurrence count within each row of ``sparse_matrix``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct document id (sorted ascending, index named
        ``"DOC_ID"``), a ``"CATEGORY"`` column, then one column per token id.
        Cells without an entry in ``sparse_matrix`` are 0.

    Raises
    ------
    ValueError
        If any token id lies outside ``[0, nr_words)``.
    """
    sparse_matrix = np.asarray(sparse_matrix)
    word_ids = sparse_matrix[:, word_idx].astype(np.intp)

    # Reject ids that have no column instead of silently writing elsewhere.
    if word_ids.size and (word_ids.min() < 0 or word_ids.max() >= nr_words):
        raise ValueError(
            "word ids must lie in [0, {}); got range [{}, {}]".format(
                nr_words, word_ids.min(), word_ids.max()
            )
        )

    # Sorted distinct document ids plus, for every sparse row, the position
    # of its document in that sorted list (i.e. its dense row number).
    doc_ids, row_positions = np.unique(sparse_matrix[:, doc_idx], return_inverse=True)

    # Column 0 holds CATEGORY; column (word_id + 1) holds that token's count.
    dense = np.zeros((doc_ids.size, nr_words + 1), dtype=sparse_matrix.dtype)

    # Plain sequential writes keep "last entry wins" semantics for any
    # repeated (document, token) pair, while avoiding slow DataFrame.at calls.
    entries = zip(
        row_positions.tolist(),
        word_ids.tolist(),
        sparse_matrix[:, cat_idx].tolist(),
        sparse_matrix[:, freq_idx].tolist(),
    )
    for row, word_id, label, occurrence in entries:
        dense[row, 0] = label
        dense[row, word_id + 1] = occurrence

    column_names = ["CATEGORY"] + list(range(nr_words))
    return pd.DataFrame(dense, index=pd.Index(doc_ids, name="DOC_ID"), columns=column_names)
    

In [13]:
# Dense training matrix: one row per e-mail, CATEGORY plus one column per token.
full_train_data = make_full_matrix(
    sparse_matrix=sparse_train_data,
    nr_words=VOCAB_SIZE,
)

# Find Total # of Tokens (Words)

In [16]:
# Keep every column except the label, then add up all token counts.
train_token_columns = full_train_data.columns != "CATEGORY"
full_train_features = full_train_data.loc[:, train_token_columns]
tokens_per_email = full_train_features.sum(axis=1)
total_words = tokens_per_email.sum()

# Find # of Tokens in Spam & Ham Emails

In [17]:
# Column mask dropping the label; row masks selecting each class.
# NOTE(review): the inequality tests put a row in *both* classes if CATEGORY
# ever holds a value other than 0 or 1 - presumably labels are binary; confirm.
token_columns = full_train_data.columns != "CATEGORY"
spam_rows = full_train_data["CATEGORY"] != 0
ham_rows = full_train_data["CATEGORY"] != 1

spam_train_features = full_train_data.loc[spam_rows, token_columns]
ham_train_features = full_train_data.loc[ham_rows, token_columns]

# Total number of tokens across all e-mails of each class.
total_spam_words = spam_train_features.sum(axis=1).sum()
total_ham_words = ham_train_features.sum(axis=1).sum()
print(total_spam_words, total_ham_words)

178663 252325


# Find # of Occurrences of Each Token in Spam & Ham Emails (with +1 Smoothing)

In [19]:
# Per-token occurrence counts within each class. The +1 is add-one smoothing,
# so no token ends up with a zero count.
spam_mask = full_train_data["CATEGORY"] != 0
ham_mask = full_train_data["CATEGORY"] != 1

spam_token_counts = full_train_features.loc[spam_mask].sum(axis=0)
ham_token_counts = full_train_features.loc[ham_mask].sum(axis=0)

summed_spam_tokens = spam_token_counts + 1
summed_ham_tokens = ham_token_counts + 1
print(summed_spam_tokens, summed_ham_tokens)

0       2177
1        936
2       1220
3       2026
4       1211
        ... 
2495      23
2496       2
2497       4
2498       2
2499      25
Length: 2500, dtype: int64 0       5483
1       2588
2       2044
3        938
4       1611
        ... 
2495       1
2496      22
2497      30
2498      37
2499       3
Length: 2500, dtype: int64


# Finding P(Token | Spam) and P(Token | Ham)

In [20]:
# Smoothed conditional token probabilities. The summed_*_tokens counts already
# carry +1 per token, so each denominator is enlarged by one pseudo-count per
# vocabulary entry. VOCAB_SIZE replaces the former literal 2500 so the
# denominator stays in step with the configured vocabulary size.
prob_tokens_spam = summed_spam_tokens / (total_spam_words + VOCAB_SIZE)
prob_tokens_ham = summed_ham_tokens / (total_ham_words + VOCAB_SIZE)
print(prob_tokens_spam, prob_tokens_ham)

0       0.012017
1       0.005167
2       0.006734
3       0.011183
4       0.006685
          ...   
2495    0.000127
2496    0.000011
2497    0.000022
2498    0.000011
2499    0.000138
Length: 2500, dtype: float64 0       0.021517
1       0.010156
2       0.008021
3       0.003681
4       0.006322
          ...   
2495    0.000004
2496    0.000086
2497    0.000118
2498    0.000145
2499    0.000012
Length: 2500, dtype: float64


# Finding P(Token)

In [21]:
# Share of all training tokens accounted for by each token (no smoothing here).
token_totals = full_train_features.sum(axis=0)
prob_tokens_all = token_totals / total_words

In [22]:
# Persist the three probability vectors, one value per line.
for prob_file, token_probs in (
    (TOKEN_SPAM_PROB_FILE, prob_tokens_spam),
    (TOKEN_HAM_PROB_FILE, prob_tokens_ham),
    (TOKEN_ALL_PROB_FILE, prob_tokens_all),
):
    np.savetxt(prob_file, token_probs)

# Saving Test Data

In [23]:
# Dense test matrix, built the same way as the training matrix.
full_test_data = make_full_matrix(
    sparse_matrix=sparse_test_data,
    nr_words=VOCAB_SIZE,
)

In [24]:
# Split the dense test matrix into token counts and labels, then save both.
test_token_columns = full_test_data.columns != "CATEGORY"
full_test_features = full_test_data.loc[:, test_token_columns]
full_test_target = full_test_data["CATEGORY"]

for out_file, test_part in (
    (TEST_FEATURE_MATRIX, full_test_features),
    (TEST_TARGET_FILE, full_test_target),
):
    np.savetxt(out_file, test_part)