# Text Classification
*Complete and hand in this completed worksheet (including its outputs and any supporting code outside of the worksheet) with your assignment submission. Please check the pdf file for more details.*

In this exercise you will:
    
- implement a of spam classifier with **Naive Bayes method** for real world email messages
- learn the **training and testing phase** for Naive Bayes classifier  
- get an idea of the **precision-recall** tradeoff

In [2]:
# some basic imports
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [3]:
# ham_train contains the occurrences of each word in ham emails. 1-by-N vector
ham_train = np.loadtxt('ham_train.csv', delimiter=',')
# spam_train contains the occurrences of each word in spam emails. 1-by-N vector
spam_train = np.loadtxt('spam_train.csv', delimiter=',')
# N is the size of vocabulary.
N = ham_train.shape[0]
# There 9034 ham emails and 3372 spam emails in the training samples
num_ham_train = 9034
num_spam_train = 3372
# Do smoothing, Forcibly add one????????
x = np.vstack([ham_train, spam_train]) + 1

# ham_test contains the occurences of each word in each ham test email. P-by-N vector, with P is number of ham test emails.
i,j,ham_test = np.loadtxt('ham_test.txt').T
i = i.astype(np.int)
j = j.astype(np.int)
ham_test_tight = scipy.sparse.coo_matrix((ham_test, (i - 1, j - 1)))
ham_test = scipy.sparse.csr_matrix((ham_test_tight.shape[0], ham_train.shape[0]))
ham_test[:, 0:ham_test_tight.shape[1]] = ham_test_tight
# spam_test contains the occurences of each word in each spam test email. Q-by-N vector, with Q is number of spam test emails.
i,j,spam_test = np.loadtxt('spam_test.txt').T
i = i.astype(np.int)
j = j.astype(np.int)
spam_test_tight = scipy.sparse.csr_matrix((spam_test, (i - 1, j - 1)))
spam_test = scipy.sparse.csr_matrix((spam_test_tight.shape[0], spam_train.shape[0]))
spam_test[:, 0:spam_test_tight.shape[1]] = spam_test_tight




## Now let's implement a ham/spam email classifier. Please refer to the PDF file for details

In [4]:
from likelihood import likelihood
import heapq 
# TODO
# Implement a ham/spam email classifier, and calculate the accuracy of your classifier

# Hint: you can directly do matrix multiply between scipy.sparse.coo_matrix and numpy.array.
# Specifically, you can use sparse_matrix * np_array to do this. Note that when you use "*" operator
# between numpy array, this is typically an elementwise multiply.

# begin answer
# ham_train[w]=num, the w-th item represent the occurences w-id word
# ham_test[i,j]=num, i is the id of e-mail, j is the id of word, num is the num of occurences of j-id word in i-id email, a sparse matrix
total_num = np.sum(x, axis=1)
p_x = (x.T/total_num).T
# all item in p_x[0,:] > 0, no devide err
ratio = p_x[1,:]/ p_x[0,:]
word_idx_h10 = heapq.nlargest(10, range(ratio.shape[0]), ratio.take)
word_map = {}
with open('all_word_map.txt','r') as f:
    lines = f.readlines()
    for line in lines:
        word, idx = line.split('\t')
        word_map.update({int(idx)-1:word})

for idx in word_idx_h10:
    print('id ',idx, word_map[idx], ': ', ratio[idx],  spam_train[idx], '/spam', ham_train[idx], '/ham')
# end answer

id  30032 nbsp :  1325.1002358991152 385.0 /spam 0.0 /ham
id  75525 viagra :  1249.5763882571969 363.0 /spam 0.0 /ham
id  38175 pills :  1101.9615951389017 320.0 /spam 0.0 /ham
id  45152 cialis :  847.9268348888121 246.0 /spam 0.0 /ham
id  9493 voip :  837.6281283921868 243.0 /spam 0.0 /ham
id  65397 php :  768.9700850813518 223.0 /spam 0.0 /ham
id  37567 meds :  672.8488244461829 195.0 /spam 0.0 /ham
id  13612 computron :  652.2514114529324 189.0 /spam 0.0 /ham
id  56929 sex :  614.4894876319731 178.0 /spam 0.0 /ham
id  9452 ooking :  518.3682269968041 150.0 /spam 0.0 /ham


In [42]:
lh = likelihood(x) # now lh is p(x_i|c)
prior = np.asarray([num_ham_train, num_spam_train]) / (num_ham_train+num_spam_train) # prior prob
log_lh = np.log(lh)
# to avoid underflow, replace \prod p(x_i|c) * p(c) with ln(\prod p(x_i|c) * p(c)) which is \sum ln(p(x_i|c)) + ln(p(c))

# for ham_test
ham_test_dense = ham_test.todense()
lh_ham_doc = np.dot(log_lh, ham_test_dense.T)
post_ham_doc = (lh_ham_doc.T + np.log(prior)).T
err = np.sum(np.argmax(post_ham_doc, axis=0))
print(err, '/', post_ham_doc.shape[1])

# for ham_test
spam_test_dense = spam_test.todense()
lh_spam_doc = np.dot(log_lh, spam_test_dense.T)
post_spam_doc = (lh_spam_doc.T + np.log(prior)).T
err = np.sum(1-np.argmax(post_spam_doc, axis=0))
print(err, '/', post_spam_doc.shape[1])

28 / 3011
31 / 1124


In [38]:
print(post_ham_doc[:10])
print(post_spam_doc[:10])

[[-8.63432865e+02 -1.25786303e+03 -3.71034376e+03 ... -7.34837598e+02
  -6.10116523e+01 -3.39988551e+04]
 [-3.89812600e+02 -5.39209481e+02 -1.63046145e+03 ... -2.85201701e+02
  -3.24767904e+01 -1.45029748e+04]]
[[ -575.70594534 -2922.10184967 -3124.69830799 ...  -435.66099882
   -787.64474746  -953.19199775]
 [ -194.29733022  -976.62839931 -1108.65874498 ...  -146.2524857
   -280.35154577  -305.25967185]]
