1.	Aggregate the data into one labeled data set.
2.	Store the labels in a separate file with the following on a separate line for each email file: 
EMAIL_ID	LABEL (with a tab in between)
3.	Divide the data into 10 “folds” (or subsets)
4.	Iteratively hold out one of the folds as test data.  Train on the other 9.
5.	Record the results in a master results file.  Each line should have the form
EMAIL_ID	LABEL	CLASSIFIED_AS (with tabs in between)
6.	By the end of all 10 experiments, all of the email files will have been part of a test set, and your master results file will contain results for the whole data set
7.	Compute Precision, Recall, F-score and Accuracy for the complete experiment.
8.	Repeat the 10-fold experiment, but this time without smoothing.  Instead, simply ignore any unseen words; that is, do not add them to your running total of log-probabilities.  Compute all stats.  Does smoothing matter for this problem or not?

In [94]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import math
import re
import pandas as pd
import seaborn as sn
from util import extract_word, get_words, process_files, predict, summary, get_truths

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [83]:
# Locations of the labelled training corpora and of the test set.
ham_dir = 'ham'
ham_filelist = ham_dir + '/hamFileList.txt'
spam_dir = 'spam'
spam_filelist = spam_dir + '/spamFileList.txt'
test_dir = 'test'
truth_file = test_dir + '/truth'

# Test emails are addressed by number: test/1 ... test/100.
test_files = [(test_dir + '/' + str(x)) for x in range(1, 101)]

# Each file list is read one line at a time and every line is parsed with
# int() before the path is built, which strips surrounding whitespace and
# leading zeros from the id.  Blank lines are skipped: int() raises ValueError
# on an empty / whitespace-only string, so a stray empty line would otherwise
# abort the whole load.
with open(ham_filelist, 'r') as f:
  ham_files = [ham_dir + '/' + str(int(line)) for line in f if line.strip()]

with open(spam_filelist, 'r') as f:
  spam_files = [spam_dir + '/' + str(int(line)) for line in f if line.strip()]

In [84]:
# Labels for the test emails, loaded by util.get_truths from test/truth.
# The result is used further down as a mapping (it is iterated with
# .iteritems()); its keys are appended to the list of email files and its
# values are stored as their labels.
truths = get_truths(truth_file)

In [85]:
# 1. Aggregate the data into one labeled data set.
# all_files lists every email in insertion order (ham, then spam, then the
# entries of truths); all_labels maps each of those emails to its label.
all_files = list(ham_files)
all_labels = dict.fromkeys(ham_files, 'Ham')

all_files.extend(spam_files)
all_labels.update(dict.fromkeys(spam_files, 'Spam'))

# The test emails carry their own labels, taken from the truth mapping.
for path, label in truths.iteritems():
  all_files.append(path)
  all_labels[path] = label

In [86]:
# Sanity check: the two counts match only if no email was added twice
# (all_labels is keyed by email, all_files is a plain list).
print(len(all_labels))
print(len(all_files))

1100
1100


In [87]:
# 2. Store the labels in a separate file, one line per email file:
# EMAIL_ID LABEL (with a tab in between)
with open('all_labels.txt', 'w') as labels_out:
  labels_out.writelines(
      email_id + '\t' + label + '\n'
      for email_id, label in all_labels.iteritems())

In [88]:
# 3. Divide the data into 10 "folds" (or subsets)
num_folds = 10

import numpy as np
from random import shuffle
# Shuffle in place so that each fold gets a mix of emails instead of following
# the ham / spam / test order in which all_files was built.
# NOTE(review): the shuffle is unseeded, so fold membership changes from run
# to run -- seed the RNG if the reported numbers need to be reproducible.
shuffle(all_files)
# np.array_split returns a list of num_folds arrays and, unlike np.split, does
# not require the length to be an exact multiple of num_folds.
folds = np.array_split(all_files, num_folds)

In [89]:
def process(fold_train, all_labels, tokenizer=None):
  """Count word occurrences per class over a set of training emails.

  Args:
    fold_train: iterable of email identifiers to train on; each one must be
      a key of all_labels.
    all_labels: mapping from email identifier to its label.  An email
      labelled 'Ham' is counted as ham; any other label is counted as spam.
    tokenizer: optional callable that takes an email identifier and returns
      an iterable of its words.  When omitted, the module-level get_words
      is used, exactly as before this parameter existed.

  Returns:
    A 4-tuple (ham_map, total_ham_words, spam_map, total_spam_words).  Each
    map is a plain dict of word -> number of occurrences in that class and
    each total is the number of word occurrences counted for that class.

  Raises:
    KeyError: if an email in fold_train has no entry in all_labels.
  """
  if tokenizer is None:
    tokenizer = get_words

  ham_map = {}
  spam_map = {}
  total_ham_words = 0
  total_spam_words = 0

  for email in fold_train:
    is_ham = all_labels[email] == 'Ham'
    # Both classes are counted the same way; only the destination differs.
    counts = ham_map if is_ham else spam_map
    words_in_email = 0
    for word in tokenizer(email):
      counts[word] = counts.get(word, 0) + 1
      words_in_email += 1
    if is_ham:
      total_ham_words += words_in_email
    else:
      total_spam_words += words_in_email

  return ham_map, total_ham_words, spam_map, total_spam_words

In [90]:
# 4. Iteratively hold out one of the folds as test data. Train on the other 9.

all_predictions = {}

# Not needed by the loop below, but kept defined because other cells may
# reference these names.  fold_size is the nominal fold length (floor
# division); it is exact only when the number of files is a multiple of
# num_folds.
all_files_array = np.array(all_files)
fold_size = len(all_files) // num_folds

for fold in range(num_folds):
  fold_test = folds[fold]
  # Train on every fold except the held-out one.  The training set is built
  # from `folds` itself rather than from fold_size index arithmetic, so that
  # train and test stay disjoint and jointly cover all files even when
  # np.array_split produced folds of unequal length.
  fold_train = np.concatenate([folds[i] for i in range(num_folds) if i != fold])

  ham_map, total_ham_words, spam_map, total_spam_words = process(fold_train, all_labels)
  # NOTE(review): the trailing 1 and 200000 are passed through to util.predict
  # unchanged; presumably smoothing constant and vocabulary size -- confirm.
  predictions = predict(fold_test, ham_map, total_ham_words, spam_map, total_spam_words, 1, 200000)
  all_predictions.update(predictions)

In [91]:
# 5. Record the results in a master results file, one line per email, in the
# form EMAIL_ID LABEL CLASSIFIED_AS (with tabs in between).
# 6. all_predictions was filled from every fold's test run, so this file holds
# the results accumulated over all of the experiments.

with open('all_predictions.txt', 'w') as results_out:
  for email_id, predicted in all_predictions.iteritems():
    row = (email_id, all_labels[email_id], predicted)
    results_out.write('\t'.join(row) + '\n')

In [92]:
# 7. Compute Precision, Recall, F-score and Accuracy for the complete experiment.
tp, tn, fp, fn, precision, recall, fscore, accuracy = summary(all_predictions, all_labels)

# Printed in a fixed order: the four confusion-matrix counts first, then the
# scores derived from them.
report = [
  ("True positive = ", tp),
  ("False positive = ", fp),
  ("False negative = ", fn),
  ("True negative = ", tn),
  ("Precision = ", precision),
  ("Recall = ", recall),
  ("F-score = ", fscore),
  ("Accuracy = ", accuracy),
]
for prefix, value in report:
  print(prefix + str(value))

True positive = 499
False positive = 5
False negative = 51
True negative = 545
Precision = 0.990079365079
Recall = 0.907272727273
F-score = 0.946869070209
Accuracy = 0.949090909091


In [95]:
# 8. Repeat the 10-fold experiment, but don't do smoothing. Instead, just ignore any unseen words.
# That is, just don't add them to the running total of log-probs. Compute all stats. Does smoothing matter
# for this problem or not?

all_predictions_no_smoothing = {}

for fold in range(num_folds):
  fold_test = folds[fold]
  # Train on every fold except the held-out one.  The training set is built
  # from `folds` itself rather than from index arithmetic on a fold size, so
  # that train and test stay disjoint and jointly cover all files even when
  # np.array_split produced folds of unequal length.
  fold_train = np.concatenate([folds[i] for i in range(num_folds) if i != fold])

  ham_map, total_ham_words, spam_map, total_spam_words = process(fold_train, all_labels)
  # Same call as the smoothed experiment except for smooth=False.
  predictions = predict(fold_test, ham_map, total_ham_words, spam_map, total_spam_words, 1, 200000, smooth=False)
  all_predictions_no_smoothing.update(predictions)

tp, tn, fp, fn, precision, recall, fscore, accuracy = summary(all_predictions_no_smoothing, all_labels)
# Printed in a fixed order: the four confusion-matrix counts first, then the
# scores derived from them.
report = [
  ("True positive = ", tp),
  ("False positive = ", fp),
  ("False negative = ", fn),
  ("True negative = ", tn),
  ("Precision = ", precision),
  ("Recall = ", recall),
  ("F-score = ", fscore),
  ("Accuracy = ", accuracy),
]
for prefix, value in report:
  print(prefix + str(value))

True positive = 41
False positive = 543
False negative = 509
True negative = 7
Precision = 0.0702054794521
Recall = 0.0745454545455
F-score = 0.0723104056437
Accuracy = 0.0436363636364
