In [53]:
import sys
import os
import numpy as np
import random
import re
import pytz
import collections
import cntk as C 
from stop_words import get_stop_words
from sklearn import preprocessing
from email.parser import Parser
from datetime import datetime

In [2]:
def load_emails(path):
    """Read every e-mail stored under ``path``, grouped by category folder.

    Each visible sub-directory of ``path`` is treated as one category and
    every file inside it is parsed as an e-mail message.

    Args:
        path: Directory whose sub-directories each hold the e-mails of one
            category.

    Returns:
        A ``(categories, emails)`` tuple. ``categories`` is the list of
        category folder names and ``emails[i]`` is the list of parsed
        ``email.message.Message`` objects found in ``categories[i]``.
    """
    # Build a filtered list instead of removing items from the list being
    # iterated: in-place removal skips the entry that follows each removed
    # one, so two adjacent hidden entries would leave the second in place.
    # Plain files next to the category folders are ignored as well, since
    # they cannot be listed as a directory.
    categories = [
        name for name in os.listdir(path)
        if not name.startswith('.') and os.path.isdir(os.path.join(path, name))
    ]

    parser = Parser()
    emails = []
    for category in categories:
        folder = os.path.join(path, category)
        messages = []
        for filename in os.listdir(folder):
            # Close each file as soon as it is parsed; a large corpus would
            # otherwise keep one open descriptor per message.
            with open(os.path.join(folder, filename)) as handle:
                messages.append(parser.parse(handle))
        emails.append(messages)
    return categories, emails

In [3]:
def relevant(emails, l):
    """Return the ``l`` most frequent cleaned words across all e-mails.

    Args:
        emails: List of folders, each a list of parsed messages.
        l: Maximum number of words to return.

    Returns:
        Words ordered from most to least frequent, truncated to ``l`` items.
    """
    frequencies = collections.Counter()
    for folder in emails:
        for message in folder:
            # Count the cleaned tokens of this message's payload.
            frequencies.update(clear_text(message.get_payload()))

    ranked = frequencies.most_common()
    return [word for word, _count in ranked[:l]]

In [4]:
# English stop-word list from the `stop_words` package; clear_text() drops
# any token found in it.
stop = get_stop_words('en')
def clear_text(email, stop_words=None):
    """Reduce raw message text to a list of lowercase, meaningful words.

    Non-letter characters are dropped, the text is lowercased and split on
    whitespace, and tokens that are stop words or shorter than three
    characters are discarded.

    Args:
        email: The message text as a string.
            NOTE(review): ``Message.get_payload()`` returns a list for
            multipart messages; this function expects text - confirm callers
            only pass string payloads.
        stop_words: Optional iterable of words to exclude. Defaults to the
            module-level ``stop`` list.

    Returns:
        The surviving tokens, in their original order (duplicates kept).
    """
    if stop_words is None:
        stop_words = stop
    # Set lookup keeps the per-token stop-word test constant-time.
    excluded = frozenset(stop_words)

    letters_and_spaces = ''.join(
        ch for ch in email if ch.isalpha() or ch.isspace()
    )
    # str.split() with no separator splits on every whitespace character,
    # matching the isspace() filter above. Splitting only on "\n", " " and
    # "\t" left characters such as "\r" or "\xa0" embedded inside tokens.
    tokens = letters_and_spaces.lower().split()
    return [tok for tok in tokens if len(tok) > 2 and tok not in excluded]

In [77]:
# UTC timezone object used as the common reference zone.
universal = pytz.timezone ("UTC")
# Timezone-aware reference instant (2000-01-01 00:00 UTC). Within this file it
# is only referenced by the disabled "delta" features in get_time_feat().
beginning_of_times = datetime(2000, 1, 1, 0, 0, 0, 0, universal)

def get_date(email):
    """Parse the ``Date`` header of ``email`` into a timezone-aware datetime.

    Args:
        email: A mapping-like message object exposing a ``'Date'`` header in
            the form ``"Mon, 14 May 2001 16:39:00 -0700 (PDT)"``; the
            trailing parenthesised zone comment is optional.

    Returns:
        A ``datetime`` carrying the UTC offset given in the header.

    Raises:
        TypeError: If the message has no ``Date`` header.
        ValueError: If the header does not match the expected layout.
    """
    raw = email['Date']
    # Strip an optional trailing "(ZONE)" comment by pattern rather than by
    # chopping a fixed six characters, so headers without a comment (or with
    # a longer zone name) parse too.
    cleaned = re.sub(r'\s*\([^()]*\)\s*$', '', raw).strip()
    return datetime.strptime(cleaned, '%a, %d %b %Y %H:%M:%S %z')

In [93]:
def make_dataset(categories, folders, n):
    """Turn every e-mail into a sparse feature row.

    Args:
        categories: Category names, parallel to ``folders``.
        folders: List of folders, each a list of parsed messages.
        n: Size of the vocabulary (number of most common words to use).

    Returns:
        A list with one row per e-mail. Each row holds the indices of the
        vocabulary words present in the message, followed by the message
        date (as returned by ``get_date``) and the index of its category.
    """
    words = relevant(folders, n)
    print("Extracted relavent words")

    data = []
    for fold, cat in zip(folders, categories):
        # The label is the same for the whole folder; look it up once.
        label = categories.index(cat)
        for email in fold:
            date = get_date(email)
            # A set makes each vocabulary membership test constant-time.
            tokens = set(clear_text(email.get_payload()))
            # Iterate over the vocabulary actually returned: it can be
            # shorter than n when the corpus has fewer distinct words, and
            # indexing words[i] for i < n would then raise IndexError.
            features = [i for i, word in enumerate(words) if word in tokens]
            data.append(features + [date, label])

        print("finished " + cat)

    return data

In [137]:
def savetxt(filename, ndarray):
    """Write dataset rows to ``filename`` in the text format read by CNTK.

    Each row produces one line of the form
    ``|labels <one-hot> |features <index>:<value> ...``.

    Args:
        filename: Destination path. Its parent directory is created if it
            does not exist yet.
        ndarray: Iterable of rows shaped like the output of
            ``make_dataset``: word indices, then the message date, then the
            class index as the last element.

    Relies on the module-level ``num_classes`` and ``input_dim`` values and
    on ``get_time_feat`` for the date columns.
    """
    # The directory name used to be computed but never used, so writing into
    # a folder that did not exist yet failed; create it up front instead.
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    print("Saving", filename )
    # One pre-rendered one-hot string per class, e.g. "0 1 0".
    labels = list(map(' '.join, np.eye(num_classes, dtype=np.uint).astype(str)))
    with open(filename, 'w') as f:
        for row in ndarray:
            label_str = labels[row[-1]]
            date = get_time_feat(row[-2])
            # Word features are binary indicators at their vocabulary index.
            word_feats = [str(i) + ":1" for i in row[:-2]]
            # Date features are written starting at index input_dim + 1.
            date_feats = [
                str(input_dim + offset + 1) + ":" + str(value)
                for offset, value in enumerate(date)
            ]
            feature_str = ' '.join(word_feats + date_feats)

            f.write('|labels {} |features {}\n'.format(label_str, feature_str))

In [160]:
def get_time_feat(date):
    """Convert a date into the list of numeric time features for one e-mail.

    Only the day of the month is emitted at present. Other candidates that
    were tried and are currently disabled: seconds and days elapsed relative
    to ``beginning_of_times``, the year, the month and the ISO weekday.

    Args:
        date: Object exposing a ``day`` attribute (e.g. a ``datetime``).

    Returns:
        A one-element list holding the day of the month.
    """
    return [date.day]

## To read emails from folders

In [161]:
# Load every category folder under ./enron; folders[i] holds the parsed
# messages that belong to categories[i].
categories, folders = load_emails("enron")

In [162]:
# Number of output classes: one per category folder that was loaded.
num_classes = len(categories)
#num_classes = 17
# Vocabulary size, i.e. how many of the most common words make_dataset() uses
# as bag-of-words features (also the length of the dictionary).
#input_dim = 5106
input_dim = 1000 

In [163]:
# Build one row per e-mail: indices of the vocabulary words it contains,
# followed by the message date and the class index.
data = make_dataset(categories, folders, input_dim)

Extracted relavent words
finished inbox
finished contacts
finished sent_items
finished operations_committee_isas
finished human_resources
finished settlements
finished preschedule
finished symesees
finished schedule_crawler
finished el_paso
finished personal
finished bill_williams_iii
finished calendar
finished hr
finished enron_messages
finished rt_strat
finished bill
finished california_messages
finished timbelden
finished canada
finished tie_meter_multipliers
finished gwolfe
finished rt_cuts
finished forney


In [164]:
# Shuffle the rows in place and write an 80% / 20% train / test split in the
# CNTK text format.
random.shuffle(data)
# Multiply before the integer division: len(data) // 10 * 8 discards the
# remainder first, so e.g. 15 rows would give only 8 training rows, not 12.
n = len(data) * 8 // 10
train, test = data[:n], data[n:]
savetxt("trainData/train.txt", train)
savetxt("trainData/test.txt", test)

Saving trainData/train.txt
Saving trainData/test.txt


In [165]:
# Widen the feature dimension so the date columns that savetxt() writes past
# the word indices fit into the CNTK input.
# NOTE(review): get_time_feat() currently returns a single value, so six extra
# slots looks like a leftover from a larger date feature set - confirm.
# NOTE: this cell is not idempotent; every re-run adds another 6.
input_dim += 6

## Load actual data and split to train / test 

In [23]:
# Alternative data path: split an existing CNTK-format file into train / test.
# This rebinds `train` / `test` to lists of text lines and overwrites
# trainData/train.txt and trainData/test.txt.
# Read through a context manager so the input file is closed right away.
with open("trainData/trainData.txt") as source:
    testData = source.readlines()
random.shuffle(testData)
# Multiply before the integer division so small files still split 80 / 20
# (len // 10 * 8 throws the remainder away before scaling).
n = len(testData) * 8 // 10
train, test = testData[:n], testData[n:]

with open("trainData/train.txt", 'w') as f:
    f.writelines(train)

with open("trainData/test.txt", 'w') as f:
    f.writelines(test)

# CNTK 

In [166]:
def create_reader(path, is_training, input_dim, num_label_classes):
    """Build a CNTK minibatch source over a CTF-formatted text file.

    Args:
        path: Path of the CTF file to read.
        is_training: When true the source is randomized and repeats
            indefinitely; otherwise it makes a single ordered sweep.
        input_dim: Shape of the sparse ``features`` stream.
        num_label_classes: Shape of the dense ``labels`` stream.

    Returns:
        The configured ``C.io.MinibatchSource``.
    """
    label_stream = C.io.StreamDef(field='labels', shape=num_label_classes, is_sparse=False)
    feature_stream = C.io.StreamDef(field='features', shape=input_dim, is_sparse=True)
    stream_defs = C.io.StreamDefs(labels=label_stream, features=feature_stream)
    deserializer = C.io.CTFDeserializer(path, stream_defs)

    if is_training:
        sweeps = C.io.INFINITELY_REPEAT
    else:
        sweeps = 1
    return C.io.MinibatchSource(deserializer, randomize=is_training, max_sweeps=sweeps)

In [167]:
# Network size: number of hidden Dense layers and their width, which scales
# with the number of classes.
num_hidden_layers = 2
hidden_layers_dim = num_classes * 10

# CNTK input placeholders for the feature vector and the one-hot label.
# NOTE: `input` shadows the Python builtin of the same name; later cells
# refer to this variable by that name.
input = C.input_variable(input_dim)
label = C.input_variable(num_classes)

In [168]:
def create_model(features):
    """Stack the fully connected classifier on top of ``features``.

    Builds ``num_hidden_layers`` Dense layers of width ``hidden_layers_dim``
    (ReLU activation, Glorot-uniform initialisation) followed by a linear
    Dense layer with ``num_classes`` outputs.

    Args:
        features: CNTK variable or function providing the input features.

    Returns:
        The CNTK function producing the un-normalised class scores.
    """
    with C.layers.default_options(init = C.layers.glorot_uniform(), activation = C.ops.relu):
        activations = features
        for _layer in range(num_hidden_layers):
            hidden = C.layers.Dense(hidden_layers_dim)
            activations = hidden(activations)
        # No activation on the output layer: softmax is applied by the loss.
        output_layer = C.layers.Dense(num_classes, activation = None)
        return output_layer(activations)


# Model instance wired to the `input` placeholder.
z = create_model(input)

In [169]:
# Training criterion (softmax cross-entropy) and evaluation metric
# (classification error), both computed from the model output `z`.
loss = C.cross_entropy_with_softmax(z, label)
label_error = C.classification_error(z, label)

# Plain SGD with a constant per-minibatch learning rate.
# NOTE(review): the recorded training run below reports "Loss: nan" on the
# first minibatch; the learning rate and the unscaled date feature are
# plausible causes - confirm before relying on these settings.
learning_rate = 0.2
lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
learner = C.sgd(z.parameters, lr_schedule)
trainer = C.Trainer(z, (loss, label_error), [learner])

In [170]:
def moving_average(a, w=5):
    """Smooth a sequence with a trailing window of ``w`` values.

    The first ``w`` entries are passed through unchanged; every later entry
    is replaced by the mean of the ``w`` values that precede it. Sequences
    shorter than ``w`` are returned as ``a[:]``.

    A more efficient implementation is possible with ``np.cumsum()``.

    Args:
        a: Sliceable sequence of numbers.
        w: Window length.

    Returns:
        ``a[:]`` when ``len(a) < w``, otherwise a new list of smoothed values.
    """
    if len(a) < w:
        return a[:]

    smoothed = []
    for position, current in enumerate(a):
        if position < w:
            smoothed.append(current)
        else:
            window = a[(position - w):position]
            smoothed.append(sum(window) / w)
    return smoothed


def print_training_progress(trainer, mb, frequency, verbose=1):
    """Report the trainer's latest loss and error every ``frequency`` minibatches.

    Args:
        trainer: Object exposing ``previous_minibatch_loss_average`` and
            ``previous_minibatch_evaluation_average``.
        mb: Index of the current minibatch.
        frequency: Reporting interval, in minibatches.
        verbose: When truthy, print the figures on reporting minibatches.

    Returns:
        ``(mb, training_loss, eval_error)``. Both values are the string
        ``"NA"`` on minibatches that are not reporting points.
    """
    # Guard clause: nothing to read or print between reporting points.
    if mb % frequency != 0:
        return mb, "NA", "NA"

    training_loss = trainer.previous_minibatch_loss_average
    eval_error = trainer.previous_minibatch_evaluation_average
    if verbose:
        print ("Minibatch: {0}, Loss: {1:.4f}, Error: {2:.2f}%".format(mb, training_loss, eval_error*100))

    return mb, training_loss, eval_error

In [174]:
# Training schedule: the total number of minibatches is derived from a nominal
# sweep size, the number of sweeps and the minibatch size.
minibatch_size = 100
num_samples_per_sweep = 5000#len(train)
num_sweeps_to_train_with = 100
# True division yields a float here; the training loop converts it with int().
num_minibatches_to_train = (num_samples_per_sweep * num_sweeps_to_train_with) / minibatch_size
# os.path.join() with a single argument returns the path unchanged.
train_file = os.path.join("trainData/train.txt")

In [175]:
# Reader over the training file (randomized, repeats indefinitely).
reader_train = create_reader(train_file, True, input_dim, num_classes)

# Map the model's input variables to the reader's streams.
input_map = {
    label  : reader_train.streams.labels,
    input  : reader_train.streams.features
} 

# Run the trainer, reporting progress every `training_progress_output_freq`
# minibatches.
training_progress_output_freq = 500

# NOTE: this loop rebinds the module-level names `data` (previously the
# dataset rows) and `loss` (previously the CNTK loss function) and sets
# `error`; re-run the earlier cells before reusing those names.
for i in range(0, int(num_minibatches_to_train)):
    data = reader_train.next_minibatch(minibatch_size, input_map = input_map)
    
    trainer.train_minibatch(data)
    batchsize, loss, error = print_training_progress(trainer, i, training_progress_output_freq, verbose=1)

Minibatch: 0, Loss: nan, Error: 95.00%


KeyboardInterrupt: 

In [124]:
# Evaluate the trained model on the held-out file with a single-sweep,
# non-randomized reader.
test_file = os.path.join("trainData/test.txt")
reader_test = create_reader(test_file, False, input_dim, num_classes)
input_map = {
    label  : reader_test.streams.labels,
    input  : reader_test.streams.features
} 

# Request one very large minibatch (up to 1,000,000 samples) and evaluate it
# in one call.
# NOTE(review): presumably sized to cover the whole test file - confirm it
# exceeds the number of test rows.
data = reader_test.next_minibatch(1000000, input_map = input_map)
error = trainer.test_minibatch(data)
print('Error rate on an unseen minibatch: {:.2f}%'.format(error*100))

Error rate on an unseen minibatch: 56.83%
