In [1]:
import sys
import os
import numpy as np
import random
import re
import pytz
from datetime import datetime
from cntk import Trainer, Axis
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs,\
        INFINITELY_REPEAT
from cntk.learners import sgd, learning_rate_schedule, UnitType
from cntk import input_variable, cross_entropy_with_softmax, \
        classification_error, sequence
from cntk.logging import ProgressPrinter
from cntk.layers import Sequential, Embedding, Recurrence, LSTM, Dense
from sklearn import preprocessing
from email.parser import Parser

In [2]:
def load_emails(path):
    """Load parsed emails from a directory holding one sub-folder per category.

    Args:
        path: Directory whose immediate entries are the category folders.
            Entries whose name starts with '.' are ignored.

    Returns:
        A ``(categories, emails)`` tuple. ``categories`` is the list of
        category folder names (in ``os.listdir`` order) and ``emails[i]`` is
        the list of messages parsed from the files in ``categories[i]``.
    """
    # Build a filtered copy instead of calling remove() while iterating:
    # removing during iteration skips the entry that follows each removed
    # one, so two adjacent hidden entries left the second one in the list.
    categories = [c for c in os.listdir(path) if not c.startswith('.')]
    parser = Parser()
    emails = []
    for direc in categories:
        folder = os.path.join(path, direc)
        parsed = []
        for name in os.listdir(folder):
            # The context manager closes each handle; previously one open
            # file per email was leaked.
            with open(os.path.join(folder, name)) as handle:
                parsed.append(parser.parse(handle))
        emails.append(parsed)
    return categories, emails

In [3]:
def load_articles(train_p="data/20ng-train-all-terms.txt",
                  test_p="data/20ng-test-all-terms.txt"):
    """Read the raw article lines of the training and test files.

    Args:
        train_p: Path of the training file. Defaults to the location that
            used to be hard-coded.
        test_p: Path of the test file. Defaults to the location that used to
            be hard-coded.

    Returns:
        A ``(train, test)`` tuple of lists of lines, exactly as returned by
        ``readlines()`` (line terminators included).
    """
    # Context managers guarantee the handles are closed even if reading fails.
    with open(train_p) as f_train:
        train = f_train.readlines()
    with open(test_p) as f_test:
        test = f_test.readlines()
    return train, test

In [4]:
def make_dataset(data):
    """Split raw article lines into token lists and their labels.

    Each line is split at its first TAB: the text before it is the label and
    the text after it is split on single spaces into tokens.

    Args:
        data: Iterable of lines, e.g. the output of ``readlines()``.

    Returns:
        A ``(features, labels)`` tuple where ``features[i]`` is the token
        list and ``labels[i]`` the label of line ``i``.
    """
    labels = []
    features = []
    for line in data:
        # Drop the line terminator first; otherwise it stays attached to the
        # last token ("word\n") and that token can never be found in an
        # embedding dictionary.
        label, _, text = line.rstrip('\r\n').partition('\t')
        labels.append(label)
        features.append(text.split(' '))
    return features, labels

In [5]:
def read_embedding(path):
    """Read a text embedding file and convert it to a dictionary.

    The first line must be a header made of two integers (dictionary size and
    vector length). Every following line is ``<token> <v1> <v2> ...`` with
    single spaces between the fields.

    Args:
        path: Path of the embedding file.

    Returns:
        Dict mapping each token to the list of its vector components. The
        components are kept as strings, exactly as read from the file.

    Raises:
        ValueError: If the header line is not made of exactly two integers.
    """
    word2vec = {}
    with open(path) as file:
        # Reading the header separately both validates it and keeps it out of
        # the dictionary. Previously it was popped from a *copy* of the line
        # list, so the header itself ended up stored as a word.
        _dictionary_len, _vector_len = map(int, file.readline().split())
        for line in file:
            # Strip the terminator and trailing spaces rather than blindly
            # cutting the last character, which truncated the final value of
            # a last line without a newline.
            entry = line.rstrip('\r\n ')
            if not entry:
                continue
            arr = entry.split(' ')
            word2vec[arr[0]] = arr[1:]
    return word2vec

In [6]:
def one_hot_encode(labels, n_classes=None):
    """One-hot encode a sequence of labels.

    Labels are mapped to integer codes by their position among the sorted
    distinct labels of ``labels``, then each code is turned into a row with a
    single 1.

    Args:
        labels: Sequence of labels.
        n_classes: Width of the one-hot rows. When omitted, the module-level
            ``num_classes`` is used, which was the only previous behaviour.

    Returns:
        float32 array of shape ``(len(labels), n_classes)``.

    Raises:
        IndexError: If ``labels`` holds more distinct values than
            ``n_classes``.
    """
    if n_classes is None:
        n_classes = num_classes
    # np.unique sorts the distinct labels and returns, for every input label,
    # its index in that sorted order. This is the same integer coding a label
    # encoder fitted on `labels` produces, without fitting twice.
    _, labels_arr = np.unique(labels, return_inverse=True)
    encoded = np.zeros((len(labels_arr), n_classes), dtype="float32")
    # Set one cell per row in a single vectorised assignment.
    encoded[np.arange(len(labels_arr)), labels_arr] = 1
    return encoded

In [7]:
def extract_text(data, fields):
    """Dump cleaned email bodies and email addresses to two text files.

    Writes the cleaned payload of every email to ``extracted_text`` and the
    addresses found in the given header fields to ``addresses``, both in the
    current working directory. Existing files are overwritten.

    Args:
        data: Iterable of categories, each an iterable of email messages.
        fields: Header names whose values are scanned for addresses.
    """
    # Both files are closed (and therefore flushed) when the block exits.
    # Previously they were never closed, so buffered text could still be
    # missing from disk when another tool read the files.
    with open("extracted_text", 'w') as file, open("addresses", 'w') as addresses:
        for clas in data:
            for email in clas:
                file.write(clear_text(email.get_payload()))

                # addresses
                for field in fields:
                    address = get_addresses(email[field])
                    if address:
                        addresses.write(' '.join(address) + ' ')

In [8]:
from stop_words import get_stop_words
# Stop-word list requested for 'en'; clear_text drops every token found in it.
stop = get_stop_words('en')
def clear_text(email):
    """Reduce raw text to a space-separated string of content words.

    Only alphabetic and whitespace characters are kept; the result is
    lower-cased and split on runs of newlines, spaces and tabs. Tokens that
    appear in the module-level ``stop`` list or have two characters or fewer
    are dropped. The returned string ends with a trailing space.
    """
    letters_and_spaces = ''.join(
        filter(lambda ch: ch.isalpha() or ch.isspace(), email))
    kept = []
    for token in re.split(r'[\n \t]+', letters_and_spaces.lower()):
        if token not in stop and len(token) > 2:
            kept.append(token)
    return '{} '.format(' '.join(kept))

In [9]:
# UTC timezone object; get_date converts parsed email dates to it.
universal = pytz.timezone ("UTC")
# Reference instant (2000-01-01 00:00:00 UTC) that get_date subtracts email dates from.
beginning_of_times = datetime(2000, 1, 1, 0, 0, 0, 0, universal)

def get_date(email):
    """Turn an email's ``Date`` header into a 100-element feature list.

    The last six characters of the header are discarded before parsing
    (presumably a trailing " (TZ)" comment -- confirm against the data) and
    the rest is parsed as ``'%a, %d %b %Y %H:%M:%S %z'``.

    Layout of the returned list:
        [0]  ``beginning_of_times`` minus the date, in seconds
        [1]  the same difference in whole days
        [2]  those days floor-divided by 30
        [3]  those days floor-divided by 365
        [10] ISO weekday, [11] year, [12] month, [13] hour of the parsed
             date (in the offset given by the header)
    Every other slot is 0.
    """
    stamp = email['Date'][:-6]
    date = datetime.strptime(stamp, '%a, %d %b %Y %H:%M:%S %z')
    delta = beginning_of_times - date.astimezone(universal)

    date_f = [0] * 100
    # Equal-length slice assignments fill the two groups of slots.
    date_f[0:4] = [
        delta.total_seconds(),
        delta.days,
        delta.days // 30,
        delta.days // 365,
    ]
    date_f[10:14] = [date.isoweekday(), date.year, date.month, date.hour]
    return date_f

In [10]:
def get_addresses(line):
    """Extract lower-cased email addresses from a header value.

    The value is split on runs of ``,``, ``/``, ``<`` and ``>``. In every
    piece, whitespace and single quotes are removed; pieces shorter than
    eight characters are ignored, one leading and one trailing dot are
    trimmed, and the piece is kept only if it contains both ``@`` and ``.``.

    Returns an empty list when ``line`` is None or shorter than eight
    characters.
    """
    if line is None or len(line) < 8:
        return []

    found = []
    for chunk in re.split(r'[,/<>]+', line):
        candidate = ''.join(
            ch for ch in chunk if not (ch.isspace() or ch == '\''))
        if len(candidate) < 8:
            continue
        if candidate.startswith('.'):
            candidate = candidate[1:]
        if candidate.endswith('.'):
            candidate = candidate[:-1]
        if '@' in candidate and '.' in candidate:
            found.append(candidate.lower())
    return found

In [11]:
# Header names that are scanned for email addresses (passed to extract_text
# and looped over when the per-email feature sequences are built).
fields = ["To", "From", "Cc", "Bcc", 'X-From', 'X-To', 'X-cc', 'X-bcc', 'X-Origin',]

## Emails

In [83]:
# Load the emails and build one label (the category name) per email,
# in the same category-by-category order as `data`.
categories, data = load_emails("enron")
num_classes = len(categories)
labels = []
for category, mails in zip(categories, data):
    labels.extend([category] * len(mails))

In [None]:
# Write the email bodies and addresses to the "extracted_text" and "addresses" files.
extract_text(data, fields)

In [70]:
# Load the embedding tables from disk: word2vec is looked up by word and
# adr2vec by email address when the feature sequences are built.
word2vec = read_embedding("w2v")
adr2vec = read_embedding("adr2v")

In [84]:
# Replace email content with vectors: every email becomes one float32 array
# whose rows are, in order, the vectors of its body words, subject words and
# addresses, followed by one row of date features.
features_vec = []
for direct in data:
    for email in direct:
        seq = []

        # Body: keep only alphabetic/whitespace characters, then look up each
        # lower-cased word; words missing from the embedding are skipped.
        em = email.get_payload()
        cleared = ''.join([i for i in em if i.isalpha() or i.isspace()])
        for word in cleared.split():
            cbow = word2vec.get(word.lower())
            if cbow is not None:
                seq.append(cbow)

        # Subject: email["Subject"] is None when the header is absent, so
        # fall back to an empty string instead of failing on None.lower().
        subj = (email["Subject"] or "").lower().split(' ')
        for word in subj:
            cbow = word2vec.get(word)
            if cbow is not None:
                seq.append(cbow)

        # Add addresses as features
        for field in fields:
            addresses = get_addresses(email[field])
            for adr in addresses:
                cbow = adr2vec.get(adr)
                if cbow is not None:
                    seq.append(cbow)

        # Date
        seq.append(get_date(email))

        features_vec.append(np.array(seq, dtype="float32"))

In [85]:
# Shuffle features and labels with one shared random permutation so that
# every feature array stays paired with its label, then one-hot encode the
# shuffled labels.
index_shuf = list(range(len(labels)))
random.shuffle(index_shuf)
l = [labels[i] for i in index_shuf]
f = [features_vec[i] for i in index_shuf]
labels = one_hot_encode(l)
features_vec = f

In [86]:
# Split dataset for training and testing: the first n samples are kept for
# training and everything from index n on is held out for testing.
n = len(labels) // 10 * 8
test_l = labels[n:]
labels = labels[:n]
test_v = features_vec[n:]
features_vec = features_vec[:n]

## Articles

In [60]:
# Reads the article data, shuffles both splits and preprocesses the labels
num_classes = 20
train, test = load_articles()
for corpus in (train, test):
    random.shuffle(corpus)

features, labels = make_dataset(train)
test_f, test_l = make_dataset(test)

# One-hot encode the labels of each split
labels, test_l = one_hot_encode(labels), one_hot_encode(test_l)

In [62]:
# Replaces words by their vector representations

# vector_len is not defined at notebook level (it only exists as a local
# inside read_embedding), so derive it from the loaded table. Taking the
# longest entry keeps the value correct even if the table holds a short
# stray row.
vector_len = max(len(vec) for vec in word2vec.values())


def _vectorize(documents):
    """Turn each token list into a float32 array of word vectors.

    Words missing from ``word2vec`` are replaced by a zero vector of length
    ``vector_len``.
    """
    unknown = [0] * vector_len
    return [
        np.array([word2vec.get(word, unknown) for word in doc], dtype="float32")
        for doc in documents
    ]


features_vec = _vectorize(features)
test = _vectorize(test_f)
# The evaluation cell reads `test_v`; expose the article test vectors under
# that name as well so it does not pick up a stale value from another run.
test_v = test

## CNTK 

In [87]:
# Defines the LSTM model for classifying sequences
def LSTM_sequence_classifier_net(input, num_output_classes, embedding_dim,
                                LSTM_dim, cell_dim):
    """Apply an embedding -> LSTM -> last-step -> dense stack to ``input``.

    The layers are chained with ``Sequential`` in this order:
    ``Embedding(embedding_dim)``, ``Recurrence(LSTM(LSTM_dim, cell_dim))``,
    ``sequence.last`` and ``Dense(num_output_classes)``. The result of
    calling that stack on ``input`` is returned.
    """
    layers = [
        Embedding(embedding_dim),
        Recurrence(LSTM(LSTM_dim, cell_dim)),
        sequence.last,
        Dense(num_output_classes),
    ]
    return Sequential(layers)(input)

In [88]:
# Builds the LSTM sequence classification model together with its loss,
# error metric, learner and trainer.
input_dim = cell_dim = hidden_dim = 100
embedding_dim = 200
num_output_classes = num_classes

# Input variables denoting the features and label data
features = sequence.input_variable(shape=input_dim)
label = input_variable(num_output_classes)

# Instantiate the sequence classification model
classifier_output = LSTM_sequence_classifier_net(
    features, num_output_classes, embedding_dim, hidden_dim, cell_dim)

# Training criterion (ce) and evaluation metric (pe)
ce = cross_entropy_with_softmax(classifier_output, label)
pe = classification_error(classifier_output, label)

lr_per_sample = learning_rate_schedule(0.0005, UnitType.sample)

# Instantiate the trainer object to drive the model training
progress_printer = ProgressPrinter(0)
learner = sgd(classifier_output.parameters, lr=lr_per_sample)
trainer = Trainer(classifier_output, (ce, pe), learner, progress_printer)

 average      since    average      since      examples
    loss       last     metric       last              
 ------------------------------------------------------


In [90]:
# Get minibatches of sequences to train with and perform model training
minibatch_size = 200
for j in range(50):
    # Step through the whole training set, final (possibly shorter) minibatch
    # included. The previous `while (i + minibatch_size) < len(labels)`
    # condition skipped the last minibatch on every pass -- even a full one
    # when the sample count was an exact multiple of minibatch_size.
    for i in range(0, len(labels), minibatch_size):
        input_map = {
            features : features_vec[i : i + minibatch_size],
            label : labels[i : i + minibatch_size]
        }
        trainer.train_minibatch(input_map)

# Metric and loss averages reported by the trainer for the last minibatch
evaluation_average = float(trainer.previous_minibatch_evaluation_average)
loss_average = float(trainer.previous_minibatch_loss_average)

     1.32       1.62      0.454        0.6        204600


In [91]:
# Test the model: count the held-out samples whose predicted class (argmax
# of the network output) differs from the true class (argmax of the one-hot
# label) and print the resulting error rate.
n = len(test_v)
e = sum(
    1
    for i in range(n)
    if np.argmax(classifier_output.eval(test_v[i])[0]) != np.argmax(test_l[i])
)
print(e/n)

0.5930232558139535


In [76]:
#trainer.save_checkpoint('saved_trainers/nn.trainer_emails_text') 

In [23]:
# Sanity check: number of rows in labels and number of per-category email lists in data.
print(len(labels), len(data))

2112 13


In [79]:
#trainer.restore_from_checkpoint('saved_trainers/nn.trainer_emails_text')

{}