In [60]:
import sys
import os
import numpy as np
import random
from cntk import Trainer, Axis
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs,\
        INFINITELY_REPEAT
from cntk.learners import sgd, learning_rate_schedule, UnitType
from cntk import input_variable, cross_entropy_with_softmax, \
        classification_error, sequence
from cntk.logging import ProgressPrinter
from cntk.layers import Sequential, Embedding, Recurrence, LSTM, Dense
from sklearn import preprocessing
from email.parser import Parser

In [2]:
def read_w3():
    """Read the williams-w3 email corpus from the ``williams-w3`` directory.

    Every non-hidden entry of ``williams-w3`` is treated as a category
    directory and every file inside it is parsed as an email message.

    Returns:
        tuple: ``(categories, emails)`` where ``categories`` is the sorted list
        of category directory names and ``emails[i]`` is the list of parsed
        messages found in ``categories[i]``.
    """
    root = "williams-w3"
    # Build a filtered list instead of calling remove() on the list that is
    # being iterated: removing during iteration skips the entry that follows
    # each removed one, so two consecutive hidden entries let the second
    # through. Sorting makes the category order independent of the file system.
    categories = sorted(c for c in os.listdir(root) if not c.startswith('.'))

    parser = Parser()
    emails = []
    for direc in categories:
        messages = []
        for name in os.listdir(root + "/" + direc):
            # Close every file right after parsing; the corpus can hold
            # thousands of messages and open handles would pile up.
            with open(root + "/" + direc + "/" + name) as handle:
                messages.append(parser.parse(handle))
        emails.append(messages)
    return categories, emails

In [3]:
def extract_text(data):
    """Dump the plain text of all emails to ``extracted_text_cl`` for CBOW training.

    Args:
        data: iterable of categories, each an iterable of parsed email
            messages exposing ``get_payload()``.

    Lines of at most one character and lines starting with ``>`` are skipped.
    From the remaining lines only alphabetic and whitespace characters are
    kept; the result is lower-cased and written one line per input line.
    """
    # The context manager guarantees the file is flushed and closed even if a
    # payload cannot be processed.
    with open("extracted_text_cl", 'w') as out:
        for clas in data:
            for email in clas:
                text = email.get_payload()
                for line in text.split('\n'):
                    if len(line) > 1 and line[0] != '>':
                        clear = ''.join(ch for ch in line if ch.isalpha() or ch.isspace())
                        # Terminate each line: split('\n') removed the
                        # separator, and without it the last word of one line
                        # is fused with the first word of the next.
                        out.write(clear.lower() + '\n')

In [5]:
def read_data_articles():
    """Load the raw lines of the articles training and test files.

    Returns:
        tuple: ``(train, test)``, each the list of lines (line terminators
        included) of the corresponding file under ``data/``.
    """
    paths = ("data/20ng-train-all-terms.txt", "data/20ng-test-all-terms.txt")
    with open(paths[0]) as train_file, open(paths[1]) as test_file:
        return train_file.readlines(), test_file.readlines()

In [6]:
def make_dataset(data):
    """Split article lines into token lists and labels.

    Each line is expected to look like ``"<label>\\t<word> <word> ..."``.

    Args:
        data: iterable of text lines, typically straight from ``readlines()``.

    Returns:
        tuple: ``(features, labels)`` where ``features[i]`` is the list of
        space-separated tokens after the first tab of line ``i`` and
        ``labels[i]`` is the text before that tab.
    """
    labels = []
    features = []

    for line in data:
        # Drop the line terminator first; otherwise it stays attached to the
        # last token ("word\n"), which then never matches a dictionary word.
        label, _, text = line.rstrip('\r\n').partition('\t')
        labels.append(label)
        features.append(text.split(' '))
    return features, labels

In [7]:
def one_hot_encode(labels):
    """Turn a sequence of class labels into a one-hot float32 matrix.

    The labels are mapped to integer ids with a scikit-learn ``LabelEncoder``.
    The width of the result is taken from the notebook-level ``num_classes``.

    Returns:
        numpy.ndarray: array of shape ``(len(labels), num_classes)`` holding a
        single 1 per row, at the column of that row's class id.
    """
    class_ids = preprocessing.LabelEncoder().fit_transform(labels)

    encoded = np.zeros((len(class_ids), num_classes), dtype="float32")
    for row, class_id in zip(encoded, class_ids):
        # `row` is a view into `encoded`, so this writes into the matrix.
        row[class_id] = 1
    return encoded

In [8]:
# Read the CBOW embedding file and convert it to a {word: vector} dictionary.
# The first line is a header "<dictionary size> <vector length>"; every other
# line is "<word> <component> <component> ...".
path_w2v = "w2v_3"
with open(path_w2v) as file_w2v:
    lines = file_w2v.readlines()

# Pop the header from `lines` itself. Popping from the slice `lines[:-1]` only
# modified a temporary copy, so the header was later parsed as if it were a
# word and ended up as a bogus entry in the dictionary.
dictionary, vector_len = lines.pop(0).split()
dictionary_len = int(dictionary)
vector_len = int(vector_len)

word2vec = {}
for l in lines:
    # split() ignores the line terminator and stray blanks, so the last line
    # is handled correctly even when the file does not end with a newline.
    arr = l.split()
    if not arr:
        continue  # blank line
    w = arr[0]
    # The components are kept as strings; they are converted when the
    # sequences are packed with np.array(..., dtype="float32").
    word2vec[w] = arr[1:]

## Emails

In [168]:
# Load the williams-w3 corpus: `categories` holds the category names and
# `data[i]` the parsed emails of `categories[i]`.
categories, data = read_w3()
# Uncomment to regenerate the plain-text dump used for CBOW training.
#extract_text(data)
# Number of target classes; read by one_hot_encode() and by the model cell.
num_classes = len(categories)

In [169]:
# One label (the category name) per email, in the order `data` is traversed.
labels = []
for i, category in enumerate(categories):
    labels.extend([category] * len(data[i]))

In [170]:
# Replace email content (body words followed by subject words) with vectors
def _text_to_vectors(text):
    """Return the word2vec vectors of the known words of `text`, in order.

    Characters that are neither alphabetic nor whitespace are removed and the
    words are lower-cased before the lookup; words missing from `word2vec`
    are skipped.
    """
    cleared = ''.join(ch for ch in text if ch.isalpha() or ch.isspace())
    vectors = []
    for word in cleared.split():
        cbow = word2vec.get(word.lower())
        if cbow is not None:
            vectors.append(cbow)
    return vectors


features_vec = []
for direct in data:
    for email in direct:
        seq = _text_to_vectors(email.get_payload())

        # Add the subject word by word. Looking the whole subject up as one
        # key could only ever match single-word subjects, and a message
        # without a Subject header (None) raised AttributeError on .lower().
        seq += _text_to_vectors(email["Subject"] or "")

        if not seq:
            # No known word at all: use a single zero vector with the same
            # length as the embeddings so the sequence is not empty.
            seq = [[0] * vector_len]
        features_vec.append(np.array(seq, dtype="float32"))

In [171]:
# Add additional features: sender and recipient identity.
# Every distinct 'From' / 'To' header value gets a running integer id. The id
# is one-hot encoded in a (100, k) grid (row = id % 100, column = id // 100),
# and the transposed grids are prepended as extra rows to the email's
# sequence. The 100 must therefore equal the width of the word vectors.
# NOTE: this cell modifies features_vec in place; running it twice prepends
# the rows twice.
from_l = {}
to_l = {}
from_i = 0
to_i = 0
em = 0
for dirc in data:
    for email in dirc:
        who = email['From']
        to = email['To']
        if who not in from_l:
            from_l[who] = from_i
            from_i += 1
        if to not in to_l:
            to_l[to] = to_i
            to_i += 1
        vect1 = np.zeros((100, from_i//100 + 1), dtype="float32")
        vect1[from_l[who]%100][from_l[who]//100] = 1
        vect2 = np.zeros((100, to_i//100 + 1), dtype="float32")
        vect2[to_l[to]%100][to_l[to]//100] = 1
        # Keep the stacked result: it used to be discarded, and the next line
        # then failed with NameError because `vect` was never assigned.
        vect = np.hstack((vect1, vect2))
        features_vec[em] = np.vstack((vect.T, features_vec[em]))
        em += 1
        

#len(from_l.keys())

In [139]:
# Inspect which headers the first email of the first category carries.
data[0][0].keys()

['Message-ID',
 'Date',
 'From',
 'To',
 'Subject',
 'Cc',
 'Mime-Version',
 'Content-Type',
 'Content-Transfer-Encoding',
 'Bcc',
 'X-From',
 'X-To',
 'X-cc',
 'X-bcc',
 'X-Origin',
 'X-FileName']

In [172]:
# Shuffle features and labels with one shared random permutation, so every
# feature sequence stays aligned with its label.
index_shuf = list(range(len(labels)))
random.shuffle(index_shuf)
l = [labels[i] for i in index_shuf]
f = [features_vec[i] for i in index_shuf]
# From here on `labels` holds one-hot rows instead of category names.
labels = one_hot_encode(l)
features_vec = f

In [173]:
# Split dataset for training and testing: the first n samples are kept for
# training, the rest is held out for testing.
n = (len(labels) // 10) * 9
test_l = labels[n:]
labels = labels[:n]
test_v = features_vec[n:]
features_vec = features_vec[:n]

## Articles

In [122]:
# Load the articles, shuffle both splits and turn the labels into one-hot rows.
num_classes = 20
train, test = read_data_articles()
for split in (train, test):
    random.shuffle(split)

features, labels = make_dataset(train)
test_f, test_l = make_dataset(test)

labels, test_l = one_hot_encode(labels), one_hot_encode(test_l)

In [123]:
# Replace the words of the training articles by their vector representations;
# words missing from the dictionary become all-zero vectors.
features_vec = [
    np.array([word2vec.get(word, [0] * vector_len) for word in feat],
             dtype="float32")
    for feat in features
]

In [121]:
# Replace the words of the test articles by their vector representations;
# words missing from the dictionary become all-zero vectors.
# The result is stored in `test_v`, the name the evaluation cell reads. It
# used to be stored in `test`, which left `test_v` stale (or undefined) for
# the articles run and overwrote the raw test lines.
test_v = [
    np.array([word2vec.get(word, [0] * vector_len) for word in feat],
             dtype="float32")
    for feat in test_f
]

KeyboardInterrupt: 

## CNTK 

In [174]:
def LSTM_sequence_classifier_net(input, num_output_classes, embedding_dim,
                                LSTM_dim, cell_dim):
    """Build the LSTM model for classifying sequences and apply it to `input`.

    The model is a sequential stack: an embedding layer of size
    `embedding_dim`, a recurrence over an LSTM built from `LSTM_dim` and
    `cell_dim`, selection of the last step of the sequence, and a dense layer
    with `num_output_classes` outputs.

    Returns:
        The result of applying the stacked model to `input`.
    """
    layers = [
        Embedding(embedding_dim),
        Recurrence(LSTM(LSTM_dim, cell_dim)),
        sequence.last,
        Dense(num_output_classes),
    ]
    model = Sequential(layers)
    return model(input)

In [175]:
# Creates the LSTM sequence classification model and the trainer that will
# drive its training (the training loop itself lives in the next cell).
# NOTE(review): input_dim presumably has to match the word-vector length
# (vector_len) of the loaded embeddings -- confirm.
input_dim = 100
cell_dim = 100
hidden_dim = 100
embedding_dim = 200

# One output per class; num_classes is set by the dataset cell that was run.
num_output_classes = num_classes

# Input variables denoting the features and label data
# NOTE: this rebinds `features`, which the articles cell uses for token lists.
features = sequence.input_variable(shape=input_dim)
label = input_variable(num_output_classes)

# Instantiate the sequence classification model
classifier_output = LSTM_sequence_classifier_net(
        features, num_output_classes, embedding_dim, hidden_dim, cell_dim)

    
# Training loss (ce) and evaluation metric (pe), both against `label`.
ce = cross_entropy_with_softmax(classifier_output, label)
pe = classification_error(classifier_output, label)
 
# Constant learning rate of 0.001, expressed per sample.
lr_per_sample = learning_rate_schedule(0.001, UnitType.sample)
    
# Instantiate the trainer object to drive the model training
progress_printer = ProgressPrinter(0)
trainer = Trainer(classifier_output, (ce, pe),
                      sgd(classifier_output.parameters, lr=lr_per_sample),
                      progress_printer)



 average      since    average      since      examples
    loss       last     metric       last              
 ------------------------------------------------------


In [None]:
# Get minibatches of sequences to train with and perform model training
minibatch_size = 200
num_epochs = 50
for j in range(num_epochs):
    # range() with a step visits every sample, including the final (possibly
    # shorter) minibatch. The previous `while (i + minibatch_size) < len(labels)`
    # test never trained on the remainder and, when the data size was an exact
    # multiple of minibatch_size, skipped the last full minibatch as well.
    for i in range(0, len(labels), minibatch_size):
        input_map = {
            features : features_vec[i : i + minibatch_size],
            label : labels[i : i + minibatch_size]
        }
        trainer.train_minibatch(input_map)

# Loss and metric of the very last minibatch that was trained on.
evaluation_average = float(trainer.previous_minibatch_evaluation_average)
loss_average = float(trainer.previous_minibatch_loss_average)

In [None]:
# Test the model: print the fraction of misclassified held-out samples.
n = len(test_v)
e = 0
for i in range(n):
    l = np.argmax(test_l[i])
    # Feed the sample the same way the training loop does: as a list holding
    # one sequence, bound explicitly to the `features` input.
    # NOTE(review): a bare 2-D array handed to a sequence input may be read as
    # a batch of one-step sequences rather than one sequence -- confirm against
    # the CNTK data-format documentation. The list form is unambiguous.
    ar = classifier_output.eval({features: [test_v[i]]})[0]
    p = np.argmax(ar)
    if p != l:
        e += 1
# An empty test set has no defined error rate; avoid ZeroDivisionError.
print(e/n if n else float("nan"))

In [67]:
#trainer.save_checkpoint('nn.trainer_emails_2') 

In [69]:
#trainer.restore_from_checkpoint('nn.trainer_emails_2')

{}