In [1]:
import sys
import os
import numpy as np
from cntk import Trainer, Axis
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs,\
        INFINITELY_REPEAT
from cntk.learners import sgd, learning_rate_schedule, UnitType
from cntk import input_variable, cross_entropy_with_softmax, \
        classification_error, sequence
from cntk.logging import ProgressPrinter
from cntk.layers import Sequential, Embedding, Recurrence, LSTM, Dense
from email.parser import Parser
import random

In [2]:
def read_w3():
    """Load every e-mail under ``williams-w3/``, grouped by category folder.

    Each entry of ``williams-w3`` whose name does not start with a dot is
    treated as a category directory, and every file inside it is parsed with
    ``email.parser.Parser``.

    Returns:
        tuple: ``(categories, emails)`` where ``categories`` is the list of
        category directory names and ``emails[k]`` is the list of parsed
        messages found in ``categories[k]``.
    """
    root = "williams-w3"
    # Build a filtered list instead of calling remove() while iterating:
    # removing during iteration skips the entry that follows each removed one,
    # so two adjacent dot-entries would leave the second one in the list.
    categories = [name for name in os.listdir(root) if not name.startswith('.')]

    emails = []
    for direc in categories:
        folder = os.path.join(root, direc)
        parsed = []
        for file_name in os.listdir(folder):
            # parse() consumes the whole file, so the handle can be closed
            # as soon as the message object has been built.
            with open(os.path.join(folder, file_name)) as handle:
                parsed.append(Parser().parse(handle))
        emails.append(parsed)
    return categories, emails

In [53]:
def extract_text(data):
    """Write cleaned, lower-cased e-mail body text to ``extracted_text_cl``.

    The output is the plain-text corpus used to train the CBOW embeddings.
    For every message, each body line longer than one character that does
    not start with ``>`` is stripped of everything except letters and
    whitespace, lower-cased and written on its own line.

    Args:
        data: iterable of classes, each an iterable of message objects whose
            ``get_payload()`` returns the body as a string.
    """
    # The context manager guarantees the corpus is flushed and closed.
    with open("extracted_text_cl", 'w') as out:
        for clas in data:
            for email in clas:
                text = email.get_payload()
                for line in text.split('\n'):
                    # Skip empty / one-character lines and quoted replies.
                    if len(line) <= 1 or line[0] == '>':
                        continue
                    clear = ''.join(ch for ch in line if ch.isalpha() or ch.isspace())
                    # split('\n') removed the separator; write it back so the
                    # last word of this line is not fused with the first word
                    # of the next one.
                    out.write(clear.lower() + '\n')

In [54]:
# Export a single message (fourth e-mail of the second category) as a quick check.
# NOTE: `data` is assigned by the cell that calls read_w3(), which must run first.
extract_text([[data[1][3]]])

In [4]:
def read_data():
    """Read the raw lines of the article train and test files.

    Returns:
        tuple: ``(train, test)``, each the list of lines returned by
        ``readlines()`` (line terminators are kept).
    """
    train_p = "data/20ng-train-all-terms.txt"
    test_p = "data/20ng-test-all-terms.txt"

    # Context managers close both handles even if opening or reading the
    # second file raises, which the manual close() calls did not cover.
    with open(train_p) as f_train, open(test_p) as f_test:
        train = f_train.readlines()
        test = f_test.readlines()
    return train, test

In [5]:
def make_dataset(data):
    """Split ``label<TAB>text`` lines into token lists and labels.

    Args:
        data: iterable of strings. Everything before the first tab is the
            label; everything after it is the space-separated document text.

    Returns:
        tuple: ``(features, labels)`` where ``features[k]`` is the list of
        tokens of the k-th non-blank line and ``labels[k]`` is its label.
    """
    labels = []
    features = []

    for line in data:
        # Lines read with readlines() keep their terminator; without stripping
        # it the last token would be "word\n" and never match a vocabulary key.
        line = line.rstrip('\r\n')
        if not line:
            # A blank line carries no label; skipping it avoids an empty class.
            continue
        label, _, text = line.partition('\t')
        labels.append(label)
        features.append(text.split(' '))
    return features, labels

In [6]:
def one_hot_encode(labels, n_classes=None):
    """One-hot encode a sequence of class labels.

    The distinct labels are sorted and numbered ``0..k-1``; every input label
    becomes a row with a single 1 in the column of its number.

    Args:
        labels: one-dimensional sequence of class labels (e.g. strings).
        n_classes: width of the encoding. When omitted, the notebook-level
            ``num_classes`` variable is used.

    Returns:
        numpy.ndarray: float32 array of shape ``(len(labels), n_classes)``.

    Raises:
        IndexError: if ``labels`` holds more distinct values than ``n_classes``.
    """
    if n_classes is None:
        n_classes = num_classes
    # np.unique sorts the distinct labels and return_inverse yields each
    # label's position in that sorted list: the same numbering a label encoder
    # produces, without a redundant fit pass or a function-local import.
    _, class_index = np.unique(np.asarray(labels), return_inverse=True)
    class_index = class_index.reshape(-1)

    encoded = np.zeros((len(class_index), n_classes), dtype="float32")
    encoded[np.arange(len(class_index)), class_index] = 1
    return encoded

In [58]:
# Reads the CBOW embedding file and converts it to a dictionary: word -> vector
path_w2v = "w2v_3"
with open(path_w2v) as file_w2v:
    # Strip only the line terminator; slicing off the last character would
    # damage a final line that has no trailing newline.
    lines = [raw.rstrip('\r\n') for raw in file_w2v]
word2vec = {}
# The first line is a header: "<vocabulary size> <vector length>"
dictionary, vector_len = lines[0].split()[:2]
dictionary_len = int(dictionary)
vector_len = int(vector_len)
# Start after the header so it is not stored as a bogus one-component "word".
for l in lines[1:]:
    arr = l.split()
    if not arr:
        continue  # ignore blank lines
    w = arr[0]
    # Parse the components once here instead of on every later lookup.
    arr = list(map(float, arr[1:]))
    word2vec[w] = arr

## Emails

In [60]:
# Load the parsed e-mails: data[k] holds the messages of categories[k]
categories, data = read_w3()
# One-off export of the text corpus for CBOW training; left disabled
#extract_text(data)
# One output class per category folder
num_classes = len(categories)

In [61]:
# One label per e-mail: repeat each category name once per message it contains
labels = [
    categories[k]
    for k in range(len(categories))
    for _ in range(len(data[k]))
]

In [62]:
# Replace email content with vectors: one (words, vector_len) array per e-mail
features_vec = []
for direct in data:
    for email in direct:
        em = email.get_payload()
        # Keep letters and whitespace only, mirroring the corpus clean-up
        cleared = ''.join([i for i in em if i.isalpha() or i.isspace()])
        seq = []
        for word in cleared.split():
            # Lower-case BEFORE the lookup: the embedding vocabulary was built
            # from lower-cased text, so capitalised words would never match.
            cbow = word2vec.get(word.lower())
            if cbow is not None:
                seq.append(cbow)
        if not seq:
            # No known word at all: fall back to a single all-zero vector so
            # the sequence is never empty.
            seq = [[0] * vector_len]
        features_vec.append(np.array(seq, dtype="float32"))

In [63]:
# Shuffle features and labels with one shared permutation so pairs stay aligned
index_shuf = list(range(len(labels)))
random.shuffle(index_shuf)
l = [labels[pos] for pos in index_shuf]
f = [features_vec[pos] for pos in index_shuf]
# Labels become a one-hot matrix; features stay a list of per-e-mail arrays
labels = one_hot_encode(l)
features_vec = f

## Articles

In [122]:
# Number of article classes (hard-coded); this overrides the value set in the
# e-mail section, and `labels` below replaces the e-mail labels as well.
num_classes = 20
train, test = read_data()
# Shuffle the raw lines in place before splitting them into tokens and labels
random.shuffle(train)
random.shuffle(test)
features, labels = make_dataset(train)
test_f, test_l = make_dataset(test)

# NOTE(review): each call fits its own encoder, so train and test class
# indices only agree if both splits contain the same set of labels - confirm.
labels = one_hot_encode(labels)
test_l = one_hot_encode(test_l)

In [123]:
# Replaces words by their vector representations; words missing from the
# embedding dictionary become an all-zero vector of length vector_len
features_vec = [
    np.array(
        [word2vec.get(token, [0] * vector_len) for token in document],
        dtype="float32",
    )
    for document in features
]

In [121]:
# Vectorise the test documents the same way as the training documents.
# `test` (the raw lines until now) is rebound to the list of per-document arrays.
test = []
for document in test_f:
    vectors = [word2vec.get(token, [0] * vector_len) for token in document]
    test.append(np.array(vectors, dtype="float32"))

KeyboardInterrupt: 

## CNTK 

In [64]:
# Defines the LSTM model for classifying sequences
def LSTM_sequence_classifier_net(input, num_output_classes, embedding_dim,
                                LSTM_dim, cell_dim):
    """Apply an Embedding -> LSTM recurrence -> last step -> Dense stack to ``input``.

    Args:
        input: the input the stacked layers are applied to.
        num_output_classes: output size of the final Dense layer.
        embedding_dim: size passed to the Embedding layer.
        LSTM_dim: first argument of the LSTM block.
        cell_dim: second argument of the LSTM block.

    Returns:
        The result of calling the Sequential model on ``input``.
    """
    layers = [
        Embedding(embedding_dim),
        Recurrence(LSTM(LSTM_dim, cell_dim)),
        # presumably reduces the sequence to its final step - confirm in CNTK docs
        sequence.last,
        Dense(num_output_classes),
    ]
    model = Sequential(layers)
    return model(input)

In [68]:
# Builds the LSTM sequence classification model, its loss/metric and the
# trainer; the training loop itself lives in a separate cell.
# NOTE(review): input_dim is presumably the embedding vector length
# (vector_len) - confirm they match.
input_dim = 100
cell_dim = 100
hidden_dim = 100
embedding_dim = 200

# The output width follows whichever dataset section set num_classes last
num_output_classes = num_classes

# Input variables denoting the features and label data
# NOTE: this rebinds `features` (the token lists in the articles section)
# to the model's sequence input variable.
features = sequence.input_variable(shape=input_dim)
label = input_variable(num_output_classes)

# Instantiate the sequence classification model
classifier_output = LSTM_sequence_classifier_net(
        features, num_output_classes, embedding_dim, hidden_dim, cell_dim)

    
# Training loss (ce) and reported error metric (pe)
ce = cross_entropy_with_softmax(classifier_output, label)
pe = classification_error(classifier_output, label)
 
# Learning rate of 0.001, expressed per sample
lr_per_sample = learning_rate_schedule(0.001, UnitType.sample)
    
# Instantiate the trainer object to drive the model training
progress_printer = ProgressPrinter(0)
trainer = Trainer(classifier_output, (ce, pe),
                      sgd(classifier_output.parameters, lr=lr_per_sample),
                      progress_printer)



 average      since    average      since      examples
    loss       last     metric       last              
 ------------------------------------------------------


In [73]:
# Get minibatches of sequences to train with and perform model training
minibatch_size = 200
for j in range(50):  # 50 passes over the training data
    # range() visits every batch start, so the trailing partial batch is
    # trained on too, as is the last full batch when the sample count is an
    # exact multiple of minibatch_size. A `while i + minibatch_size <
    # len(labels)` test would skip both. Slices past the end are clipped.
    for i in range(0, len(labels), minibatch_size):
        input_map = {
            features: features_vec[i : i + minibatch_size],
            label: labels[i : i + minibatch_size]
        }
        trainer.train_minibatch(input_map)

# Metric and loss of the very last minibatch that was trained
evaluation_average = float(trainer.previous_minibatch_evaluation_average)
loss_average = float(trainer.previous_minibatch_loss_average)

      0.3      0.179     0.0797     0.0522        409400


In [None]:
# Classification error rate on the first tenth of the test set
n = len(test)
n_eval = n // 10
e = 0
for i in range(n_eval):
    l = np.argmax(test_l[i])                  # true class index
    ar = classifier_output.eval(test[i])[0]   # model output for this sequence
    p = np.argmax(ar)                         # predicted class index
    if p != l:
        e += 1
# Divide by the number of samples actually evaluated: dividing by n would
# understate the error rate roughly tenfold.
print(e / n_eval if n_eval else float('nan'))

In [None]:
#trainer.train_minibatch({features: features_vec[:10], label: labels[:10]})

In [67]:
#trainer.save_checkpoint('nn.trainer_emails_2') 

In [69]:
# Restore the trainer state from the 'nn.trainer_emails_2' checkpoint
# (the matching save_checkpoint call is commented out in the notebook)
trainer.restore_from_checkpoint('nn.trainer_emails_2')

{}

In [59]:
# List the embedding vocabulary (every word that has a vector in word2vec)
key = list(word2vec.keys())
# NOTE(review): this displays the whole vocabulary; a slice such as key[:20]
# would keep the cell output short.
key

['communications',
 'competitive',
 'would',
 'isnt',
 'doing',
 'perform',
 'motley',
 'announce',
 'operation',
 'wrong',
 'scrimmage',
 'activity',
 'several',
 'corporate',
 'schedulestxterror',
 'working',
 'am',
 'serena',
 'agreeto',
 'recommendations',
 'cannot',
 'surprise',
 'girl',
 'welcome',
 'average',
 'finished',
 'behavior',
 'willbe',
 'motivated',
 'caroline',
 'therest',
 'absence',
 'subjected',
 'buys',
 'lobby',
 'cdtto',
 'theintended',
 'loon',
 'negotiation',
 'sarah',
 'brown',
 'megawatts',
 'screen',
 'valuable',
 'action',
 'dispatch',
 'brings',
 'liz',
 'drinks',
 'monitoring',
 'gregsubject',
 'labdate',
 'different',
 'mar',
 'interruptions',
 'location',
 'holiday',
 'zeroed',
 'apologize',
 'picked',
 'kimberly',
 'yes',
 'indicate',
 'gang',
 'up',
 'since',
 'diana',
 'brief',
 'largely',
 'gracesubject',
 'iso',
 'scope',
 'sept',
 'proud',
 'gmc',
 'discussion',
 'dayahead',
 'actually',
 'jan',
 'generators',
 'billi',
 'laura',
 'insights',
 'p

In [None]:
# Metric and loss of the last trained minibatch (set at the end of the training cell)
print(evaluation_average, loss_average)

In [44]:
# Evaluate the classifier on a single sequence (the third entry of features_vec)
classifier_output.eval(features_vec[2])

array([[  1.20011242e-02,   1.70068850e-03,  -5.80662638e-02,
          1.85028054e-02,  -1.82034522e-02,   2.89715324e-02,
          5.20967022e-02,  -2.37078443e-02,  -7.60851009e-03,
          1.17672898e-01,   1.14210136e-02,  -1.70385838e-02,
         -2.39737006e-03,   5.10595366e-02,   2.33134385e-02,
          5.56765730e-03,   1.03521277e-04,  -4.58391495e-02,
          3.86228971e-02,   4.54982556e-02]], dtype=float32)

In [124]:

# Sanity check: the one-hot label matrix is (number of samples, num_classes)
labels.shape

(11293, 20)

In [125]:
# Sanity check: one vectorised document is (number of words, vector length)
features_vec[4].shape

(43, 100)

In [126]:
# Peek at the first ten one-hot label rows
labels[:10]

array([[ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  

In [134]:
# Peek at the first ten one-hot label rows (the row width follows the
# num_classes in effect when the labels were encoded)
labels[:10]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,

In [127]:
# Peek at the first four vectorised documents; all-zero rows are words that
# had no entry in word2vec
features_vec[:4]

[array([[-0.0313445 ,  0.0363934 ,  0.01575427, ..., -0.07034989,
          0.03551885,  0.09688685],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ..., 
        [ 0.15445308, -0.01056654, -0.03674499, ...,  0.21335812,
         -0.03393506, -0.10598069],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]], dtype=float32),
 array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.0117572 ,  0.00594067, -0.00415221, ...,  0.01928018,
         -0.00728491, -0.02341985],
        ..., 
        [ 0.02011477,  0.02140048,  0

In [132]:
# Peek at the first four vectorised documents
features_vec[:4]

[array([[ 0.02879665,  0.00533477, -0.00380551, ...,  0.04646795,
         -0.00823378, -0.04457391],
        [ 0.01719734,  0.01402029,  0.00318696, ...,  0.03257063,
         -0.02749901, -0.0530865 ],
        [ 0.02879665,  0.00533477, -0.00380551, ...,  0.04646795,
         -0.00823378, -0.04457391],
        ..., 
        [ 0.00612331,  0.01053068,  0.00769262, ...,  0.01508476,
         -0.0074876 , -0.02228216],
        [ 0.03669428,  0.01301391, -0.00050696, ...,  0.08865716,
         -0.02355903, -0.10697225],
        [-0.00568012,  0.00588597,  0.00108825, ...,  0.00224834,
         -0.00604241, -0.00475143]], dtype=float32),
 array([[  2.14639366e-01,   3.11134234e-02,   6.25165999e-02, ...,
           1.10486603e+00,  -4.75843906e-01,  -6.72918379e-01],
        [ -7.70202041e-01,  -8.02855670e-01,   6.06540859e-01, ...,
           2.25412631e+00,  -1.47566736e-01,  -8.22424650e-01],
        [  6.12330716e-03,   1.05306841e-02,   7.69261550e-03, ...,
           1.50847556e-02