In [1]:
import sys
import os
import numpy as np
from cntk import Trainer, Axis
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs,\
        INFINITELY_REPEAT
from cntk.learners import sgd, learning_rate_schedule, UnitType
from cntk import input_variable, cross_entropy_with_softmax, \
        classification_error, sequence
from cntk.logging import ProgressPrinter
from cntk.layers import Sequential, Embedding, Recurrence, LSTM, Dense
from email.parser import Parser
import random

In [2]:
def read_w3():
    categories = os.listdir("williams-w3")

    for c in categories:
        if c[0] == '.':
            categories.remove(c)

    emails = []
    for direc in categories:
        files = os.listdir("williams-w3/" + direc)
        f = []
        for file in files:
            file = open("williams-w3/" + direc + "/" + file)
            parser = Parser()
            email = parser.parse(file)
            f.append(email)
        emails.append(f)
    return categories, emails

In [3]:
def extract_text(data):
    # Extract plain text for CBOW
    file = open("extracted_text", 'w')
    for clas in data:
        for email in clas:
            em = email.get_payload()
            file.write(em) 

In [4]:
def read_data():
    train_p = "data/20ng-train-all-terms.txt"
    test_p = "data/20ng-test-all-terms.txt"

    f_train = open(train_p)
    f_test = open(test_p)
    
    train = f_train.readlines()
    test = f_test.readlines()
    
    f_train.close()
    f_test.close()
    return train, test

In [5]:
def make_dataset(data):
    n = 100
    labels = []
    features = []
    
    for n, line in enumerate(data, 1):
        l = line.partition('\t')
        labels.append(l[0])
        features.append(l[2].split(' '))
    return features, labels

In [6]:
def one_hot_encode(labels):
    #Reads labels and one hot encode them
    from sklearn import preprocessing
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    labels_arr = le.fit_transform(labels)

    labels = np.zeros((len(labels_arr), num_classes), dtype="float32")
    for i in range(len(labels_arr)):
        labels[i][labels_arr[i]] = 1
    return labels

In [22]:
# Reads CBOW embeding file and converts it to a dictionary 
path_w2v = "w2v_2"
file_w2v = open(path_w2v)
lines = file_w2v.readlines()
word2vec = {}
dictionary, vector_len = lines[:-1].pop(0).split(' ')
dictionary_len = int(dictionary)
vector_len = int(vector_len)
for l in lines:
    arr = l[:-1].split(' ')
    w = arr[0]
    arr = arr[1:] #list(map(float, arr[1:]))
    word2vec[w] = arr

## Emails

In [50]:
categories, data = read_w3()
extract_text(data)
num_classes = len(categories)

In [51]:
labels = []
for i in range(len(categories)):
    labels += [categories[i]] * len(data[i])
labels = one_hot_encode(labels)

In [52]:
features_vec = []
for direct in data:
    for email in direct:
        seq = []
        for word in email.get_payload():
            cbow = word2vec.get(word)
            if cbow != None:
                seq += [cbow]
        features_vec.append(np.array(seq, dtype="float32"))

In [53]:
# Shuffle features and labels
combined = list(zip(features_vec, labels))
random.shuffle(combined)
features_vec, labels = zip(*combined)

## articles

In [39]:
num_classes = 20
train, test = read_data()
random.shuffle(train)
random.shuffle(test)
features, labels = make_dataset(train)
test_f, test_l = make_dataset(test)

labels = one_hot_encode(labels)
test_l = one_hot_encode(test_l)

In [40]:
# Replaces words by their vector representations 
features_vec = []
for feat in features:
    seq = []
    for word in feat:
        seq += [word2vec.get(word, [0] * vector_len)]
    features_vec.append(np.array(seq, dtype="float32"))

In [26]:
test = []
for feat in test_f:
    seq = []
    for word in feat:
        seq += [word2vec.get(word, [0] * vector_len)]
    test.append(np.array(seq, dtype="float32"))

In [54]:
# Defines the LSTM model for classifying sequences
def LSTM_sequence_classifier_net(input, num_output_classes, embedding_dim,
                                LSTM_dim, cell_dim):
    lstm_classifier = Sequential([Embedding(embedding_dim),
                                  Recurrence(LSTM(LSTM_dim, cell_dim)),
                                  sequence.last,
                                  Dense(num_output_classes)])
    return lstm_classifier(input)

In [55]:
# Creates and trains a LSTM sequence classification model
input_dim = 100
cell_dim = 100
hidden_dim = 100
embedding_dim = 200

num_output_classes = num_classes

# Input variables denoting the features and label data
features = sequence.input_variable(shape=input_dim)
label = input_variable(num_output_classes)

# Instantiate the sequence classification model
classifier_output = LSTM_sequence_classifier_net(
        features, num_output_classes, embedding_dim, hidden_dim, cell_dim)

    
ce = cross_entropy_with_softmax(classifier_output, label)
pe = classification_error(classifier_output, label)
 
lr_per_sample = learning_rate_schedule(0.01, UnitType.sample)
    
# Instantiate the trainer object to drive the model training
progress_printer = ProgressPrinter(0)
trainer = Trainer(classifier_output, (ce, pe),
                      sgd(classifier_output.parameters, lr=lr_per_sample),
                      progress_printer)

 average      since    average      since      examples
    loss       last     metric       last              
 ------------------------------------------------------


In [58]:
# Get minibatches of sequences to train with and perform model training
minibatch_size = 2
for j in range(50):
    i = 0
    while (i + minibatch_size) < len(labels):
        input_map = {
            features : features_vec[i : i + minibatch_size], 
            label : labels[i : i + minibatch_size]
        }
        trainer.train_minibatch(input_map)
        i += minibatch_size

evaluation_average = float(trainer.previous_minibatch_evaluation_average)
loss_average = float(trainer.previous_minibatch_loss_average)

ValueError: if you specify sequence begin markers, it needs to be a list

In [96]:
n = len(test)
e = 0
for i in range(n//10):
    l = np.argmax(test_l[i])
    ar = classifier_output.eval(test[i])[0]
    p = np.argmax(ar)
    #print(l, p)
    #print(l, p, ar)
    if p != l:
        e += 1
print(e/n)#/len(labels))

0.09086078639744952


In [80]:
#trainer.save_checkpoint('nn.trainer_2') #50 50 100

In [95]:
#trainer.save_checkpoint('nn.trainer_3') #100 100 200 = 20

In [None]:
trainer.restore_from_checkpoint('nn.trainer_3')

In [None]:
trainer

In [None]:
print(evaluation_average, loss_average)

In [44]:
classifier_output.eval(features_vec[2])

array([[  1.20011242e-02,   1.70068850e-03,  -5.80662638e-02,
          1.85028054e-02,  -1.82034522e-02,   2.89715324e-02,
          5.20967022e-02,  -2.37078443e-02,  -7.60851009e-03,
          1.17672898e-01,   1.14210136e-02,  -1.70385838e-02,
         -2.39737006e-03,   5.10595366e-02,   2.33134385e-02,
          5.56765730e-03,   1.03521277e-04,  -4.58391495e-02,
          3.86228971e-02,   4.54982556e-02]], dtype=float32)

In [60]:
labels[0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.], dtype=float32)

In [59]:
features_vec[8]

array([[  1.02802059e-02,   1.12944457e-03,  -6.75620162e-04, ...,
          5.55906864e-03,   7.63915386e-03,   1.21494848e-02],
       [  1.17571997e-02,   5.94067480e-03,  -4.15221462e-03, ...,
          1.92801822e-02,  -7.28491228e-03,  -2.34198477e-02],
       [ -7.70202041e-01,  -8.02855670e-01,   6.06540859e-01, ...,
          2.25412631e+00,  -1.47566736e-01,  -8.22424650e-01],
       ..., 
       [  1.71973370e-02,   1.40202874e-02,   3.18695768e-03, ...,
          3.25706266e-02,  -2.74990071e-02,  -5.30865043e-02],
       [  6.12330716e-03,   1.05306841e-02,   7.69261550e-03, ...,
          1.50847556e-02,  -7.48760300e-03,  -2.22821608e-02],
       [  1.40346721e-01,   3.81773524e-02,  -5.00198156e-02, ...,
          1.59575596e-01,   7.95756374e-03,  -8.64180103e-02]], dtype=float32)