In [2]:
import sys
import os
import numpy as np
from cntk import Trainer, Axis
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs,\
        INFINITELY_REPEAT
from cntk.learners import sgd, learning_rate_schedule, UnitType
from cntk import input_variable, cross_entropy_with_softmax, \
        classification_error, sequence
from cntk.logging import ProgressPrinter
from cntk.layers import Sequential, Embedding, Recurrence, LSTM, Dense

In [3]:
num_classes = 20

In [4]:
def read_data():
    train_p = "data/20ng-train-all-terms.txt"
    test_p = "data/20ng-test-all-terms.txt"

    f_train = open(train_p)
    f_test = open(test_p)
    
    train = f_train.readlines()
    test = f_test.readlines()
    
    f_train.close()
    f_test.close()
    return train, test

In [5]:
def make_dataset(train, test):
    labels = []
    features = []
    for n, line in enumerate(train, 1):
        l = line.partition('\t')
        labels.append(l[0])
        features.append(l[2].split(' '))
    return features, labels

    test_y = []
    test_x = []
    for n, line in enumerate(test, 1):
        l = line.partition('\t')
        test_y.append(l[0])
        test_x.append(l[2].split(' '))
    return features, label

In [None]:
train, test = read_data()
features, label_names = make_dataset(train, test)

#Reads labels and one hot encode them
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(label_names)
labels_arr = le.fit_transform(label_names)
labels = labels_arr

labels = np.zeros((len(labels_arr), num_classes))
for i in range(len(labels_arr)):
    labels[i][labels_arr[i]] = 1

In [17]:
# Reads CBOW embeding file and converts it to a dictionary 
path_w2v = "word2vect_dict"
file_w2v = open(path_w2v)
lines = file_w2v.readlines()
word2vec = {}
dictionary, vector_len = lines[:-1].pop(0).split(' ')
dictionary_len = int(dictionary)
vector_len = int(vector_len)
for l in lines:
    arr = l[:-1].split(' ')
    w = arr[0]
    arr = list(map(float, arr[1:]))
    word2vec[w] = arr

In [18]:
# Replaces words by their vector representations 
features_vec = []
for feat in features:
    seq = []
    for word in feat:
        seq += [word2vec.get(word, [0] * vector_len)]
    features_vec.append(np.array(seq, dtype="float32"))

In [19]:
# Defines the LSTM model for classifying sequences
def LSTM_sequence_classifier_net(input, num_output_classes, embedding_dim,
                                LSTM_dim, cell_dim):
    lstm_classifier = Sequential([Embedding(embedding_dim),
                                  Recurrence(LSTM(LSTM_dim, cell_dim)),
                                  sequence.last,
                                  Dense(num_output_classes)])
    return lstm_classifier(input)

In [22]:
# Creates and trains a LSTM sequence classification model
input_dim = 100
cell_dim = 100
hidden_dim = 100
embedding_dim = 200

num_output_classes = 20

# Input variables denoting the features and label data
features = sequence.input_variable(shape=input_dim)
label = input_variable(num_output_classes)

# Instantiate the sequence classification model
classifier_output = LSTM_sequence_classifier_net(
        features, num_output_classes, embedding_dim, hidden_dim, cell_dim)

    
ce = cross_entropy_with_softmax(classifier_output, label)
pe = classification_error(classifier_output, label)
 
input_map = {
        features: features_vec,
        label:    labels
}    
    
lr_per_sample = learning_rate_schedule(0.0005, UnitType.sample)
    
# Instantiate the trainer object to drive the model training
progress_printer = ProgressPrinter(0)
trainer = Trainer(classifier_output, (ce, pe),
                      sgd(classifier_output.parameters, lr=lr_per_sample),
                      progress_printer)

# Get minibatches of sequences to train with and perform model training
minibatch_size = 200


for i in range(0, minibatch_size*10, minibatch_size):
    trainer.train_minibatch({features : features_vec[i:i+minibatch_size], label : labels[i:i+minibatch_size]})

    #trainer.train_minibatch(input_map)

evaluation_average = float(trainer.previous_minibatch_evaluation_average)
loss_average = float(trainer.previous_minibatch_loss_average)


 average      since    average      since      examples
    loss       last     metric       last              
 ------------------------------------------------------


  (sample.dtype, var.uid, str(var.dtype)))


Learning rate per sample: 0.0005
      2.9        2.9      0.805      0.805           200
     2.63        2.5      0.482       0.32           600
     2.53       2.46      0.701      0.865          1400


In [24]:
print(evaluation_average, loss_average)

1.0 2.2248074340820314


In [25]:
i = 1000
trainer.test_minibatch({features : features_vec[i:i+minibatch_size], label : labels[i:i+minibatch_size]})

  (sample.dtype, var.uid, str(var.dtype)))


1.0

In [None]:
trainer.

In [4]:
def create_reader(path, is_training, input_dim, label_dim):
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=label_dim, is_sparse=False)
        )), randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else 1)