In [19]:
import sys
import os
import numpy as np
from cntk import Trainer, Axis
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs,\
        INFINITELY_REPEAT
from cntk.learners import sgd, learning_rate_schedule, UnitType
from cntk import input_variable, cross_entropy_with_softmax, \
        classification_error, sequence
from cntk.logging import ProgressPrinter
from cntk.layers import Sequential, Embedding, Recurrence, LSTM, Dense

In [2]:
def read_data():
    """Load the raw train and test files as lists of lines.

    The two files are read from fixed paths relative to the current working
    directory.

    Returns:
        tuple: ``(train, test)`` where each element is the list produced by
        ``readlines()`` on the corresponding file (line terminators are kept).

    Raises:
        OSError: if either file cannot be opened or read.
    """
    train_p = "data/20ng-train-all-terms.txt"
    test_p = "data/20ng-test-all-terms.txt"

    # The context manager closes both handles even when opening the second
    # file or reading either one raises, which the manual close() calls did not.
    with open(train_p) as f_train, open(test_p) as f_test:
        train = f_train.readlines()
        test = f_test.readlines()
    return train, test

In [29]:
# Load the raw lines, then split the training lines into token lists and labels.
train, test = read_data()
features, label = make_dataset(train, test)
label = np.array(label)
# Token lists can differ in length from one document to the next, which makes
# the nested list ragged. NumPy >= 1.24 raises ValueError for ragged input
# unless dtype=object is requested explicitly.
features = np.array(features, dtype=object)

In [24]:
def make_dataset(train, test):
    """Split raw tab-separated lines into token lists and labels.

    Each training line is split at its first tab: the part before it is the
    label, the part after it is split on single spaces into tokens.

    Args:
        train: iterable of text lines to parse.
        test: accepted so existing callers keep working; it does not
            contribute to the return value.

    Returns:
        tuple: ``(features, label)`` where ``features`` is a list of token
        lists and ``label`` is the list of label strings, both in the order
        of ``train``.
    """
    label = []
    features = []
    for line in train:
        # Drop the line terminator first; otherwise the last token of every
        # document would carry a trailing newline and differ from the same
        # word appearing mid-line.
        name, _, text = line.rstrip('\r\n').partition('\t')
        label.append(name)
        features.append(text.split(' '))
    return features, label

In [30]:
# Display the first parsed label as a quick sanity check.
label[0]

'alt.atheism'

In [4]:
def create_reader(path, is_training, input_dim, label_dim):
    """Build a minibatch source over a CTF-format file.

    Args:
        path: location of the file handed to ``CTFDeserializer``.
        is_training: when true the source is randomized and repeats
            indefinitely; otherwise it is read in order for a single sweep.
        input_dim: shape of the sparse ``x`` stream (exposed as ``features``).
        label_dim: shape of the dense ``y`` stream (exposed as ``labels``).

    Returns:
        The configured ``MinibatchSource``.
    """
    stream_defs = StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=label_dim, is_sparse=False),
    )
    deserializer = CTFDeserializer(path, stream_defs)

    if is_training:
        sweeps = INFINITELY_REPEAT
    else:
        sweeps = 1

    return MinibatchSource(deserializer, randomize=is_training,
                           max_sweeps=sweeps)

In [5]:
# Builds the LSTM network used to classify whole sequences
def LSTM_sequence_classifier_net(input, num_output_classes, embedding_dim,
                                LSTM_dim, cell_dim):
    """Apply an embedding -> LSTM recurrence -> last step -> dense stack.

    Args:
        input: the variable the layer stack is applied to.
        num_output_classes: size passed to the final ``Dense`` layer.
        embedding_dim: size passed to the ``Embedding`` layer.
        LSTM_dim: first argument passed to ``LSTM``.
        cell_dim: second argument passed to ``LSTM``.

    Returns:
        The result of applying the layer stack to ``input``.
    """
    layers = [
        Embedding(embedding_dim),
        Recurrence(LSTM(LSTM_dim, cell_dim)),
        # Keep only the final step of the recurrence output.
        sequence.last,
        Dense(num_output_classes),
    ]
    model = Sequential(layers)
    return model(input)

In [35]:
# Creates and trains a LSTM sequence classification model
def train_sequence_classifier(data_path=None):
    """Build the LSTM sequence classifier and train it on a CTF file.

    Args:
        data_path: optional path to the training CTF file. When omitted, the
            default relative location
            ``../../../Tests/EndToEndTests/Text/SequenceClassification/Data/Train.ctf``
            is resolved against the directory of the module-level ``__file__``
            if that name exists, otherwise against the current working
            directory.

    Returns:
        tuple: ``(evaluation_average, loss_average)`` as floats, read from the
        trainer after the last minibatch.
    """
    input_dim = 2000
    cell_dim = 25
    hidden_dim = 25
    embedding_dim = 50
    num_output_classes = 20

    # Input variables denoting the features and label data
    features = sequence.input_variable(shape=input_dim, is_sparse=True)
    label = input_variable(num_output_classes)

    # Instantiate the sequence classification model
    classifier_output = LSTM_sequence_classifier_net(
        features, num_output_classes, embedding_dim, hidden_dim, cell_dim)

    ce = cross_entropy_with_softmax(classifier_output, label)
    pe = classification_error(classifier_output, label)

    if data_path is None:
        rel_path = ("../../../Tests/EndToEndTests/Text/" +
                    "SequenceClassification/Data/Train.ctf")
        # `__file__` is only defined when this code runs from a script or
        # module; looking it up through globals() avoids a NameError in
        # environments (such as a notebook kernel) that do not set it.
        anchor = globals().get("__file__")
        if anchor is not None:
            base_dir = os.path.dirname(os.path.abspath(anchor))
        else:
            base_dir = os.getcwd()
        path = os.path.join(base_dir, rel_path)
    else:
        path = data_path

    reader = create_reader(path, True, input_dim, num_output_classes)

    # Bind the network inputs to the reader's streams
    input_map = {
            features: reader.streams.features,
            label:    reader.streams.labels
    }

    lr_per_sample = learning_rate_schedule(0.0005, UnitType.sample)
    # Instantiate the trainer object to drive the model training
    progress_printer = ProgressPrinter(0)
    trainer = Trainer(classifier_output, (ce, pe),
                      sgd(classifier_output.parameters, lr=lr_per_sample),
                      progress_printer)

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 200

    for _ in range(255):
        mb = reader.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(mb)

    evaluation_average = float(trainer.previous_minibatch_evaluation_average)
    loss_average = float(trainer.previous_minibatch_loss_average)
    return evaluation_average, loss_average

In [37]:
# train_sequence_classifier builds its training-data path from the directory
# of `__file__`, so only the directory part ("data") of this value matters.
# NOTE(review): assigning `__file__` by hand looks like a workaround for it
# being undefined in this environment — confirm the intended data location.
__file__ = "data/20ng-test-all-terms.txt"

In [38]:
# Run training; keep the evaluation average (classification error) and
# discard the loss average.
error, _ = train_sequence_classifier()

<class 'cntk.variables.Variable'>


RuntimeError: error opening file '/home/vlad/work/text_clas/data/../../../Tests/EndToEndTests/Text/SequenceClassification/Data/Train.ctf': No such file or directory

[CALL STACK]
[0x7fdafbdb7a5c]                                                       + 0x52ba5c
[0x7fdafb063c1b]    fopenOrDie  (std::basic_string<wchar_t,std::char_traits<wchar_t>,std::allocator<wchar_t>> const&,  wchar_t const*) + 0x22b
[0x7fda401f3795]    Microsoft::MSR::CNTK::TextParser<float>::Initialize()::{lambda()#1}::  operator()  () const + 0x3b5
[0x7fda401f3953]    Microsoft::MSR::CNTK::TextParser<float>::  Initialize  () + 0x43
[0x7fda401fd49b]    Microsoft::MSR::CNTK::TextParser<float>::  TextParser  (std::shared_ptr<Microsoft::MSR::CNTK::CorpusDescriptor>,  Microsoft::MSR::CNTK::TextConfigHelper const&,  bool) + 0x8b
[0x7fda401eac2e]    CreateDeserializer                                 + 0x1be
[0x7fda4042a10b]    Microsoft::MSR::CNTK::CompositeDataReader::  CreateDeserializer  (Microsoft::MSR::CNTK::ConfigParameters const&,  bool) + 0x1cb
[0x7fda4042a53e]    Microsoft::MSR::CNTK::CompositeDataReader::  CreateDeserializers  (Microsoft::MSR::CNTK::ConfigParameters const&) + 0x25e
[0x7fda4042aece]    Microsoft::MSR::CNTK::CompositeDataReader::  CompositeDataReader  (Microsoft::MSR::CNTK::ConfigParameters const&) + 0x79e
[0x7fda40434f06]    CreateCompositeDataReader                          + 0x26
[0x7fdafbe12959]    CNTK::CompositeMinibatchSource::  CompositeMinibatchSource  (CNTK::MinibatchSourceConfig const&) + 0x629
[0x7fdafbe131e7]    CNTK::  CreateCompositeMinibatchSource  (CNTK::MinibatchSourceConfig const&) + 0x27
[0x7fdafc7872f9]                                                       + 0xea2f9
[0x7fdb278c65e9]    PyCFunction_Call                                   + 0xf9
[0x7fdb2794dbd5]    PyEval_EvalFrameEx                                 + 0x8fb5
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ecd8]    PyEval_EvalCodeEx                                  + 0x48
[0x7fdb278a4661]                                                       + 0x9a661
[0x7fdb27871236]    PyObject_Call                                      + 0x56
[0x7fdb2788d77c]                                                       + 0x8377c
[0x7fdb27871236]    PyObject_Call                                      + 0x56
[0x7fdb278e24c3]                                                       + 0xd84c3
[0x7fdb278d8daf]                                                       + 0xcedaf
[0x7fdb27871236]    PyObject_Call                                      + 0x56
[0x7fdb2794b314]    PyEval_EvalFrameEx                                 + 0x66f4
[0x7fdb2794e166]    PyEval_EvalFrameEx                                 + 0x9546 (x2)
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ecd8]    PyEval_EvalCodeEx                                  + 0x48
[0x7fdb2794ed1b]    PyEval_EvalCode                                    + 0x3b
[0x7fdb27941dfe]                                                       + 0x137dfe
[0x7fdb278c65e9]    PyCFunction_Call                                   + 0xf9
[0x7fdb2794dbd5]    PyEval_EvalFrameEx                                 + 0x8fb5
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ddf5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ddf5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ecd8]    PyEval_EvalCodeEx                                  + 0x48
[0x7fdb278a4661]                                                       + 0x9a661
[0x7fdb27871236]    PyObject_Call                                      + 0x56
[0x7fdb2794b234]    PyEval_EvalFrameEx                                 + 0x6614
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ddf5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ddf5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fdb2794e166]    PyEval_EvalFrameEx                                 + 0x9546 (x2)
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ecd8]    PyEval_EvalCodeEx                                  + 0x48
[0x7fdb278a4661]                                                       + 0x9a661
[0x7fdb27871236]    PyObject_Call                                      + 0x56
[0x7fdb2794b234]    PyEval_EvalFrameEx                                 + 0x6614
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ecd8]    PyEval_EvalCodeEx                                  + 0x48
[0x7fdb278a4661]                                                       + 0x9a661
[0x7fdb27871236]    PyObject_Call                                      + 0x56
[0x7fdb2794b234]    PyEval_EvalFrameEx                                 + 0x6614
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ddf5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fdb2794e166]    PyEval_EvalFrameEx                                 + 0x9546
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ecd8]    PyEval_EvalCodeEx                                  + 0x48
[0x7fdb278a4661]                                                       + 0x9a661
[0x7fdb27871236]    PyObject_Call                                      + 0x56
[0x7fdb2794b234]    PyEval_EvalFrameEx                                 + 0x6614
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ddf5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fdb2794e166]    PyEval_EvalFrameEx                                 + 0x9546
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ddf5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fdb2794e166]    PyEval_EvalFrameEx                                 + 0x9546
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ddf5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ecd8]    PyEval_EvalCodeEx                                  + 0x48
[0x7fdb2794ed1b]    PyEval_EvalCode                                    + 0x3b
[0x7fdb27941dfe]                                                       + 0x137dfe
[0x7fdb278c65e9]    PyCFunction_Call                                   + 0xf9
[0x7fdb2794dbd5]    PyEval_EvalFrameEx                                 + 0x8fb5
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ddf5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fdb2794eb49]                                                       + 0x144b49
[0x7fdb2794ecd8]    PyEval_EvalCodeEx                                  + 0x48
[0x7fdb278a4542]                                                       + 0x9a542
[0x7fdb27871236]    PyObject_Call                                      + 0x56
[0x7fdb2798f8a2]                                                       + 0x1858a2
[0x7fdb27990565]    Py_Main                                            + 0x945
[0x400add]          main                                               + 0x15d
[0x7fdb2692c830]    __libc_start_main                                  + 0xf0
[0x4008b9]                                                            
