In [2]:
import logging
import os

import mxnet as mx
import numpy as np

from bucket_io import BucketSentenceIter, default_build_vocab

# Directory (relative to the notebook) holding the ptb.train/valid/test .txt
# files that the later cells join onto this path.
data_dir = 'data'

def Perplexity(label, pred):
    """Calculate the perplexity of predicted distributions against labels.

    Intended to be wrapped with ``mx.metric.np``, which hands the function
    NumPy arrays (not ``mx.nd`` arrays).

    Args:
        label (numpy.ndarray): target class ids of any shape; flattened
            internally. Values are truncated to integers before indexing.
        pred (numpy.ndarray): predicted probabilities. The last axis indexes
            the classes; all leading axes are collapsed so that row ``i``
            corresponds to flattened label ``i``.
    Returns:
        float: ``exp`` of the mean negative log-probability assigned to the
        labelled classes, with every probability floored at ``1e-10``.
    Raises:
        ZeroDivisionError: if ``label`` is empty.
        IndexError: if there are fewer labels than prediction rows, or a
            label is out of range for the class axis.
    """

    # collapse the time, batch dimension
    label = label.reshape((-1,))
    pred = pred.reshape((-1, pred.shape[-1]))

    num_rows = pred.shape[0]
    if label.size < num_rows:
        # Without this check a single label would silently broadcast
        # against every prediction row in the fancy index below.
        raise IndexError('got %d labels for %d prediction rows'
                         % (label.size, num_rows))

    # Pick pred[i, label[i]] for every row in one vectorised gather instead
    # of a Python-level loop over all time*batch entries.
    class_ids = label[:num_rows].astype(np.int64)
    picked = pred[np.arange(num_rows), class_ids].astype(np.float64)

    # np.fmax (rather than np.maximum) so that a NaN probability is also
    # replaced by the floor instead of poisoning the whole metric.
    loss = float(-np.log(np.fmax(picked, 1e-10)).sum())

    # Plain Python division: an empty batch raises ZeroDivisionError rather
    # than quietly returning NaN.
    return np.exp(loss / label.size)

In [3]:
# ---- data / model dimensions ----
batch_size = 128
buckets = [10, 20, 30, 40, 50, 60]   # sequence-length buckets
num_hidden = 200
num_embed = 200
num_lstm_layer = 2

# ---- optimisation settings ----
num_epoch = 2
learning_rate = 0.01
momentum = 0.0

# ---- devices: set gpu_count to the number of GPUs to train on ----
gpu_count = 1
contexts = [mx.context.gpu(i) for i in range(gpu_count)]

# The vocabulary is built from the training split and shared by every iterator.
vocab = default_build_vocab(os.path.join(data_dir, 'ptb.train.txt'))

# Descriptors for the initial hidden / cell states that each iterator is
# given alongside the token data.
init_h = [mx.io.DataDesc(name='LSTM_state',
                         shape=(num_lstm_layer, batch_size, num_hidden),
                         layout='TNC')]
init_c = [mx.io.DataDesc(name='LSTM_state_cell',
                         shape=(num_lstm_layer, batch_size, num_hidden),
                         layout='TNC')]
init_states = init_c + init_h

data_train = BucketSentenceIter(
    os.path.join(data_dir, 'ptb.train.txt'),
    vocab, buckets, batch_size, init_states,
    time_major=True,
)
data_val = BucketSentenceIter(
    os.path.join(data_dir, 'ptb.valid.txt'),
    vocab, buckets, batch_size, init_states,
    time_major=True,
)

def sym_gen(seq_len):
    """Build the MXNet symbol of the LSTM language model for one bucket.

    Args:
        seq_len (int): number of time steps of the input sequences
    Returns:
        tuple: (symbol, data_names, label_names)
    """
    vocab_size = len(vocab)

    tokens = mx.sym.Variable('data')
    targets = mx.sym.Variable('softmax_label')
    embedded = mx.sym.Embedding(data=tokens, input_dim=vocab_size,
                                output_dim=num_embed, name='embed')

    # TODO(tofix)
    # All LSTM parameters are currently concatenated into one big vector
    # named '<name>_parameters'. The default mxnet initializer does not know
    # how to initialize it, because the name does not end with _weight,
    # _bias or any other suffix it recognises. As a temporary workaround we
    # create the variable ourselves and call it LSTM_bias so the demo runs.
    # Biases are initialized to zeros by default, so this is not a good
    # scheme. Naming it LSTM_weight does not work either: the vector is 1D,
    # while the weight initialization scheme needs at least two dimensions.
    fused_params = mx.sym.Variable('LSTM_bias')

    # The RNN op expects input of shape (time, batch, feature).
    # `parameters` could be left out if the workaround above were not needed.
    lstm_out = mx.sym.RNN(data=embedded, state_size=num_hidden,
                          num_layers=num_lstm_layer, mode='lstm',
                          name='LSTM', parameters=fused_params)

    # The RNN output has shape (time, batch, dim). When the states and cell
    # states of the last time step are needed (e.g. for encoder-decoder
    # models), state_outputs=True adds extra outputs: rnn['LSTM_output'],
    # rnn['LSTM_state'] and, for LSTM, rnn['LSTM_state_cell'].

    # Merge the time and batch axes so one linear layer scores every step.
    flat_hidden = mx.sym.Reshape(data=lstm_out, shape=(-1, num_hidden))
    logits = mx.sym.FullyConnected(data=flat_hidden, num_hidden=vocab_size,
                                   name='pred')

    # Restore the (time, batch, vocab) layout so it lines up with the labels.
    logits_tm = mx.sym.Reshape(data=logits, shape=(seq_len, -1, vocab_size))
    softmax = mx.sym.SoftmaxOutput(data=logits_tm, label=targets,
                                   preserve_shape=True, name='softmax')

    return (softmax,
            ['data', 'LSTM_state', 'LSTM_state_cell'],
            ['softmax_label'])

# With a single bucket there is only one sequence length, so a plain Module
# built from that one symbol is enough; otherwise let BucketingModule call
# sym_gen per bucket key.
if len(buckets) == 1:
    mod = mx.mod.Module(*sym_gen(buckets[0]), context=contexts)
else:
    mod = mx.mod.BucketingModule(sym_gen,
                                 default_bucket_key=data_train.default_bucket_key,
                                 context=contexts)

# Configure logging before fit() so the per-batch speed / perplexity lines
# and the per-epoch summaries are shown with a timestamp.
# (`logging` is imported with the other imports at the top of the notebook.)
head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)

# Train with plain SGD; Perplexity is wrapped by mx.metric.np so it can be
# used as the evaluation metric, and Speedometer is set to report every
# 50 batches.
mod.fit(data_train, eval_data=data_val, num_epoch=num_epoch,
        eval_metric=mx.metric.np(Perplexity),
        batch_end_callback=mx.callback.Speedometer(batch_size, 50),
        initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
        optimizer='sgd',
        optimizer_params={'learning_rate': learning_rate,
                          'momentum': momentum, 'wd': 0.00001})

bucket of len  10 : 19479 samples
bucket of len  20 : 19336 samples
bucket of len  30 : 12208 samples
bucket of len  40 : 3962 samples
bucket of len  50 : 845 samples
bucket of len  60 : 160 samples
bucket of len  10 : 1531 samples
bucket of len  20 : 1518 samples
bucket of len  30 : 980 samples
bucket of len  40 : 322 samples
bucket of len  50 : 65 samples
bucket of len  60 : 10 samples


2018-02-22 16:29:24,340 Epoch[0] Batch [50]	Speed: 1477.57 samples/sec	Perplexity=4431.916907
2018-02-22 16:29:28,377 Epoch[0] Batch [100]	Speed: 1586.05 samples/sec	Perplexity=681.280784
2018-02-22 16:29:32,404 Epoch[0] Batch [150]	Speed: 1589.74 samples/sec	Perplexity=523.909548
2018-02-22 16:29:36,907 Epoch[0] Batch [200]	Speed: 1422.07 samples/sec	Perplexity=440.165320
2018-02-22 16:29:40,953 Epoch[0] Batch [250]	Speed: 1582.71 samples/sec	Perplexity=385.760612
2018-02-22 16:29:45,016 Epoch[0] Batch [300]	Speed: 1575.95 samples/sec	Perplexity=365.302179
2018-02-22 16:29:49,033 Epoch[0] Batch [350]	Speed: 1594.13 samples/sec	Perplexity=343.126742
2018-02-22 16:29:53,342 Epoch[0] Batch [400]	Speed: 1486.12 samples/sec	Perplexity=326.637865
2018-02-22 16:29:56,089 Epoch[0] Train-Perplexity=301.927836
2018-02-22 16:29:56,091 Epoch[0] Time cost=36.191
2018-02-22 16:29:58,630 Epoch[0] Validation-Perplexity=279.466159
2018-02-22 16:30:02,871 Epoch[1] Batch [50]	Speed: 1526.67 samples/sec	

In [5]:
# Test-split iterator, built with the same vocabulary, buckets and initial
# states as the training and validation iterators.
data_test = BucketSentenceIter(
    os.path.join(data_dir, 'ptb.test.txt'),
    vocab, buckets, batch_size, init_states,
    time_major=True,
)

bucket of len  10 : 1726 samples
bucket of len  20 : 1699 samples
bucket of len  30 : 1108 samples
bucket of len  40 : 355 samples
bucket of len  50 : 69 samples
bucket of len  60 : 14 samples


In [6]:
# Evaluate the trained module on the test split.
# A fresh metric object is created so the score is not mixed with values
# accumulated during training; mod.score() updates it in place.
perplexity = mx.metric.np(Perplexity)
mod.score(data_test, perplexity)
# Printing the metric shows its name and accumulated value.
print(perplexity)

EvalMetric: {'Perplexity': 226.84756651445085}


In [1]:
# Run the trained module over the test iterator and look at the shape of
# the predictions.
# NOTE(review): this cell needs `mod` from the training cell; the NameError
# in the saved output comes from running it in a kernel where `mod` had not
# been defined (execution count 1). Re-run the notebook top to bottom.
test_pred = mod.predict(data_test)
test_pred.shape

NameError: name 'mod' is not defined

(710L, 128L, 9959L)

In [71]:
# Peek at one encoded test sequence (index 1 of the first entry in
# data_test.data -- presumably the first bucket; confirm in bucket_io).
data_test.data[0][1]

array([   33.,    27.,    70.,   989.,  5436.,     0.,     0.,     0.,
           0.,     0.])

In [68]:
# Length of a sequence stored under data_test.data[0]; presumably equal to
# that bucket's length (buckets[0] == 10) -- confirm in bucket_io.
len(data_test.data[0][0])

10

In [9]:
# Id that the vocabulary assigns to the ' ' token.
# NOTE(review): presumably this is the padding id that fills the tail of
# short sequences -- confirm in bucket_io.
vocab[' ']

0