In [1]:
import os
import numpy as np
import mxnet as mx

In [2]:
data_dir = 'data'

def Perplexity(label, pred):
    """ Calculates prediction perplexity
    Args:
        label (mx.nd.array): labels array
        pred (mx.nd.array): prediction array
    Returns:
        float: calculated perplexity
    """
    # collapse the time, batch dimension
    label = label.reshape((-1,))
    pred = pred.reshape((-1, pred.shape[-1]))

    loss = 0.
    for i in range(pred.shape[0]):
        loss += -np.log(max(1e-10, pred[i][int(label[i])]))
    return np.exp(loss / label.size)

def default_read_content(path):
    with open(path) as ins:
        return ins.read()[:-1]

def default_build_vocab(path):
    content = default_read_content(path)
    content = content.replace('\n', ',').split(',')
    return map(lambda d: int(d), set(content))

def build_paths(path, buckets):
    content = default_read_content(path)
    paths = content.split('\n')
    data = []
    
    for path in paths:
        path = map(lambda d: int(d), path.split(','))

        if len(path) == 0:
            continue
        
        data.append(path)
    
    return data

In [3]:
batch_size = 32
buckets = [11, 21, 31, 41]
num_hidden = 200
num_embed = 200
num_layers = 2

num_epoch = 20
learning_rate = 0.01
momentum = 0.9
invalid_label = 0

# Update count per available GPUs
gpu_count = 1
contexts = [mx.context.gpu(i) for i in range(gpu_count)]
# contexts = mx.cpu()
vocab = default_build_vocab(os.path.join(data_dir, 'path_train.txt'))
vocab.append(0)
len(vocab)

214

In [4]:
train_paths = build_paths(os.path.join(data_dir, 'path_train.txt'), buckets)
val_paths = build_paths(os.path.join(data_dir, 'path_val.txt'), buckets)
test_paths = build_paths(os.path.join(data_dir, 'path_test.txt'), buckets)

In [5]:
data_train  = mx.rnn.BucketSentenceIter(train_paths, batch_size, buckets=buckets,
                                            invalid_label=invalid_label)
data_val    = mx.rnn.BucketSentenceIter(val_paths, batch_size, buckets=buckets,
                                            invalid_label=invalid_label)



In [31]:
stack = mx.rnn.SequentialRNNCell()

for i in range(num_layers):
    stack.add(mx.rnn.LSTMCell(num_hidden=num_hidden, prefix='lstm_l%d_'%i))

def sym_gen(seq_len):
    data = mx.sym.Variable('data')
    label = mx.sym.Variable('softmax_label')
    embed = mx.sym.Embedding(data=data, input_dim=len(vocab),
                             output_dim=num_embed, name='embed')

    stack.reset()
    outputs, states = stack.unroll(seq_len, inputs=embed, merge_outputs=True)

    pred = mx.sym.Reshape(outputs, shape=(-1, num_hidden))
    pred = mx.sym.FullyConnected(data=pred, num_hidden=len(vocab), name='pred')

    label = mx.sym.Reshape(label, shape=(-1,))
    pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax')

    return pred, ('data',), ('softmax_label',)

In [21]:
data_train.default_bucket_key

41

In [36]:
gpu_count = 1
contexts = [mx.context.gpu(i) for i in range(gpu_count)]

model = mx.mod.BucketingModule(
        sym_gen             = sym_gen,
        default_bucket_key  = data_train.default_bucket_key,
        context             = contexts)

In [43]:
import logging

head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)

# TODO: add epoch_end_callback

model.fit(
    train_data          = data_train,
    eval_data           = data_val,
    eval_metric         = mx.metric.np(Perplexity),
    optimizer           = 'sgd',
    optimizer_params    = { 'learning_rate': 0.01,
                            'momentum': 0.9,
                            'wd': 0.00001 },
    initializer         = mx.init.Xavier(factor_type="in", magnitude=2.34),
    num_epoch           = 2,
    batch_end_callback  = mx.callback.Speedometer(batch_size, 50))



In [111]:
data = [[122, 9]]
dataiter = mx.io.NDArrayIter(data, None, 1, True)

In [112]:
model.predict(dataiter)

MXNetError: Invalid Parameter format for num_outputs expect int but value='None', in operator SliceChannel(name="", squeeze_axis="1", axis="1", num_outputs="None")

In [98]:
data_pred = mx.rnn.BucketSentenceIter(test_paths, batch_size=batch_size, buckets=buckets,
                                            invalid_label=invalid_label)



In [100]:
tmp = model.predict(data_pred)

In [103]:
tmp.reshape(-1, batch_size, len(vocab))

TypeError: reshape() takes exactly 2 arguments (4 given)

In [20]:
data_val    = mx.rnn.BucketSentenceIter(val_paths, batch_size, buckets=buckets,
                                            invalid_label=invalid_label)

for batch in data_val:
    print(batch.pad)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [25]:
def default_gen_buckets(sentences, batch_size):
    len_dict = {}
    max_len = -1
    for sentence in sentences:
        words = sentence
        if len(words) == 0:
            continue
        if len(words) > max_len:
            max_len = len(words)
        if len(words) in len_dict:
            len_dict[len(words)] += 1
        else:
            len_dict[len(words)] = 1
    print(len_dict)

    tl = 0
    buckets = []
    for l, n in len_dict.items(): # TODO: There are better heuristic ways to do this
        if n + tl >= batch_size:
            buckets.append(l)
            tl = 0
        else:
            tl += n
    if tl > 0:
        buckets.append(max_len)
    return buckets

In [26]:
default_gen_buckets(val_paths, 32)

{2: 33, 3: 59, 4: 65, 5: 76, 6: 75, 7: 119, 8: 140, 9: 167, 10: 143, 11: 162, 12: 153, 13: 171, 14: 173, 15: 151, 16: 152, 17: 160, 18: 117, 19: 107, 20: 89, 21: 88, 22: 88, 23: 76, 24: 58, 25: 47, 26: 40, 27: 24, 28: 31, 29: 19, 30: 17, 31: 17, 32: 8, 33: 5, 34: 7, 35: 2, 36: 1, 37: 3, 38: 1}


[2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 28,
 30,
 34,
 38]