In [1]:

with open("../data/timemachine.txt") as f:
    time_machine = f.read()
print(time_machine[0:500])

The Time Machine, by H. G. Wells [1898]




I


The Time Traveller (for so it will be convenient to speak of him)
was expounding a recondite matter to us. His grey eyes shone and
twinkled, and his usually pale face was flushed and animated. The
fire burned brightly, and the soft radiance of the incandescent
lights in the lilies of silver caught the bubbles that flashed and
passed in our glasses. Our chairs, being his patents, embraced and
caressed us rather than submitted to be sat upon, and the


In [2]:
time_machine = time_machine.lower().replace('\n', '').replace('\r', '')
time_machine = time_machine[0:10000]

In [3]:
character_list = list(set(time_machine))
character_dict = dict([(char,i) for i,char in enumerate(character_list)])

vocab_size = len(character_dict)

print('vocab size:', vocab_size)
print(character_dict)

vocab size: 43
{'n': 0, 'u': 1, 'h': 2, '!': 3, 'x': 4, '1': 5, '-': 6, ')': 7, 'r': 8, 'e': 9, 'b': 10, 'j': 11, 's': 12, ' ': 13, ']': 14, ':': 15, '?': 16, 'm': 17, 'd': 18, 'f': 19, "'": 20, 'y': 21, 't': 22, 'i': 23, 'g': 24, 'c': 25, '(': 26, '8': 27, 'z': 28, 'l': 29, 'v': 30, 'o': 31, '9': 32, '.': 33, 'w': 34, 'k': 35, ',': 36, '_': 37, ';': 38, '[': 39, 'p': 40, 'a': 41, 'q': 42}


In [4]:
time_numerical = [character_dict[char] for char in time_machine]

sample = time_numerical[:40]

print('chars: \n', ''.join([character_list[idx] for idx in sample]))
print('\nindices: \n', sample)


chars: 
 the time machine, by h. g. wells [1898]i

indices: 
 [22, 2, 9, 13, 22, 23, 17, 9, 13, 17, 41, 25, 2, 23, 0, 9, 36, 13, 10, 21, 13, 2, 33, 13, 24, 33, 13, 34, 9, 29, 29, 12, 13, 39, 5, 27, 32, 27, 14, 23]


In [5]:
import random
from mxnet import nd

def data_iter(batch_size, seq_len, ctx=None):
    num_examples = (len(time_numerical)-1) // seq_len
    num_batches = num_examples // batch_size
    # 随机化样本
    idx = list(range(num_examples))
    random.shuffle(idx)
    # 返回seq_len个数据
    def _data(pos):
        return time_numerical[pos:pos+seq_len]
    for i in range(num_batches):
        # 每次读取batch_size个随机样本
        i = i * batch_size
        examples = idx[i:i+batch_size]
        data = nd.array(
            [_data(j*seq_len) for j in examples], ctx=ctx)
        label = nd.array(
            [_data(j*seq_len+1) for j in examples], ctx=ctx)
        yield data, label

In [6]:
for data, label in data_iter(batch_size=3, seq_len=8):
    print('data: ', data, '\n\nlabel:', label)
    break

data:  
[[ 22.   2.   9.  18.  23.  17.   9.   0.]
 [ 22.   2.  13.  41.  13.  29.   9.  41.]
 [ 40.   9.   8.  23.  17.   9.   0.  22.]]
<NDArray 3x8 @cpu(0)> 

label: 
[[  2.   9.  18.  23.  17.   9.   0.  12.]
 [  2.  13.  41.  13.  29.   9.  41.   0.]
 [  9.   8.  23.  17.   9.   0.  22.  41.]]
<NDArray 3x8 @cpu(0)>


In [7]:
nd.one_hot(nd.array([0,4]), vocab_size)


[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.]]
<NDArray 2x43 @cpu(0)>

In [8]:
def get_inputs(data):
    return [nd.one_hot(X, vocab_size) for X in data.T]

inputs = get_inputs(data)
print('input length: ',len(inputs))
print('input[0] shape: ', inputs[0].shape)

input length:  8
input[0] shape:  (3, 43)


In [9]:
import mxnet as mx

# 尝试使用 GPU
import sys
sys.path.append('..')
import utils
ctx = utils.try_gpu()
print('Will use ', ctx)

num_hidden = 256
weight_scale = .01

# 隐含层
Wxh = nd.random_normal(shape=(vocab_size,num_hidden), ctx=ctx) * weight_scale
Whh = nd.random_normal(shape=(num_hidden,num_hidden), ctx=ctx) * weight_scale
bh = nd.zeros(num_hidden, ctx=ctx)
# 输出层
Why = nd.random_normal(shape=(num_hidden,vocab_size), ctx=ctx) * weight_scale
by = nd.zeros(vocab_size, ctx=ctx)

params = [Wxh, Whh, bh, Why, by]
for param in params:
    param.attach_grad()

Will use  gpu(0)


In [10]:
def rnn(inputs, H):
    # inputs: seq_len 个 batch_size x vocab_size 矩阵
    # H: batch_size x num_hidden 矩阵
    # outputs: seq_len 个 batch_size x vocab_size 矩阵
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, Wxh) + nd.dot(H, Whh) + bh)
        Y = nd.dot(H, Why) + by
        outputs.append(Y)
    return (outputs, H)

In [11]:
state = nd.zeros(shape=(data.shape[0], num_hidden), ctx=ctx)
outputs, state_new = rnn(get_inputs(data.as_in_context(ctx)), state)

print('output length: ',len(outputs))
print('output[0] shape: ', outputs[0].shape)
print('state shape: ', state_new.shape)

output length:  8
output[0] shape:  (3, 43)
state shape:  (3, 256)


In [12]:
def predict(prefix, num_chars):
    # 预测以 prefix 开始的接下来的 num_chars 个字符
    prefix = prefix.lower()
    state = nd.zeros(shape=(1, num_hidden), ctx=ctx)
    output = [character_dict[prefix[0]]]
    for i in range(num_chars+len(prefix)):
        X = nd.array([output[-1]], ctx=ctx)
        Y, state = rnn(get_inputs(X), state)
        #print(Y)
        if i < len(prefix)-1:
            next_input = character_dict[prefix[i+1]]
        else:
            next_input = int(Y[0].argmax(axis=1).asscalar())
        output.append(next_input)
    return ''.join([character_list[i] for i in output])

In [13]:

def grad_clipping(params, theta):
    norm = nd.array([0.0], ctx)
    for p in params:
        norm += nd.sum(p.grad ** 2)
    norm = nd.sqrt(norm).asscalar()
    if norm > theta:
        for p in params:
            p.grad[:] *= theta/norm

In [14]:
from mxnet import autograd
from mxnet import gluon
from math import exp

epochs = 200
seq_len = 35
learning_rate = .1
batch_size = 32

softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

for e in range(epochs+1):
    train_loss, num_examples = 0, 0
    state = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx)
    for data, label in data_iter(batch_size, seq_len, ctx):
        with autograd.record():
            outputs, state = rnn(get_inputs(data), state)
            # reshape label to (batch_size*seq_len, )
            # concate outputs to (batch_size*seq_len, vocab_size)
            label = label.T.reshape((-1,))
            outputs = nd.concat(*outputs, dim=0)
            loss = softmax_cross_entropy(outputs, label)
        loss.backward()

        grad_clipping(params, 5)
        utils.SGD(params, learning_rate)

        train_loss += nd.sum(loss).asscalar()
        num_examples += loss.size

    if e % 20 == 0:
        print("Epoch %d. PPL %f" % (e, exp(train_loss/num_examples)))
        print(' - ', predict('The Time Ma', 100))
        print(' - ', predict("The Medical Man rose, came to the lamp,", 100), '\n')


Epoch 0. PPL 31.024387
 -  the time maeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
 -  the medical man rose, came to the lamp,eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee 

Epoch 20. PPL 11.102707
 -  the time mathe the the the the the the the the the the the the the the the the the the the the the the the the t
 -  the medical man rose, came to the lamp, and and and and and and and and and and and and and and and and and and and and and and and and and  

Epoch 40. PPL 9.391547
 -  the time mave the the the the the the the the the the the the the the the the the the the the the the the the th
 -  the medical man rose, came to the lamp, and and and and and and and and and and and and and and and and and and and and and and and and and  

Epoch 60. PPL 8.295995
 -  the time mave the the the the the the the the the the the the the the the the the the the the the the the the t