In [1]:
from mxnet import nd
import random
import zipfile

In [3]:
with zipfile.ZipFile('../data/jaychou_lyrics.txt.zip') as zin:
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars = f.read().decode('utf-8')
corpus_chars[:40]

'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每'

In [4]:
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[0:10000]

In [8]:
idx_to_char = list(set(corpus_chars))
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
vocab_size

1027

In [15]:
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:', ''.join(idx_to_char[idx] for idx in sample))
print('indices:', sample)

chars: 想要有直升机 想要和你飞到宇宙去 想要和
indices: [329, 38, 388, 857, 411, 373, 608, 329, 38, 518, 502, 972, 885, 131, 381, 574, 608, 329, 38, 518]


In [19]:
idx_to_char[329]

'想'

In [26]:
def data_iter_random(corpus_indics, batch_size, num_steps, ctx=None):
    num_examples = (len(corpus_indics) - 1) // num_steps
    epoch_size = num_examples // batch_size
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)

    def _data(pos):
        return corpus_indices[pos: pos + num_steps]

    for i in range(epoch_size):
        i = i * batch_size
        batch_indices = example_indices[i:i + batch_size]
        X = [_data(j * num_steps) for j in batch_indices]
        Y = [_data(j * num_steps + 1) for j in batch_indices]
        yield nd.array(X, ctx), nd.array(Y, ctx)

In [27]:
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X:', X, '\nY', Y, '\n')
X

X: 
[[608. 329.  38. 518. 502. 972.]
 [885. 131. 381. 574. 608. 329.]]
<NDArray 2x6 @cpu(0)> 
Y 
[[329.  38. 518. 502. 972. 885.]
 [131. 381. 574. 608. 329.  38.]]
<NDArray 2x6 @cpu(0)> 

X: 
[[ 38. 518. 502. 775. 202. 926.]
 [329.  38. 388. 857. 411. 373.]]
<NDArray 2x6 @cpu(0)> 
Y 
[[518. 502. 775. 202. 926. 408.]
 [ 38. 388. 857. 411. 373. 608.]]
<NDArray 2x6 @cpu(0)> 




[[ 38. 518. 502. 775. 202. 926.]
 [329.  38. 388. 857. 411. 373.]]
<NDArray 2x6 @cpu(0)>

In [30]:
#相邻采样
def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    corpus_indices = nd.array(corpus_indices, ctx=ctx)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    indices = corpus_indices[0: batch_size*batch_len].reshape((
        batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:, i:i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y

In [31]:
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X:', X, '\nY', Y, '\n')
X

X: 
[[ 0.  1.  2.  3.  4.  5.]
 [15. 16. 17. 18. 19. 20.]]
<NDArray 2x6 @cpu(0)> 
Y 
[[ 1.  2.  3.  4.  5.  6.]
 [16. 17. 18. 19. 20. 21.]]
<NDArray 2x6 @cpu(0)> 

X: 
[[ 6.  7.  8.  9. 10. 11.]
 [21. 22. 23. 24. 25. 26.]]
<NDArray 2x6 @cpu(0)> 
Y 
[[ 7.  8.  9. 10. 11. 12.]
 [22. 23. 24. 25. 26. 27.]]
<NDArray 2x6 @cpu(0)> 




[[ 6.  7.  8.  9. 10. 11.]
 [21. 22. 23. 24. 25. 26.]]
<NDArray 2x6 @cpu(0)>