# RNN实验

## 导入包

In [1]:
# -*- coding:utf-8 -*-
# 导入包
from mxnet import nd
import random
import zipfile

## 处理数据

In [5]:
with open('./data/JAY.txt', 'r', encoding='UTF-8') as f:
    corpus_chars = f.read()

# print(type(corpus_chars))
# print(corpus_chars[:40])
print(len(corpus_chars))
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
print(len(corpus_chars))
# print(corpus_chars)
corpus_chars = corpus_chars[:10000]

102085
102085


## 建立字符索引

### 生成 [字符：索引] 的对应

In [11]:
idx_to_char = list(set(corpus_chars))
# 以元组作为输入可以构造字典的键值对
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
vocab_size

1194

### 将语料转为索引表示

In [12]:
corpus_indices = [char_to_idx[char] for char in corpus_chars]  # corpus_indices是一个list
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

chars: 歌曲名称：反方向的钟 搜索试听   所属
indices: [637, 555, 254, 669, 259, 617, 205, 886, 731, 638, 827, 959, 848, 1082, 631, 827, 827, 827, 1061, 663]


## 时序数据采样

### 随机采样

In [13]:
# X,Y数据读入函数
def data_iter_random(corpus_indices, batch_size, num_steps, ctx=None):
    # -1
    num_examples = (len(corpus_indices)-1) // num_steps
    epoch_size = num_examples // batch_size
    examples_indices = list(range(num_examples))
    random.shuffle(examples_indices)
    
    # 返回从pos开始的长为num_steps的序列
    def _data(pos):
        return corpus_indices[pos: pos+num_steps]
    
    for i in range(epoch_size):
        # 使用shuffle过的examples_indices进行随机抽样
        i = i * batch_size
        batch_indices = examples_indices[i: i + batch_size]
        X = [_data(j * num_steps) for j in batch_indices]
        Y = [_data(j * num_steps + 1) for j in batch_indices]
        yield nd.array(X, ctx), nd.array(Y, ctx)

In [14]:
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY', Y, '\n')

X:  
[[ 6.  7.  8.  9. 10. 11.]
 [18. 19. 20. 21. 22. 23.]]
<NDArray 2x6 @cpu(0)> 
Y 
[[ 7.  8.  9. 10. 11. 12.]
 [19. 20. 21. 22. 23. 24.]]
<NDArray 2x6 @cpu(0)> 

X:  
[[ 0.  1.  2.  3.  4.  5.]
 [12. 13. 14. 15. 16. 17.]]
<NDArray 2x6 @cpu(0)> 
Y 
[[ 1.  2.  3.  4.  5.  6.]
 [13. 14. 15. 16. 17. 18.]]
<NDArray 2x6 @cpu(0)> 



### 相邻采样

In [20]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    corpus_indices = nd.array(corpus_indices, ctx=ctx)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size 
    indices = corpus_indices[0: batch_size*batch_len].reshape((
        batch_size, 
        batch_len)) 
    epoch_size = (batch_len - 1) // num_steps 
    for i in range(epoch_size): 
        i = i * num_steps 
        X = indices[:, i: i + num_steps] 
        Y = indices[:, i + 1: i + num_steps + 1] 
        yield X, Y 
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY', Y, '\n')

X:  
[[ 0.  1.  2.  3.  4.  5.]
 [15. 16. 17. 18. 19. 20.]]
<NDArray 2x6 @cpu(0)> 
Y 
[[ 1.  2.  3.  4.  5.  6.]
 [16. 17. 18. 19. 20. 21.]]
<NDArray 2x6 @cpu(0)> 

X:  
[[ 6.  7.  8.  9. 10. 11.]
 [21. 22. 23. 24. 25. 26.]]
<NDArray 2x6 @cpu(0)> 
Y 
[[ 7.  8.  9. 10. 11. 12.]
 [22. 23. 24. 25. 26. 27.]]
<NDArray 2x6 @cpu(0)> 



## 总结
batch_size是指一次选几个样本放进网络训练，但是对于上述的数据处理中，单字的样本会被组织成6个连续字组成的一个序列。所以在使用batch_size进行数据的分批时，要先把握好将数据每6个分成一个时间步数的原则，再考虑batch_size和epoch_size

## 问题

- 如果希望序列样本是一个完整句子，小批量采样会有什么问题
    1. 每个样本长度不同