# 循环神经网络

## 1. 数据准备

将训练语料转换为字典形式

In [1]:
%matplotlib inline
import zipfile
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

In [11]:
def load_data_jay_lyrics(num_sample=10000):
    """
    加载周杰伦歌词数据集
    """
    # 读取数据集
    with open('../dataset/jaychou_lyrics.txt', encoding='utf-8') as f:
        corpus_chars = f.read()
    # 把换行符替换为空格
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[:num_sample]
    # 建立字符索引
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    # 将训练集中每个字符转换为索引
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    # 返回索引后的前num_sample个字符的文本，字符到索引的映射，索引到字符的映射，字符表大小
    return corpus_indices, char_to_idx, idx_to_char, vocab_size

In [12]:
corpus_indices, char_to_idx, idx_to_char, vocab_size = load_data_jay_lyrics()

In [13]:
vocab_size

1027

In [14]:
len(corpus_indices)   # 10000

10000

In [15]:
char_to_idx['风']

751

In [16]:
idx_to_char[371]

'映'

## 2. 采样

### 2.1 随机采样

下⾯的代码每次从数据⾥随机采样⼀个小批量。其中批量⼤小batch_size指每个小批量的样本数，num_steps为每个样本所包含的时间步数。在随机采样中，每个样本是原始序列上任意截取的⼀段序列。相邻的两个随机小批量在原始序列上的位置不⼀定相毗邻。因此，我们⽆法⽤⼀个小批量最终时间步的隐藏状态来初始化下⼀个小批量的隐藏状态。在训练模型时，每次随机采样前都需要重新初始化隐藏状态。

In [17]:
def data_iter_random(corpus_indices, batch_size, num_steps):
    '''
    corpus_indices: 词典按先后次序的索引
    batch_size: 每个批次的样本容量
    num_steps: 每个样本的长度
    '''
    num_examples = (len(corpus_indices) - 1) // num_steps  # 可取的样本数量
    epoch_size = num_examples // batch_size  # 总词汇数量 / (样本长度 * 样本数量)
    example_indices = list(range(num_examples))
    np.random.shuffle(example_indices)  # 打乱索引的顺序，即随机采样
      
    for i in range(epoch_size):
        # 每次读取batch_size个随机样本
        batch_indices = example_indices[i*batch_size: (i + 1)*batch_size ]
        X = [corpus_indices[j*num_steps: (j + 1)*num_steps] for j in batch_indices]
        Y = [corpus_indices[j*num_steps + 1: (j + 1)*num_steps + 1] for j in batch_indices]
        yield torch.IntTensor(X), torch.IntTensor(Y)

In [18]:
my_seq = list(range(30))
i = 0
for X, Y in data_iter_random(my_seq, batch_size=3, num_steps=6):
    print(i)
    print('X:\n', X, '\nY:\n', Y)
    i += 1

0
X:
 tensor([[ 6,  7,  8,  9, 10, 11],
        [18, 19, 20, 21, 22, 23],
        [12, 13, 14, 15, 16, 17]], dtype=torch.int32) 
Y:
 tensor([[ 7,  8,  9, 10, 11, 12],
        [19, 20, 21, 22, 23, 24],
        [13, 14, 15, 16, 17, 18]], dtype=torch.int32)


### 2.2 相邻采样

除对原始序列做随机采样之外，我们还可以令相邻的两个随机小批量在原始序列上的位置相毗邻。这时候，我们就可以用一个小批量最终时间步的隐藏状态来初始化下一个小批量的隐藏状态，从而使下一个小批量的输出也取决于当前小批量的输入，并如此循环下去。这对实现循环神经网络造成了两方面影响：一方面， 在训练模型时，我们只需在每一个迭代周期开始时初始化隐藏状态；另一方面，当多个相邻小批量通过传递隐藏状态串联起来时，模型参数的梯度计算将依赖所有串联起来的小批量序列。同一迭代周期中，随着迭代次数的增加，梯度的计算开销会越来越大。 为了使模型参数的梯度计算只依赖一次迭代读取的小批量序列，我们可以在每次读取小批量前将隐藏状态从计算图中分离出来。

In [19]:
# 本函数已保存在d2lzh包中方便以后使用
def data_iter_consecutive(corpus_indices, batch_size, num_steps):
    '''
    corpus_indices: 词典按先后次序的索引
    batch_size: 每个批次的样本容量
    num_steps: 每个样本的长度
    '''
    corpus_indices = np.array(corpus_indices)
    data_len = len(corpus_indices)  # 单词个数
    batch_len = data_len // batch_size  # 小批量的数量
    indices = corpus_indices[0: batch_size*batch_len].reshape(batch_size, batch_len)  # 先取总量，再塑形
    epoch_size = (batch_len - 1) // num_steps  # 批量数量
    for i in range(epoch_size):
        X = indices[:, i * num_steps: (i + 1) * num_steps]
        Y = indices[:, i * num_steps + 1: (i + 1) * num_steps + 1]
        yield torch.IntTensor(X), torch.IntTensor(Y)

In [20]:
my_seq = list(range(30))
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X:\n', X, '\nY:\n', Y)

X:
 tensor([[ 0,  1,  2,  3,  4,  5],
        [15, 16, 17, 18, 19, 20]], dtype=torch.int32) 
Y:
 tensor([[ 1,  2,  3,  4,  5,  6],
        [16, 17, 18, 19, 20, 21]], dtype=torch.int32)
X:
 tensor([[ 6,  7,  8,  9, 10, 11],
        [21, 22, 23, 24, 25, 26]], dtype=torch.int32) 
Y:
 tensor([[ 7,  8,  9, 10, 11, 12],
        [22, 23, 24, 25, 26, 27]], dtype=torch.int32)


## 2. `one_hot`编码

对语料中的每个不同单词进行`one_hot`编码

In [99]:
def one_hot(word_indices, vocab_size):
    '''
    word_indices: 需要编码的索引, torch.IntTensor
    vocab_size: 词典大小, scalar
    '''
    shape = list(word_indices.shape) + [vocab_size]
    res = torch.zeros(size=shape)
    if len(shape) == 2:
        res[range(shape[0]), word_indices] = 1
    elif len(shape) == 3:
        for i in range(shape[0]):
            for j in range(shape[1]):
                res[i, j, word_indices[i, j]] = 1
    else:
        print('X超过2维!')

    return res

In [100]:
x = torch.IntTensor([[1, 2],[2, 3],[3, 4]])
one_hot(x, 10)

tensor([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]]])

每次采样的小批量的形状是(批量大小, 时间步数)，以下函数将字符下标转换成字符的`one-hot`编码。

In [122]:
def to_onehot(X, size):
    """
    X: n*t, n为批量大小，t为时间步长
    返回包含t个矩阵的列表, x: (批量大小, 词典大小)
    """
    return [one_hot(x, size) for x in X.t()]

In [126]:
X = torch.arange(10).reshape(2, 5)
inputs = to_onehot(X, vocab_size)

In [127]:
len(inputs), inputs[0].shape  # 时间步, (小批量大小, 输入数量)

(5, torch.Size([2, 1027]))

初始化模型参数

In [105]:
def get_params(num_inputs, num_hiddens, num_outputs):
    # 隐藏层参数
    W_xh = torch.randn(num_inputs, num_hiddens) * 0.01
    W_hh = torch.randn(num_hiddens, num_hiddens) * 0.01
    b_h = torch.zeros(num_hiddens)

    # 输出层参数
    W_ho = torch.randn(num_hiddens, num_outputs) * 0.01
    b_o = torch.zeros(num_outputs)

    # 附上梯度
    W_xh.requires_grad_(True)
    W_hh.requires_grad_(True)
    b_h.requires_grad_(True)
    W_ho.requires_grad_(True)
    b_o.requires_grad_(True)
    return W_xh, W_hh, b_h, W_ho, b_o

初始化隐藏层的值

In [106]:
def init_rnn_hidden_state(batch_size, num_hiddens):
    return torch.zeros(batch_size, num_hiddens),

构建循环神经网络

In [108]:
def rnn(inputs, state, params):
    '''
    计算len(inputs)时间步， 初始状态为state，以及参数为params下的小批量样本对应的输出
    '''
    W_xh, W_hh, b_h, W_ho, b_o = params
    H, = state
    outputs = []
    for X in inputs: # X: batch_size * vocab_size
        H = torch.tanh(X@W_xh + H@W_hh + b_h)  # batch_size * hidden_size
        Y = H@W_ho + b_o  # batch_size * vocab_size
        outputs.append(Y)

    return outputs, (H, )

In [131]:
X = torch.arange(10).reshape(2, 5)  # 一个批次，样本量为2，时间步为5
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size  # 
state = init_rnn_hidden_state(X.shape[0], num_hiddens)
inputs = to_onehot(X, vocab_size)
params = get_params(num_inputs, num_hiddens, num_outputs)
outputs, state_new = rnn(inputs, state, params)

In [132]:
len(outputs), outputs[0].shape, state_new[0].shape

(5, torch.Size([2, 1027]), torch.Size([2, 256]))

In [134]:
inputs[0].shape, params[0].shape

(torch.Size([2, 1027]), torch.Size([1027, 256]))

In [136]:
outputs

[tensor([[-1.1714e-03, -8.4034e-04,  2.3666e-03,  ..., -1.2565e-03,
           7.3106e-04,  1.1451e-03],
         [-1.6279e-03, -6.2816e-05, -3.3580e-03,  ..., -1.8083e-03,
          -2.9904e-04, -8.0677e-04]], grad_fn=<AddBackward0>),
 tensor([[-0.0024, -0.0007,  0.0013,  ...,  0.0024, -0.0008, -0.0016],
         [-0.0008,  0.0005,  0.0001,  ...,  0.0021,  0.0011, -0.0017]],
        grad_fn=<AddBackward0>),
 tensor([[-5.3381e-04, -2.1063e-03, -4.5478e-04,  ..., -2.3647e-03,
           3.2638e-04,  1.3879e-03],
         [ 2.0157e-03, -2.9879e-05, -1.4161e-03,  ...,  2.1266e-04,
          -5.3675e-04, -3.8626e-04]], grad_fn=<AddBackward0>),
 tensor([[ 0.0021,  0.0020,  0.0008,  ...,  0.0008,  0.0013, -0.0006],
         [-0.0014, -0.0033,  0.0016,  ..., -0.0014,  0.0030,  0.0010]],
        grad_fn=<AddBackward0>),
 tensor([[-2.0557e-05, -2.4210e-04, -8.0187e-04,  ..., -1.6461e-03,
           4.6133e-03, -9.1111e-04],
         [ 6.3191e-04, -3.1657e-03,  3.3037e-03,  ...,  9.7344e-04,
   