# 循环神经网络

## 1. 数据准备

将训练语料转换为字典形式

In [65]:
%matplotlib inline
import zipfile
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
def load_data_jay_lyrics(num_sample=10000):
    """
    加载周杰伦歌词数据集
    """
    # 读取数据集
    with open('../dataset/jaychou_lyrics.txt', encoding='utf-8') as f:
        corpus_chars = f.read()
    # 把换行符替换为空格
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[:num_sample]
    # 建立字符索引
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    # 将训练集中每个字符转换为索引
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    # 返回索引后的前num_sample个字符的文本，字符到索引的映射，索引到字符的映射，字符表大小
    return corpus_indices, char_to_idx, idx_to_char, vocab_size

In [3]:
corpus_indices, char_to_idx, idx_to_char, vocab_size = load_data_jay_lyrics()

In [4]:
vocab_size

1027

In [5]:
len(corpus_indices)   # 10000

10000

In [6]:
char_to_idx['风']

290

In [7]:
idx_to_char[371]

'翻'

## 2. 采样

### 2.1 随机采样

下⾯的代码每次从数据⾥随机采样⼀个小批量。其中批量⼤小batch_size指每个小批量的样本数，num_steps为每个样本所包含的时间步数。在随机采样中，每个样本是原始序列上任意截取的⼀段序列。相邻的两个随机小批量在原始序列上的位置不⼀定相毗邻。因此，我们⽆法⽤⼀个小批量最终时间步的隐藏状态来初始化下⼀个小批量的隐藏状态。在训练模型时，每次随机采样前都需要重新初始化隐藏状态。

In [8]:
def data_iter_random(corpus_indices, batch_size, num_steps):
    '''
    corpus_indices: 词典按先后次序的索引
    batch_size: 每个批次的样本容量
    num_steps: 每个样本的长度
    '''
    num_examples = (len(corpus_indices) - 1) // num_steps  # 可取的样本数量
    epoch_size = num_examples // batch_size  # 总词汇数量 / (样本长度 * 样本数量)
    example_indices = list(range(num_examples))
    np.random.shuffle(example_indices)  # 打乱索引的顺序，即随机采样
      
    for i in range(epoch_size):
        # 每次读取batch_size个随机样本
        batch_indices = example_indices[i*batch_size: (i + 1)*batch_size ]
        X = [corpus_indices[j*num_steps: (j + 1)*num_steps] for j in batch_indices]
        Y = [corpus_indices[j*num_steps + 1: (j + 1)*num_steps + 1] for j in batch_indices]
        yield torch.IntTensor(X), torch.IntTensor(Y)

In [9]:
my_seq = list(range(30))
i = 0
for X, Y in data_iter_random(my_seq, batch_size=3, num_steps=6):
    print(i)
    print('X:\n', X, '\nY:\n', Y)
    i += 1

0
X:
 tensor([[12, 13, 14, 15, 16, 17],
        [ 6,  7,  8,  9, 10, 11],
        [ 0,  1,  2,  3,  4,  5]], dtype=torch.int32) 
Y:
 tensor([[13, 14, 15, 16, 17, 18],
        [ 7,  8,  9, 10, 11, 12],
        [ 1,  2,  3,  4,  5,  6]], dtype=torch.int32)


### 2.2 相邻采样

除对原始序列做随机采样之外，我们还可以令相邻的两个随机小批量在原始序列上的位置相毗邻。这时候，我们就可以用一个小批量最终时间步的隐藏状态来初始化下一个小批量的隐藏状态，从而使下一个小批量的输出也取决于当前小批量的输入，并如此循环下去。这对实现循环神经网络造成了两方面影响：一方面， 在训练模型时，我们只需在每一个迭代周期开始时初始化隐藏状态；另一方面，当多个相邻小批量通过传递隐藏状态串联起来时，模型参数的梯度计算将依赖所有串联起来的小批量序列。同一迭代周期中，随着迭代次数的增加，梯度的计算开销会越来越大。 为了使模型参数的梯度计算只依赖一次迭代读取的小批量序列，我们可以在每次读取小批量前将隐藏状态从计算图中分离出来。

In [10]:
# 本函数已保存在d2lzh包中方便以后使用
def data_iter_consecutive(corpus_indices, batch_size, num_steps):
    '''
    corpus_indices: 词典按先后次序的索引
    batch_size: 每个批次的样本容量
    num_steps: 每个样本的长度
    '''
    corpus_indices = np.array(corpus_indices)
    data_len = len(corpus_indices)  # 单词个数
    batch_len = data_len // batch_size  # 小批量的数量
    indices = corpus_indices[0: batch_size*batch_len].reshape(batch_size, batch_len)  # 先取总量，再塑形
    epoch_size = (batch_len - 1) // num_steps  # 批量数量
    for i in range(epoch_size):
        X = indices[:, i * num_steps: (i + 1) * num_steps]
        Y = indices[:, i * num_steps + 1: (i + 1) * num_steps + 1]
        yield torch.IntTensor(X), torch.IntTensor(Y)

In [11]:
my_seq = list(range(30))
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X:\n', X, '\nY:\n', Y)

X:
 tensor([[ 0,  1,  2,  3,  4,  5],
        [15, 16, 17, 18, 19, 20]], dtype=torch.int32) 
Y:
 tensor([[ 1,  2,  3,  4,  5,  6],
        [16, 17, 18, 19, 20, 21]], dtype=torch.int32)
X:
 tensor([[ 6,  7,  8,  9, 10, 11],
        [21, 22, 23, 24, 25, 26]], dtype=torch.int32) 
Y:
 tensor([[ 7,  8,  9, 10, 11, 12],
        [22, 23, 24, 25, 26, 27]], dtype=torch.int32)


## 2. `one_hot`编码

对语料中的每个不同单词进行`one_hot`编码。编码长度为字典长度，单词的索引对应的位置的编码值为1，其余为0。

In [51]:
def one_hot(word_indices, vocab_size):
    '''
    word_indices: 需要编码的索引, torch.IntTensor
    vocab_size: 词典大小, scalar
    '''
    shape = list(word_indices.shape) + [vocab_size]
    res = torch.zeros(size=shape)
    if len(shape) == 2:
        res[range(shape[0]), word_indices] = 1
    elif len(shape) == 3:
        for i in range(shape[0]):
            for j in range(shape[1]):
                res[i, j, word_indices[i, j]] = 1
    else:
        print('X超过2维!')

    return res

In [13]:
x = torch.IntTensor([[1, 2],[2, 3],[3, 4]])
one_hot(x, 10)

tensor([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]]])

每次采样的小批量的形状是(批量大小, 时间步数)，以下函数将字符下标转换成字符的`one-hot`编码。

In [14]:
def to_onehot(X, size):
    """
    X: n*t, n为批量大小，t为时间步长
    size: 词典大小
    返回包含t个矩阵的列表, x: (批量大小, 词典大小)
    """
    return [one_hot(x, size) for x in X.t()]

In [15]:
X = torch.arange(10).reshape(2, 5)
inputs = to_onehot(X, vocab_size)

In [16]:
len(inputs), inputs[0].shape  # 时间步, (小批量大小, 输入数量)

(5, torch.Size([2, 1027]))

## 3. 模型实现

### 3.1 初始化模型参数

In [17]:
def get_params(num_inputs, num_hiddens, num_outputs):
    '''
    num_inputs: 输入层结点数量
    num_hiddens: 隐藏层结点数量
    num_outputs: 输出层结点数量
    '''
    # 隐藏层参数
    W_xh = torch.randn(num_inputs, num_hiddens) * 0.01
    W_hh = torch.randn(num_hiddens, num_hiddens) * 0.01
    b_h = torch.zeros(num_hiddens)

    # 输出层参数
    W_ho = torch.randn(num_hiddens, num_outputs) * 0.01
    b_o = torch.zeros(num_outputs)

    # 附上梯度
    W_xh.requires_grad_(True)
    W_hh.requires_grad_(True)
    b_h.requires_grad_(True)
    W_ho.requires_grad_(True)
    b_o.requires_grad_(True)
    return W_xh, W_hh, b_h, W_ho, b_o

### 3.2 初始化隐藏层的值

In [18]:
def init_rnn_hidden_state(batch_size, num_hiddens):
    '''
    batch_size: 每个批量的样本量
    num_hiddens: 隐藏层结点数量
    '''
    return torch.zeros(batch_size, num_hiddens),

### 3.3 构建循环神经网络

In [19]:
def rnn(inputs, state, params):
    '''
    inputs: 各时间步(batch_size, vocab_size)构成的张量
    state: 初始隐藏层结点状态(batch_size, num_hiddens)
    params: 输入-隐藏，隐藏-隐藏，隐藏-输出 参数
    计算len(inputs)时间步， 初始状态为state，以及参数为params下的小批量样本对应的输出
    '''
    W_xh, W_hh, b_h, W_ho, b_o = params
    H, = state
    outputs = []
    for X in inputs: # X: batch_size * vocab_size
        H = torch.tanh(X@W_xh + H@W_hh + b_h)  # batch_size * hidden_size
        Y = H@W_ho + b_o  # batch_size * vocab_size
        outputs.append(Y)

    return outputs, (H, )  # 相邻采样时, H作为下一个批量的初始状态

In [20]:
X = torch.arange(20).reshape(4, 5)  # 一个批次，样本量为4，时间步为5
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size  # 输入层数量，隐藏层数量，输出层数量
state = init_rnn_hidden_state(X.shape[0], num_hiddens)
inputs = to_onehot(X, vocab_size)
params = get_params(num_inputs, num_hiddens, num_outputs)
outputs, state_new = rnn(inputs, state, params)

In [46]:
X.shape

torch.Size([4, 5])

In [21]:
len(outputs), outputs[0].shape, state_new[0].shape

(5, torch.Size([4, 1027]), torch.Size([4, 256]))

### 3.4 预测前n个字符

In [49]:
# This function is saved in the d2l package for future use
def predict_rnn(prefix, num_chars, rnn, params, init_rnn_hidden_state, num_hiddens, vocab_size, idx_to_char, char_to_idx):
    """
    prefix: 前n个字符
    num_chars: 往前预测的字符数
    rnn: rnn模型
    params: 预测使用的参数
    init_rnn_state: 初始化模型参数
    num_hiddens: 隐藏层结点数量
    vocab_size: 字典中的字符个数
    idx_to_char: {索引: 字符}
    char_to_idx: {字符: 索引}
    """
    state = init_rnn_hidden_state(1, num_hiddens)
    output = [char_to_idx[c] for c in prefix]  # 初始化输出为prefix
    
    for t in range(num_chars - 1):
        X = to_onehot(torch.LongTensor([[output[-1]]]), vocab_size)
        Y, state = rnn(X, state, params)
        output.append(int(Y[0].argmax(dim=1).item()))
            
    return ''.join([idx_to_char[i] for i in output])

In [56]:
params = get_params(num_inputs, num_hiddens, num_outputs)
prefix = '回家'
predict_rnn(prefix, 30, rnn, params, init_rnn_hidden_state, num_hiddens, vocab_size, idx_to_char, char_to_idx)

'回家婆景样忠沙走病掉抬如发桌怀早壶春擅醉样眼o猎蒙残婆景样忠沙'

### 3.5 裁减梯度

In [None]:
def grad_clipping(params, theta):
    norm = torch.tensor(0.0)
    for param in params:
        norm += torch.norm(param.grad, 2)
        
    if norm > theta:
        for param in params:
            param.grad.data.mul_(theta / norm)

In [62]:
torch.norm(torch.FloatTensor([1,2,3]), 2)

tensor(3.7417)

In [64]:
torch.tensor(0.0)

tensor(0.)

### 3.6 训练模型

In [None]:
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens, corpus_indices, vocab_size, idx_to_char, char_to_idx, is_random_iter,
                          num_epochs, num_steps, lr, clipping_theta, batch_size, prefixes):
    if is_random_iter:
        data_iter_fn = data_iter_random
    else:
        data_iter_fn = data_iter_consecutive
        
    params = get_params()
    loss =  nn.CrossEntropyLoss()  # 交叉熵损失函数
    start = time.perf_counter()
    for epoch in range(num_epochs):
        if not is_random_iter:  # 初始化相邻抽样的隐藏层状态
            state = init_rnn_state(batch_size, num_hiddens)
            
        l_sum, n = 0.0, 0
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps)  # 抽样字符的下标
        for X, Y in data_iter:  # 取出小批量
            if is_random_iter:  # 如果是随机采样，每个小批量初始化隐藏层状态
                state = init_rnn_state(batch_size, num_hiddens)
            else:  # 如果是相邻采样，则每个小批量取上一个训练最后时刻的隐藏层状态，但s需要从上一个周期计算图中解耦
                for s in state:
                    s.detach_()
                    
            inputs = to_onehot(X, vocab_size)  # 转换成one-hot向量
            (outputs, state) = rnn(inputs, state, params)  # outputs是num_steps个形状为(batch_size, len(vocab))的矩阵
            outputs = torch.cat(outputs, dim=0)  # 将num_steps个矩阵合并成一个矩阵(num_steps * batch_size, len(vocab))
            y = Y.t().reshape((-1,))  # Y的原形状为(batch_size, num_steps), 将其转换为和output一致的数组y
            l = loss(outputs, y.long()).mean()  # 通过交叉熵度量分类错误
            l.backward()  # 反向传播，自动计算梯度
            with torch.no_grad():
                grad_clipping(params, clipping_theta, ctx)  # 裁剪梯度
                for param in params:  # 小批量梯度下降
                    param.data.sub_(lr*param.grad/batch_size)
                    param.grad.data.zero_()
                
            l_sum += l.item() * y.numel()  # 总损失
            n += y.numel()  # 已训练样本数量
            
        if (epoch + 1) % 50 == 0:  # 每50批次运行一次
            print('epoch %d, perplexity %f, time %.2f sec' % (epoch + 1, math.exp(l_sum / n), time.perf_counter() - start))
            start = time.perf_counter()  # 重置开始时间
            
        if (epoch + 1) % 100 == 0:  # 预测未来的50个字符
            for prefix in prefixes:
                print(' -',  predict_rnn(prefix, 50, rnn, params, init_rnn_hidden_state, num_hiddens, vocab_size, idx_to_char, char_to_idx))