# RNN 的原理及其手写复现

+ 视频：[29、PyTorch RNN的原理及其手写复现](https://www.bilibili.com/video/BV13i4y1R7jB/)
+ [PyTorch RNN 官方文档](https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)

![RNN 示意图](../imgs/rnn.png)

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from loguru import logger

## 1. PyTorch 的使用示例

### 1.1 单向、单层 RNN

In [20]:
# Toy dimensions shared by the examples in this notebook.
BATCH_SIZE, SEQ_LEN = 2, 4  # batch size, input sequence length
INPUT_FEATURE_SIZE, HIDDEN_SIZE = 5, 3  # input feature size, hidden-state size

# Single-layer, unidirectional RNN that consumes batch-first input.
single_rnn = nn.RNN(
    input_size=INPUT_FEATURE_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=1,
    batch_first=True,
)

nn.RNN 的输出：

+ `output`：各个时刻的 hidden state，shape 为 [B, seq_len, num_directions * hidden]
    + 当使用双向时，在 output 最后一维的 num_directions * hidden 元素中，前 hidden 个属于前向 RNN 的结果，后 hidden 个属于反向 RNN 的结果
+ `final_state`：每一层、每个方向在各自最后一个时间步的 hidden state，shape 为 [num_directions * num_layers, B, hidden]（该形状不受 `batch_first` 影响）；当只有一层单向 RNN 时，它也就是 `output` 在最后一个时刻的结果

In [21]:
input = torch.randn(BATCH_SIZE, SEQ_LEN, INPUT_FEATURE_SIZE)  # random batch-first input: [B, seq_len, feature_size]
output, final_state = single_rnn(input)  # NOTE: `input` shadows the builtin; the name is kept because later cells reuse it
logger.info(f'output:\n{output}')  # [B, seq_len, num_directions * hidden]
logger.info(f'final_state:\n{final_state}')  # [num_directions * num_layers, B, hidden]

2023-01-24 17:29:39.594 | INFO     | __main__:<module>:3 - output:
tensor([[[ 0.7509,  0.5184, -0.6673],
         [ 0.7321, -0.1399,  0.8266],
         [ 0.7061,  0.6471, -0.2703],
         [-0.6831,  0.5324, -0.1234]],

        [[ 0.4595, -0.2751, -0.8516],
         [ 0.0469,  0.2887,  0.7436],
         [ 0.3225,  0.2615,  0.6178],
         [ 0.0899,  0.9116,  0.8549]]], grad_fn=<TransposeBackward1>)
2023-01-24 17:29:39.596 | INFO     | __main__:<module>:4 - h_n:
tensor([[[-0.6831,  0.5324, -0.1234],
         [ 0.0899,  0.9116,  0.8549]]], grad_fn=<StackBackward0>)


从上面的结果中可以看出，对于单层单向 RNN，`output` 在最后一个时刻的值就等于最终的 hidden state（即 `final_state`）。

### 1.2 双向、单层 RNN

主要是在实例化 `nn.RNN` 时设置 `bidirectional=True`。

In [22]:
bi_rnn = nn.RNN(
    input_size=INPUT_FEATURE_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=1,
    batch_first=True,
    bidirectional=True,  # adds a second, reverse-direction set of parameters
)

In [23]:
output, final_state = bi_rnn(input)  # no initial hidden state is passed here
logger.info(f'output:\n{output}')  # last dim: forward-direction values first, then reverse-direction values
logger.info(f'output shape: {output.shape}')  # [B, seq_len, num_directions * hidden]
logger.info(f'final_state:\n{final_state}')  # index 0: forward direction, index 1: reverse direction
logger.info(f'final_state shape: {final_state.shape}')  # [num_directions * num_layers, B, hidden]

2023-01-24 17:29:40.920 | INFO     | __main__:<module>:2 - output:
tensor([[[ 0.6008,  0.5499, -0.3351, -0.9113, -0.9562, -0.5820],
         [ 0.5309,  0.5310, -0.5073, -0.1493, -0.9351, -0.7374],
         [ 0.5353,  0.5634, -0.1619, -0.7628, -0.9164,  0.3288],
         [ 0.9418,  0.5155, -0.0323,  0.9042, -0.3992,  0.4058]],

        [[ 0.9219, -0.2853, -0.4261, -0.6801, -0.6480, -0.0786],
         [-0.3604,  0.9525,  0.2256, -0.6828, -0.8556, -0.4593],
         [ 0.5215,  0.5452, -0.0941,  0.8262, -0.7671, -0.5255],
         [-0.0206,  0.9360, -0.9212, -0.0835,  0.2962, -0.9106]]],
       grad_fn=<TransposeBackward1>)
2023-01-24 17:29:40.922 | INFO     | __main__:<module>:3 - output shape: torch.Size([2, 4, 6])
2023-01-24 17:29:40.925 | INFO     | __main__:<module>:4 - h_n:
tensor([[[ 0.9418,  0.5155, -0.0323],
         [-0.0206,  0.9360, -0.9212]],

        [[-0.9113, -0.9562, -0.5820],
         [-0.6801, -0.6480, -0.0786]]], grad_fn=<StackBackward0>)
2023-01-24 17:29:40.926 | INFO 

## 2. 单层单向 RNN 的逐行实现

$h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})$

In [24]:
# Inspect the parameters PyTorch created for the unidirectional RNN:
for name, param in single_rnn.named_parameters():
    print(name, param)

weight_ih_l0 Parameter containing:
tensor([[-0.2482, -0.0345,  0.1997, -0.2403,  0.4789],
        [-0.0277,  0.1436, -0.5319, -0.0487,  0.0538],
        [-0.5599,  0.5040, -0.0793,  0.3747,  0.0448]], requires_grad=True)
weight_hh_l0 Parameter containing:
tensor([[-0.3301,  0.0958, -0.0544],
        [ 0.3288, -0.3701,  0.3510],
        [ 0.4919,  0.2924, -0.4417]], requires_grad=True)
bias_ih_l0 Parameter containing:
tensor([ 0.4063,  0.1175, -0.0897], requires_grad=True)
bias_hh_l0 Parameter containing:
tensor([-0.2630,  0.1842, -0.0850], requires_grad=True)


### 2.1 逐行实现 RNN

In [25]:
# 逐行实现 RNN 的前向传播过程
def rnn_forward(
    input: Tensor,  # [B, T, input_size]
    weight_ih: Tensor,  # [hidden, input_size]
    weight_hh: Tensor,  # [hidden, hidden]
    bias_ih: Tensor,  # [hidden]
    bias_hh: Tensor,  # [hidden]
    h_prev: Tensor,  # hidden state before the first time step, [B, hidden]
):
    """Run a single-layer, unidirectional tanh RNN over a batch-first sequence.

    Every time step computes
    ``h_t = tanh(x_t @ W_ih^T + b_ih + h_{t-1} @ W_hh^T + b_hh)``.

    Args:
        input: Input sequence, shape [B, T, input_size].
        weight_ih: Input-to-hidden weight, shape [hidden, input_size].
        weight_hh: Hidden-to-hidden weight, shape [hidden, hidden].
        bias_ih: Input-to-hidden bias, shape [hidden].
        bias_hh: Hidden-to-hidden bias, shape [hidden].
        h_prev: Initial hidden state, shape [B, hidden].

    Returns:
        A tuple ``(h_out, h_n)``. ``h_out`` has shape [B, T, hidden] and holds
        the hidden state of every time step. ``h_n`` has shape [1, B, hidden]
        and is the hidden state after the last step (the initial state itself
        when ``T == 0``).
    """
    bs, T, _ = input.shape
    h_dim = weight_ih.shape[0]  # number of rows of W_ih is the hidden size

    # Allocate the output with the input's dtype and device so that
    # non-default dtypes (e.g. float64) are not silently downcast and
    # non-CPU inputs do not hit a device mismatch.
    h_out = input.new_zeros(bs, T, h_dim)
    for t in range(T):
        x_t = input[:, t, :]  # features of the current time step, [B, input_size]
        # A broadcasting matmul against the transposed weights is equivalent
        # to tiling the weights per sample and calling bmm, without the copies.
        h_prev = torch.tanh(
            x_t @ weight_ih.T + bias_ih + h_prev @ weight_hh.T + bias_hh
        )  # [B, hidden]
        h_out[:, t, :] = h_prev

    return h_out, h_prev.unsqueeze(0)

### 2.2 结果验证

通过与 PyTorch 官方实现的运算结果进行对比，验证 RNN 的实现

In [26]:
# Check rnn_forward against PyTorch's nn.RNN using the same weights and input
h_prev = torch.zeros(BATCH_SIZE, HIDDEN_SIZE)  # initial hidden state, [B, hidden]
output1, final_state1 = rnn_forward(
    input, 
    single_rnn.weight_ih_l0,
    single_rnn.weight_hh_l0,
    single_rnn.bias_ih_l0,
    single_rnn.bias_hh_l0,
    h_prev
)
output2, final_state2 = single_rnn(input, h_prev.unsqueeze(0))  # nn.RNN is given the initial state as [1, B, hidden]
logger.info(f'自己实现的 RNN 的 output:\n{output1}')
logger.info(f'PyTorch 的 RNN 的 output:\n{output2}')
logger.info(f'自己实现的 RNN 的 final_state:\n{final_state1}')
logger.info(f'PyTorch 的 RNN 的 final_state:\n{final_state2}')

2023-01-24 17:29:43.528 | INFO     | __main__:<module>:12 - 自己实现的 RNN 的 output:
tensor([[[ 0.7509,  0.5184, -0.6673],
         [ 0.7321, -0.1399,  0.8266],
         [ 0.7061,  0.6471, -0.2703],
         [-0.6831,  0.5324, -0.1234]],

        [[ 0.4595, -0.2751, -0.8516],
         [ 0.0469,  0.2887,  0.7436],
         [ 0.3225,  0.2615,  0.6178],
         [ 0.0899,  0.9116,  0.8549]]], grad_fn=<CopySlices>)
2023-01-24 17:29:43.532 | INFO     | __main__:<module>:13 - PyTorch 的 RNN 的 output:
tensor([[[ 0.7509,  0.5184, -0.6673],
         [ 0.7321, -0.1399,  0.8266],
         [ 0.7061,  0.6471, -0.2703],
         [-0.6831,  0.5324, -0.1234]],

        [[ 0.4595, -0.2751, -0.8516],
         [ 0.0469,  0.2887,  0.7436],
         [ 0.3225,  0.2615,  0.6178],
         [ 0.0899,  0.9116,  0.8549]]], grad_fn=<TransposeBackward1>)
2023-01-24 17:29:43.535 | INFO     | __main__:<module>:14 - 自己实现的 RNN 的 final_state:
tensor([[[-0.6831,  0.5324, -0.1234],
         [ 0.0899,  0.9116,  0.8549]]], grad_

## 3. 单层双向 RNN 的逐行实现

### 3.1 逐行实现双向 RNN

In [27]:
def bidirectional_rnn_forward(
    input: Tensor,  # [B, T, input_size]
    weight_ih: Tensor,
    weight_hh: Tensor,
    bias_ih: Tensor,
    bias_hh: Tensor,
    h_prev: Tensor,  # initial state of the forward direction, [B, hidden]
    weihgt_ih_reverse: Tensor,  # (sic) misspelling kept so existing callers keep working
    weight_hh_reverse: Tensor,
    bias_ih_reverse: Tensor,
    bias_hh_reverse: Tensor,
    h_prev_reverse: Tensor  # initial state of the reverse direction, [B, hidden]
):
    """Run a single-layer bidirectional tanh RNN over a batch-first sequence.

    The forward direction runs ``rnn_forward`` on ``input`` as given; the
    reverse direction runs ``rnn_forward`` with its own parameters on the
    time-reversed input.

    Args:
        input: Input sequence, shape [B, T, input_size].
        weight_ih, weight_hh, bias_ih, bias_hh: Forward-direction parameters.
        h_prev: Forward-direction initial hidden state, shape [B, hidden].
        weihgt_ih_reverse, weight_hh_reverse, bias_ih_reverse,
            bias_hh_reverse: Reverse-direction parameters.
        h_prev_reverse: Reverse-direction initial hidden state, shape
            [B, hidden].

    Returns:
        A tuple ``(h_out, h_n)``. ``h_out`` has shape [B, T, 2 * hidden]; along
        the last dimension the first ``hidden`` entries come from the forward
        direction and the remaining ones from the reverse direction. ``h_n``
        has shape [2, B, hidden]: index 0 is the final forward state, index 1
        the final reverse state.
    """
    forward_output, forward_h_n = rnn_forward(
        input, weight_ih, weight_hh, bias_ih, bias_hh, h_prev
    )

    # The reverse direction is an ordinary RNN over the sequence flipped along
    # the time dimension (dim=1).
    backward_output, backward_h_n = rnn_forward(
        input.flip([1]),
        weihgt_ih_reverse,
        weight_hh_reverse,
        bias_ih_reverse,
        bias_hh_reverse,
        h_prev_reverse,
    )

    # Flip the reverse outputs back so that both directions are aligned on the
    # original time axis, then concatenate along the feature dimension.
    # Concatenating (instead of writing into a torch.zeros buffer) keeps the
    # dtype and device of the computed tensors.
    h_out = torch.cat([forward_output, backward_output.flip([1])], dim=-1)

    # rnn_forward already returns each direction's final state as
    # [1, B, hidden]; stacking them on dim 0 gives [num_directions, B, hidden].
    h_n = torch.cat([forward_h_n, backward_h_n], dim=0)
    return h_out, h_n

### 3.2 结果验证

In [28]:
# First, inspect the parameters PyTorch created for the bidirectional RNN
for name, param in bi_rnn.named_parameters():
    print(name, param)

weight_ih_l0 Parameter containing:
tensor([[ 0.5102, -0.3869, -0.0193, -0.0843, -0.0105],
        [-0.2907,  0.3793, -0.2157,  0.0909, -0.1167],
        [ 0.2057,  0.1608,  0.5355, -0.1843, -0.1846]], requires_grad=True)
weight_hh_l0 Parameter containing:
tensor([[-0.2516,  0.4806, -0.0364],
        [ 0.5271, -0.3576,  0.0826],
        [ 0.4867,  0.0360,  0.4172]], requires_grad=True)
bias_ih_l0 Parameter containing:
tensor([ 0.4573,  0.2591, -0.2537], requires_grad=True)
bias_hh_l0 Parameter containing:
tensor([ 0.3143,  0.4186, -0.1830], requires_grad=True)
weight_ih_l0_reverse Parameter containing:
tensor([[ 0.3595,  0.4310,  0.0465,  0.3054,  0.0413],
        [ 0.0251, -0.4296, -0.4924,  0.4548, -0.1660],
        [ 0.4237, -0.2697,  0.3554, -0.1753, -0.0984]], requires_grad=True)
weight_hh_l0_reverse Parameter containing:
tensor([[-0.2429,  0.4347, -0.0359],
        [ 0.4497,  0.5165, -0.2733],
        [ 0.4508,  0.4567,  0.3245]], requires_grad=True)
bias_ih_l0_reverse Parameter c

In [29]:
NUM_DIRECTIONS = 2  # bidirectional
h_prev = torch.zeros(NUM_DIRECTIONS, BATCH_SIZE, HIDDEN_SIZE)  # zero initial states, one [B, hidden] slice per direction
output1, final_state1 = bidirectional_rnn_forward(
    input,
    bi_rnn.weight_ih_l0,
    bi_rnn.weight_hh_l0,
    bi_rnn.bias_ih_l0,
    bi_rnn.bias_hh_l0,
    h_prev[0],  # initial state for the forward direction
    bi_rnn.weight_ih_l0_reverse,
    bi_rnn.weight_hh_l0_reverse,
    bi_rnn.bias_ih_l0_reverse,
    bi_rnn.bias_hh_l0_reverse,
    h_prev[1]  # initial state for the reverse direction
)
output2, final_state2 = bi_rnn(input, h_prev)  # reference result from PyTorch with the same initial states

logger.info(f'自己实现的 RNN 的 output:\n{output1}')
logger.info(f'PyTorch 的 RNN 的 output:\n{output2}')
logger.info(f'自己实现的 RNN 的 final_state:\n{final_state1}')
logger.info(f'PyTorch 的 RNN 的 final_state:\n{final_state2}')

2023-01-24 17:29:45.431 | INFO     | __main__:<module>:18 - 自己实现的 RNN 的 output:
tensor([[[ 0.6008,  0.5499, -0.3351, -0.9113, -0.9562, -0.5820],
         [ 0.5309,  0.5310, -0.5073, -0.1493, -0.9351, -0.7374],
         [ 0.5353,  0.5634, -0.1619, -0.7628, -0.9164,  0.3288],
         [ 0.9418,  0.5155, -0.0323,  0.9042, -0.3992,  0.4058]],

        [[ 0.9219, -0.2853, -0.4261, -0.6801, -0.6480, -0.0786],
         [-0.3604,  0.9525,  0.2256, -0.6828, -0.8556, -0.4593],
         [ 0.5215,  0.5452, -0.0941,  0.8262, -0.7671, -0.5255],
         [-0.0206,  0.9360, -0.9212, -0.0835,  0.2962, -0.9106]]],
       grad_fn=<CopySlices>)
2023-01-24 17:29:45.434 | INFO     | __main__:<module>:19 - PyTorch 的 RNN 的 output:
tensor([[[ 0.6008,  0.5499, -0.3351, -0.9113, -0.9562, -0.5820],
         [ 0.5309,  0.5310, -0.5073, -0.1493, -0.9351, -0.7374],
         [ 0.5353,  0.5634, -0.1619, -0.7628, -0.9164,  0.3288],
         [ 0.9418,  0.5155, -0.0323,  0.9042, -0.3992,  0.4058]],

        [[ 0.9219, -0