# RNN 的原理及其手写复现

+ 视频：[29、PyTorch RNN的原理及其手写复现](https://www.bilibili.com/video/BV13i4y1R7jB/)
+ 视频：[30、PyTorch LSTM和LSTMP的原理及其手写复现](https://www.bilibili.com/video/BV1zq4y1m7aH/)
+ [Gated RNN | yubinCloud](https://yubincloud.github.io/notebook/pages/nlp/gated-rnn/)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from loguru import logger
from typing import Tuple

## 1. PyTorch 的使用示例

+ [PyTorch RNN 官方文档](https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)

### 1.1 单向、单层 RNN

![RNN 示例](../imgs/rnn.png)


In [2]:
# Toy dimensions shared by the RNN examples in sections 1-3.
BATCH_SIZE = 2
SEQ_LEN = 4  # length of the input sequence
INPUT_FEATURE_SIZE = 5  # number of features per time step
HIDDEN_SIZE = 3


# Single-layer, unidirectional RNN; batch_first=True puts the batch dimension first.
single_rnn = nn.RNN(INPUT_FEATURE_SIZE, HIDDEN_SIZE, num_layers=1, batch_first=True)

nn.RNN 的输出：

+ `output`：各个时刻的 hidden state，shape 为 [B, seq_len, num_directions * hidden]
    + 当使用双向时，在 output 最后一维的 num_directions * hidden 元素中，前 hidden 个属于前向 RNN 的结果，后 hidden 个属于反向 RNN 的结果
+ `final_state`：最后一个时刻的最终 hidden state，shape 为 [num_directions * num_layers, B, hidden]；当只有一层单向 RNN 时，它也就是 `output` 在最后一个时刻的结果

对于 many-to-many 的 task，往往是使用 output，比如词性标注任务；对于 many-to-one 的 task，往往是使用 final_state，比如文本分类任务。

In [3]:
# Run the single-layer RNN on a random batch and log both of its outputs.
input = torch.randn(BATCH_SIZE, SEQ_LEN, INPUT_FEATURE_SIZE)  # [batch_size, seq_len, feature_size]
output, final_state = single_rnn(input)
logger.info(f'output:\n{output}')  # [B, seq_len, num_directions * hidden]
logger.info(f'final_state:\n{final_state}')  # [num_directions * num_layers, B, hidden]

2023-01-24 22:32:26.784 | INFO     | __main__:<module>:3 - output:
tensor([[[ 0.5458, -0.3049, -0.2295],
         [ 0.7414, -0.5016,  0.9513],
         [ 0.2096,  0.5185, -0.2949],
         [ 0.5182,  0.0200,  0.2915]],

        [[ 0.2186, -0.3699, -0.4636],
         [ 0.4902, -0.7225,  0.4205],
         [ 0.4375,  0.0294,  0.8530],
         [ 0.7765,  0.1054, -0.9559]]], grad_fn=<TransposeBackward1>)
2023-01-24 22:32:26.787 | INFO     | __main__:<module>:4 - final_state:
tensor([[[ 0.5182,  0.0200,  0.2915],
         [ 0.7765,  0.1054, -0.9559]]], grad_fn=<StackBackward0>)


从上面的结果中可以看出，简单 RNN 的最后时刻 output 就等于最终的 hidden state。

### 1.2 双向、单层 RNN

主要是在实例化 `nn.RNN` 时设置 `bidirectional=True`。

In [4]:
# Same sizes as single_rnn, but bidirectional=True adds a reverse-direction RNN.
bi_rnn = nn.RNN(INPUT_FEATURE_SIZE, HIDDEN_SIZE, num_layers=1, batch_first=True, bidirectional=True)

In [5]:
# Run the bidirectional RNN on the same input and log values and shapes.
output, final_state = bi_rnn(input)
logger.info(f'output:\n{output}')
logger.info(f'output shape: {output.shape}')  # [B, seq_len, num_directions * hidden]
logger.info(f'final_state:\n{final_state}')
logger.info(f'final_state shape: {final_state.shape}')  # [num_directions * num_layers, B, hidden]

2023-01-24 22:32:28.516 | INFO     | __main__:<module>:2 - output:
tensor([[[-6.8374e-03, -4.0665e-01,  4.9632e-01, -7.8123e-01,  3.5904e-01,
          -3.2676e-01],
         [-9.6864e-02,  4.9356e-01,  9.1294e-01, -8.2427e-01, -2.3916e-01,
           4.9574e-01],
         [-1.6656e-01, -7.5501e-01,  3.4672e-04, -4.8054e-01,  4.9172e-01,
          -4.0495e-01],
         [ 7.7775e-01, -1.6314e-02,  2.2688e-01, -8.2076e-01, -5.5325e-01,
          -6.6492e-01]],

        [[-1.6269e-01, -9.5102e-01,  4.3767e-01, -9.4389e-01, -7.0420e-01,
          -4.5849e-01],
         [ 9.1083e-01,  9.5602e-02,  6.9448e-01, -7.2612e-01, -6.6763e-01,
           4.7001e-01],
         [-7.7787e-01, -9.8655e-03,  6.3926e-01,  3.7862e-01,  2.8350e-01,
          -5.2208e-01],
         [-1.5028e-01, -8.3701e-01,  7.5149e-01,  2.6977e-01,  8.1088e-01,
           5.1595e-01]]], grad_fn=<TransposeBackward1>)
2023-01-24 22:32:28.518 | INFO     | __main__:<module>:3 - output shape: torch.Size([2, 4, 6])
2023-01-24 2

## 2. 单层单向 RNN 的逐行实现

$h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})$

In [6]:
# Inspect the parameters PyTorch creates for the single-layer RNN:
# weight_ih_l0, weight_hh_l0, bias_ih_l0 and bias_hh_l0.
for k,v in single_rnn.named_parameters():
    print(k, v)

weight_ih_l0 Parameter containing:
tensor([[-0.0663,  0.2330,  0.0046,  0.3284,  0.1327],
        [-0.0404, -0.0758,  0.1053, -0.2830,  0.0785],
        [ 0.0597, -0.4974, -0.3926,  0.1548,  0.4810]], requires_grad=True)
weight_hh_l0 Parameter containing:
tensor([[ 0.4251,  0.3280, -0.1945],
        [ 0.5345,  0.4590,  0.5039],
        [ 0.2841, -0.5721, -0.3805]], requires_grad=True)
bias_ih_l0 Parameter containing:
tensor([ 0.2347, -0.0224, -0.1350], requires_grad=True)
bias_hh_l0 Parameter containing:
tensor([ 0.0991, -0.1741,  0.2732], requires_grad=True)


### 2.1 逐行实现 RNN

In [7]:
# 逐行实现 RNN 的前向传播过程
def rnn_forward(
    input: Tensor,  # [B, T, input_size]
    weight_ih: Tensor,  # [hidden, input_size]
    weight_hh: Tensor,  # [hidden, hidden]
    bias_ih: Tensor,  # [hidden]
    bias_hh: Tensor,  # [hidden]
    h_prev: Tensor,  # hidden state before the first step, [B, hidden]
) -> Tuple[Tensor, Tensor]:
    """Run a single-layer, unidirectional tanh RNN over a batch-first sequence.

    Each step computes
    ``h_t = tanh(x_t @ W_ih^T + b_ih + h_{t-1} @ W_hh^T + b_hh)``.

    Args:
        input: Input sequence, ``[B, T, input_size]``.
        weight_ih: Input-to-hidden weight, ``[hidden, input_size]``.
        weight_hh: Hidden-to-hidden weight, ``[hidden, hidden]``.
        bias_ih: Input-to-hidden bias, ``[hidden]``.
        bias_hh: Hidden-to-hidden bias, ``[hidden]``.
        h_prev: Initial hidden state, ``[B, hidden]``.

    Returns:
        A tuple ``(h_out, h_n)``: ``h_out`` is ``[B, T, hidden]`` and holds the
        hidden state of every step; ``h_n`` is ``[1, B, hidden]`` and holds the
        hidden state after the last step (``h_prev`` itself when ``T == 0``).
    """
    bs, T, _ = input.shape
    h_dim = weight_ih.shape[0]  # number of rows of W_ih is the hidden size

    # Allocate with the input's dtype/device so that e.g. float64 or GPU inputs
    # are not silently cast/copied into a default float32 CPU buffer.
    h_out = input.new_zeros(bs, T, h_dim)

    # Loop-invariant pieces are prepared once instead of on every step.
    w_ih_t = weight_ih.t()  # [input_size, hidden]
    w_hh_t = weight_hh.t()  # [hidden, hidden]
    bias = bias_ih + bias_hh  # [hidden]

    for t in range(T):
        x = input[:, t, :]  # features of the current step, [B, input_size]
        h_prev = torch.tanh(x @ w_ih_t + h_prev @ w_hh_t + bias)  # [B, hidden]
        h_out[:, t, :] = h_prev

    return h_out, h_prev.unsqueeze(0)

### 2.2 结果验证

通过与 PyTorch 官方实现的运算结果进行对比，验证 RNN 的实现

In [8]:
# Check rnn_forward against nn.RNN using the same weights, input and initial state.
h_prev = torch.zeros(BATCH_SIZE, HIDDEN_SIZE)  # initial hidden state
output1, final_state1 = rnn_forward(
    input, 
    single_rnn.weight_ih_l0,
    single_rnn.weight_hh_l0,
    single_rnn.bias_ih_l0,
    single_rnn.bias_hh_l0,
    h_prev
)
# nn.RNN takes the initial state with a leading dimension, hence unsqueeze(0).
output2, final_state2 = single_rnn(input, h_prev.unsqueeze(0))
logger.info(f'自己实现的 RNN 的 output:\n{output1}')
logger.info(f'PyTorch 的 RNN 的 output:\n{output2}')
logger.info(f'自己实现的 RNN 的 final_state:\n{final_state1}')
logger.info(f'PyTorch 的 RNN 的 final_state:\n{final_state2}')

2023-01-24 22:32:30.762 | INFO     | __main__:<module>:12 - 自己实现的 RNN 的 output:
tensor([[[ 0.5458, -0.3049, -0.2295],
         [ 0.7414, -0.5016,  0.9513],
         [ 0.2096,  0.5185, -0.2949],
         [ 0.5182,  0.0200,  0.2915]],

        [[ 0.2186, -0.3699, -0.4636],
         [ 0.4902, -0.7225,  0.4205],
         [ 0.4375,  0.0294,  0.8530],
         [ 0.7765,  0.1054, -0.9559]]], grad_fn=<CopySlices>)
2023-01-24 22:32:30.766 | INFO     | __main__:<module>:13 - PyTorch 的 RNN 的 output:
tensor([[[ 0.5458, -0.3049, -0.2295],
         [ 0.7414, -0.5016,  0.9513],
         [ 0.2096,  0.5185, -0.2949],
         [ 0.5182,  0.0200,  0.2915]],

        [[ 0.2186, -0.3699, -0.4636],
         [ 0.4902, -0.7225,  0.4205],
         [ 0.4375,  0.0294,  0.8530],
         [ 0.7765,  0.1054, -0.9559]]], grad_fn=<TransposeBackward1>)
2023-01-24 22:32:30.769 | INFO     | __main__:<module>:14 - 自己实现的 RNN 的 final_state:
tensor([[[ 0.5182,  0.0200,  0.2915],
         [ 0.7765,  0.1054, -0.9559]]], grad_

## 3. 单层双向 RNN 的逐行实现

### 3.1 逐行实现双向 RNN

In [9]:
def bidirectional_rnn_forward(
    input: Tensor,  # [B, T, input_size]
    weight_ih: Tensor,  # [hidden, input_size]
    weight_hh: Tensor,  # [hidden, hidden]
    bias_ih: Tensor,  # [hidden]
    bias_hh: Tensor,  # [hidden]
    h_prev: Tensor,  # initial hidden state of the forward direction, [B, hidden]
    weihgt_ih_reverse: Tensor,  # (sic) misspelled name kept so keyword callers keep working
    weight_hh_reverse: Tensor,
    bias_ih_reverse: Tensor,
    bias_hh_reverse: Tensor,
    h_prev_reverse: Tensor  # initial hidden state of the reverse direction, [B, hidden]
) -> Tuple[Tensor, Tensor]:
    """Run a single-layer bidirectional RNN built from two ``rnn_forward`` passes.

    The forward direction reads ``input`` as given; the reverse direction reads
    it flipped along the time axis with its own ``*_reverse`` parameters.

    Returns:
        A tuple ``(h_out, h_n)``:

        * ``h_out``: ``[B, T, 2 * hidden]``. In the last dimension the first
          ``hidden`` entries come from the forward direction and the last
          ``hidden`` entries from the reverse direction, both aligned to the
          original time order.
        * ``h_n``: ``[2, B, hidden]``. Index 0 is the forward direction's final
          state, index 1 the reverse direction's final state.
    """
    forward_output, forward_h_n = rnn_forward(
        input, weight_ih, weight_hh, bias_ih, bias_hh, h_prev
    )

    reverse_input = input.flip([1])  # flip along the time dimension (dim=1)
    backward_output, backward_h_n = rnn_forward(
        reverse_input,
        weihgt_ih_reverse,
        weight_hh_reverse,
        bias_ih_reverse,
        bias_hh_reverse,
        h_prev_reverse,
    )

    # Flip the reverse outputs back so step t of both halves refers to the same
    # input position; concatenating keeps the dtype/device of the sub-results
    # instead of copying into a freshly allocated default buffer.
    h_out = torch.cat([forward_output, backward_output.flip([1])], dim=-1)

    # rnn_forward already returns each direction's final state as [1, B, hidden];
    # the reverse direction's final state is the one after it has consumed the
    # whole flipped sequence, i.e. the last step of backward_output.
    h_n = torch.cat([forward_h_n, backward_h_n], dim=0)  # [num_directions, B, hidden]
    return h_out, h_n

### 3.2 结果验证

In [10]:
# Inspect the bidirectional RNN's parameters; the reverse direction has its own
# set of weights and biases with a `_reverse` suffix.
for k, v in bi_rnn.named_parameters():
    print(k, v)

weight_ih_l0 Parameter containing:
tensor([[-0.3353, -0.4725,  0.2578,  0.0378, -0.4286],
        [-0.5393, -0.3727,  0.0892,  0.5427,  0.3393],
        [ 0.1132,  0.2473, -0.4087,  0.5298,  0.1923]], requires_grad=True)
weight_hh_l0 Parameter containing:
tensor([[ 0.0203, -0.4756, -0.1851],
        [ 0.3703, -0.0337, -0.0619],
        [-0.1893, -0.1889,  0.1521]], requires_grad=True)
bias_ih_l0 Parameter containing:
tensor([-0.5668, -0.4108, -0.2017], requires_grad=True)
bias_hh_l0 Parameter containing:
tensor([ 0.5474, -0.3007,  0.5214], requires_grad=True)
weight_ih_l0_reverse Parameter containing:
tensor([[-0.2600,  0.4810,  0.0042, -0.3229,  0.2498],
        [-0.3743,  0.5541,  0.3081, -0.0407,  0.2954],
        [-0.0182,  0.2970, -0.2924,  0.5375, -0.1690]], requires_grad=True)
weight_hh_l0_reverse Parameter containing:
tensor([[ 0.5083,  0.5229, -0.5136],
        [-0.4806,  0.4039, -0.3679],
        [-0.2514,  0.4468, -0.4755]], requires_grad=True)
bias_ih_l0_reverse Parameter c

In [11]:
# Check bidirectional_rnn_forward against nn.RNN(bidirectional=True).
NUM_DIRECTIONS = 2  # bidirectional
h_prev = torch.zeros(NUM_DIRECTIONS, BATCH_SIZE, HIDDEN_SIZE)
# h_prev[0] is the initial state of the forward direction, h_prev[1] of the reverse one.
output1, final_state1 = bidirectional_rnn_forward(
    input,
    bi_rnn.weight_ih_l0,
    bi_rnn.weight_hh_l0,
    bi_rnn.bias_ih_l0,
    bi_rnn.bias_hh_l0,
    h_prev[0],
    bi_rnn.weight_ih_l0_reverse,
    bi_rnn.weight_hh_l0_reverse,
    bi_rnn.bias_ih_l0_reverse,
    bi_rnn.bias_hh_l0_reverse,
    h_prev[1]
)
output2, final_state2 = bi_rnn(input, h_prev)

logger.info(f'自己实现的 RNN 的 output:\n{output1}')
logger.info(f'PyTorch 的 RNN 的 output:\n{output2}')
logger.info(f'自己实现的 RNN 的 final_state:\n{final_state1}')
logger.info(f'PyTorch 的 RNN 的 final_state:\n{final_state2}')

2023-01-24 22:32:32.888 | INFO     | __main__:<module>:18 - 自己实现的 RNN 的 output:
tensor([[[-6.8374e-03, -4.0665e-01,  4.9632e-01, -7.8123e-01,  3.5904e-01,
          -3.2676e-01],
         [-9.6864e-02,  4.9356e-01,  9.1294e-01, -8.2427e-01, -2.3916e-01,
           4.9574e-01],
         [-1.6656e-01, -7.5501e-01,  3.4672e-04, -4.8054e-01,  4.9172e-01,
          -4.0495e-01],
         [ 7.7775e-01, -1.6314e-02,  2.2688e-01, -8.2076e-01, -5.5325e-01,
          -6.6492e-01]],

        [[-1.6269e-01, -9.5102e-01,  4.3767e-01, -9.4389e-01, -7.0420e-01,
          -4.5849e-01],
         [ 9.1083e-01,  9.5602e-02,  6.9448e-01, -7.2612e-01, -6.6763e-01,
           4.7001e-01],
         [-7.7787e-01, -9.8655e-03,  6.3926e-01,  3.7862e-01,  2.8350e-01,
          -5.2208e-01],
         [-1.5028e-01, -8.3701e-01,  7.5149e-01,  2.6977e-01,  8.1088e-01,
           5.1595e-01]]], grad_fn=<CopySlices>)
2023-01-24 22:32:32.892 | INFO     | __main__:<module>:19 - PyTorch 的 RNN 的 output:
tensor([[[-6.8374e

## 4. LSTM 手写实现

![LSTM 示意图](../imgs/LSTM.png)

计算公式：

+ 输入门：$i_t = \sigma(W_{ii}x_t + b_{ii} + W_{hi}h_{t-1}+b_{hi})$
+ 遗忘门：$f_t = \sigma(W_{if}x_t + b_{if} + W_{hf}h_{t-1}+b_{hf})$
+ cell 门：$g_t = \tanh(W_{ig}x_t + b_{ig} + W_{hg}h_{t-1}+b_{hg})$
+ 输出门：$o_t = \sigma(W_{io}x_t + b_{io} + W_{ho}h_{t-1}+b_{ho})$
+ 记忆单元的更新：$c_t = f_t \odot c_{t-1} + i_t \odot g_t$
+ 隐藏状态的更新：$h_t = o_t \odot \tanh(c_t)$

### 4.1 PyTorch 官方 API

+ [PyTorch LSTM 官方文档](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)

In [12]:
# Constants for the LSTM examples (these rebind BATCH_SIZE, SEQ_LEN and HIDDEN_SIZE).
BATCH_SIZE = 2
SEQ_LEN = 3
INPUT_SIZE = 4
HIDDEN_SIZE = 5

input = torch.randn(BATCH_SIZE, SEQ_LEN, INPUT_SIZE)  # input sequence
c0 = torch.randn(BATCH_SIZE, HIDDEN_SIZE)  # initial cell state; a plain tensor, not trained
h0 = torch.randn(BATCH_SIZE, HIDDEN_SIZE)  # initial hidden state

In [13]:
# Call the official nn.LSTM API.
lstm_layer = nn.LSTM(INPUT_SIZE, HIDDEN_SIZE, batch_first=True)
output, (h_final, c_final) = lstm_layer(
    input,
    (h0.unsqueeze(0), c0.unsqueeze(0))  # unsqueeze adds the leading dimension the API expects for the states
)
logger.info(f'LSTM 的 output shape: {output.shape}')   # [B, seq_len, h_dim]
logger.info(f'LSTM 的 h_final shape: {h_final.shape}') # [1, B, h_dim]
logger.info(f'LSTM 的 c_final shape: {c_final.shape}') # [1, B, h_dim]

2023-01-24 22:32:33.769 | INFO     | __main__:<module>:7 - LSTM 的 output shape: torch.Size([2, 3, 5])
2023-01-24 22:32:33.771 | INFO     | __main__:<module>:8 - LSTM 的 h_final shape: torch.Size([1, 2, 5])
2023-01-24 22:32:33.772 | INFO     | __main__:<module>:9 - LSTM 的 c_final shape: torch.Size([1, 2, 5])


In [14]:
# List the parameter shapes of the official LSTM; the leading dimension is
# 4 * HIDDEN_SIZE because the four gates' parameters are stored stacked together.
for k, v in lstm_layer.named_parameters():
    print(k, v.shape)

weight_ih_l0 torch.Size([20, 4])
weight_hh_l0 torch.Size([20, 5])
bias_ih_l0 torch.Size([20])
bias_hh_l0 torch.Size([20])


### 4.2 逐行实现 LSTM

+ 输入门：$i_t = \sigma(W_{ii}x_t + b_{ii} + W_{hi}h_{t-1}+b_{hi})$
+ 遗忘门：$f_t = \sigma(W_{if}x_t + b_{if} + W_{hf}h_{t-1}+b_{hf})$
+ cell 门：$g_t = \tanh(W_{ig}x_t + b_{ig} + W_{hg}h_{t-1}+b_{hg})$
+ 输出门：$o_t = \sigma(W_{io}x_t + b_{io} + W_{ho}h_{t-1}+b_{ho})$
+ 记忆单元的更新：$c_t = f_t \odot c_{t-1} + i_t \odot g_t$
+ 隐藏状态的更新：$h_t = o_t \odot \tanh(c_t)$

In [15]:
def lstm_forward(
    input: Tensor,  # [B, seq_len, input_size]
    initial_states: Tuple[Tensor, Tensor],  # (h0, c0), each [B, h_dim]
    w_ih: Tensor,  # [h_dim*4, input_size]
    w_hh: Tensor,  # [h_dim*4, h_dim]
    b_ih: Tensor,  # [h_dim*4]
    b_hh: Tensor   # [h_dim*4]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
    """Run a single-layer, unidirectional LSTM over a batch-first sequence.

    The stacked parameters hold the four gates in the order input (i),
    forget (f), cell (g), output (o). Each step computes::

        i, f, o = sigmoid(...);  g = tanh(...)
        c_t = f * c_{t-1} + i * g
        h_t = o * tanh(c_t)

    Args:
        input: Input sequence, ``[B, seq_len, input_size]``.
        initial_states: ``(h0, c0)``, each ``[B, h_dim]``.
        w_ih: Stacked input-to-hidden weights, ``[4 * h_dim, input_size]``.
        w_hh: Stacked hidden-to-hidden weights, ``[4 * h_dim, h_dim]``.
        b_ih: Stacked input-to-hidden biases, ``[4 * h_dim]``.
        b_hh: Stacked hidden-to-hidden biases, ``[4 * h_dim]``.

    Returns:
        ``(h_out, (h_n, c_n))`` where ``h_out`` is ``[B, seq_len, h_dim]`` and
        ``h_n`` / ``c_n`` are the last hidden / cell state, each ``[B, h_dim]``
        (no leading layer dimension).
    """
    prev_h, prev_c = initial_states
    bs, T, _ = input.shape
    h_dim = w_ih.shape[0] // 4  # w_ih stacks the four gates' weight matrices

    # Loop-invariant pieces are prepared once.
    w_ih_t = w_ih.t()  # [input_size, 4*h_dim]
    w_hh_t = w_hh.t()  # [h_dim, 4*h_dim]
    bias = b_ih + b_hh  # [4*h_dim]

    # Allocate with the input's dtype/device so results are not silently cast
    # into a default float32 CPU buffer.
    h_out = input.new_zeros(bs, T, h_dim)

    for t in range(T):
        x = input[:, t, :]  # current input vector, [B, input_size]

        # Pre-activations of all four gates at once, [B, 4*h_dim].
        gates = x @ w_ih_t + prev_h @ w_hh_t + bias
        i_t, f_t, g_t, o_t = gates.chunk(4, dim=1)  # each [B, h_dim]

        i_t = torch.sigmoid(i_t)  # input gate
        f_t = torch.sigmoid(f_t)  # forget gate
        g_t = torch.tanh(g_t)     # cell (candidate) gate
        o_t = torch.sigmoid(o_t)  # output gate

        # Update the cell state and the hidden state.
        prev_c = f_t * prev_c + i_t * g_t
        prev_h = o_t * torch.tanh(prev_c)

        h_out[:, t, :] = prev_h

    return h_out, (prev_h, prev_c)

### 4.3 结果验证

In [16]:
# Check lstm_forward against nn.LSTM using the same weights, input and initial states.
# lstm_forward takes the states as [B, h_dim]; nn.LSTM takes them with a leading dimension.
output1, (h_final1, c_final1) = lstm_forward(
    input,
    (h0, c0),
    lstm_layer.weight_ih_l0,
    lstm_layer.weight_hh_l0,
    lstm_layer.bias_ih_l0,
    lstm_layer.bias_hh_l0
)

output2, (h_final2, c_final2) = lstm_layer(input, (h0.unsqueeze(0), c0.unsqueeze(0)))

logger.info(f'自己实现的 LSTM 的 output:\n{output1}')
logger.info(f'PyTorch 的 LSTM 的 output:\n{output2}')
logger.info(f'自己实现的 LSTM 的 h_final:\n{h_final1}')
logger.info(f'PyTorch 的 LSTM 的 h_final:\n{h_final2}')
logger.info(f'自己实现的 LSTM 的 c_final:\n{c_final1}')
logger.info(f'PyTorch 的 LSTM 的 c_final:\n{c_final2}')

2023-01-24 22:32:34.953 | INFO     | __main__:<module>:12 - 自己实现的 LSTM 的 output:
tensor([[[-0.0311,  0.1824,  0.1643,  0.1683, -0.0137],
         [ 0.0815,  0.1817,  0.4121,  0.0662, -0.1147],
         [ 0.1193,  0.0724,  0.0465, -0.1205,  0.0842]],

        [[-0.5180, -0.0444, -0.1061,  0.0254,  0.1351],
         [-0.1397,  0.0100, -0.0605, -0.0520,  0.2134],
         [-0.0461,  0.0964,  0.0451, -0.1317,  0.1228]]], grad_fn=<CopySlices>)
2023-01-24 22:32:34.957 | INFO     | __main__:<module>:13 - PyTorch 的 LSTM 的 output:
tensor([[[-0.0311,  0.1824,  0.1643,  0.1683, -0.0137],
         [ 0.0815,  0.1817,  0.4121,  0.0662, -0.1147],
         [ 0.1193,  0.0724,  0.0465, -0.1205,  0.0842]],

        [[-0.5180, -0.0444, -0.1061,  0.0254,  0.1351],
         [-0.1397,  0.0100, -0.0605, -0.0520,  0.2134],
         [-0.0461,  0.0964,  0.0451, -0.1317,  0.1228]]],
       grad_fn=<TransposeBackward0>)
2023-01-24 22:32:34.960 | INFO     | __main__:<module>:14 - 自己实现的 LSTM 的 h_final:
tensor([[ 0.1

## 5. LSTMP 手写实现

### 5.1 PyTorch 官方 API

在 PyTorch 的 API 中，只需要在 `nn.LSTM` 实例化时加上一个 `proj_size` 参数即可。

这个 projection 的作用就是对 h_dim 进行压缩。

In [17]:
PROJ_SIZE = 3  # size the hidden state is projected down to

# LSTM with projection: identical constructor call plus the proj_size argument.
proj_lstm_layer = nn.LSTM(INPUT_SIZE, HIDDEN_SIZE, batch_first=True, proj_size=PROJ_SIZE)

# List the parameter shapes to compare them with the plain LSTM's.
for k, v in proj_lstm_layer.named_parameters():
    print(k, v.shape)

weight_ih_l0 torch.Size([20, 4])
weight_hh_l0 torch.Size([20, 3])
bias_ih_l0 torch.Size([20])
bias_hh_l0 torch.Size([20])
weight_hr_l0 torch.Size([3, 5])


查看一下 `proj_lstm_layer` 的参数，可以看到它就是比 `lstm_layer` 多了一个 `weight_hr_l0` 的参数，这个参数就是用来对 hidden state 进行压缩的。

因此现在 hidden state 的大小变成了 3 (PROJ_SIZE)，而不是之前的 5 (HIDDEN_SIZE)。

从运行结果可以看到，只是对 hidden state 进行了压缩，并没有对记忆单元 c 进行压缩。

In [18]:
c0 = torch.randn(BATCH_SIZE, HIDDEN_SIZE)  # c0 keeps the same shape as before
h0 = torch.randn(BATCH_SIZE, PROJ_SIZE)    # h0 now has PROJ_SIZE features instead of HIDDEN_SIZE

output, (h_final, c_final) = proj_lstm_layer(
    input,
    (h0.unsqueeze(0), c0.unsqueeze(0))  # unsqueeze adds the leading dimension the API expects for the states
)

logger.info(f'LSTMP 的 output shape: {output.shape}')   # [B, seq_len, proj_size]
logger.info(f'LSTMP 的 h_final shape: {h_final.shape}') # [1, B, proj_size]
logger.info(f'LSTMP 的 c_final shape: {c_final.shape}') # [1, B, h_dim]

2023-01-24 22:32:36.477 | INFO     | __main__:<module>:9 - LSTMP 的 output shape: torch.Size([2, 3, 3])
2023-01-24 22:32:36.479 | INFO     | __main__:<module>:10 - LSTMP 的 h_final shape: torch.Size([1, 2, 3])
2023-01-24 22:32:36.480 | INFO     | __main__:<module>:11 - LSTMP 的 c_final shape: torch.Size([1, 2, 5])


### 5.2 逐行实现 LSTMP

这里只需要对 `lstm_forward` 进行简单修改即可实现：

+ 参数中增加一个 `w_hr`，表示 projection，并通过这个参数可以获得 `proj_size`
+ 对 `w_hr` 进行扩维，获得 `batch_w_hr`
+ 最后的输出 `h_out` 的 shape：(bs, T, h_dim) -> (bs, T, proj_size)
+ 在之前计算完 `prev_h` 后，再通过 `w_hr` 对 `prev_h` 进行降维

In [19]:
def proj_lstm_forward(
    input: Tensor,  # [B, seq_len, input_size]
    initial_states: Tuple[Tensor, Tensor],  # (h0 [B, proj_size], c0 [B, h_dim])
    w_ih: Tensor,  # [h_dim*4, input_size]
    w_hh: Tensor,  # [h_dim*4, proj_size]
    b_ih: Tensor,  # [h_dim*4]
    b_hh: Tensor,  # [h_dim*4]
    w_hr: Tensor   # [proj_size, h_dim]
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
    """Run a single-layer, unidirectional LSTM with projection (LSTMP).

    Identical to a plain LSTM step, except that the hidden state is projected
    from ``h_dim`` down to ``proj_size`` with ``w_hr`` right after it is
    computed; the cell state keeps size ``h_dim``. Gate order in the stacked
    parameters is input (i), forget (f), cell (g), output (o).

    Args:
        input: Input sequence, ``[B, seq_len, input_size]``.
        initial_states: ``(h0, c0)`` with ``h0`` of shape ``[B, proj_size]``
            and ``c0`` of shape ``[B, h_dim]``.
        w_ih: Stacked input-to-hidden weights, ``[4 * h_dim, input_size]``.
        w_hh: Stacked hidden-to-hidden weights, ``[4 * h_dim, proj_size]``.
        b_ih: Stacked input-to-hidden biases, ``[4 * h_dim]``.
        b_hh: Stacked hidden-to-hidden biases, ``[4 * h_dim]``.
        w_hr: Projection weight, ``[proj_size, h_dim]``.

    Returns:
        ``(h_out, (h_n, c_n))`` where ``h_out`` is ``[B, seq_len, proj_size]``,
        ``h_n`` is ``[B, proj_size]`` and ``c_n`` is ``[B, h_dim]``.
    """
    prev_h, prev_c = initial_states
    bs, T, _ = input.shape
    proj_size = w_hr.shape[0]

    # Loop-invariant pieces are prepared once.
    w_ih_t = w_ih.t()  # [input_size, 4*h_dim]
    w_hh_t = w_hh.t()  # [proj_size, 4*h_dim]
    w_hr_t = w_hr.t()  # [h_dim, proj_size]
    bias = b_ih + b_hh  # [4*h_dim]

    # Allocate with the input's dtype/device so results are not silently cast
    # into a default float32 CPU buffer.
    h_out = input.new_zeros(bs, T, proj_size)

    for t in range(T):
        x = input[:, t, :]  # current input vector, [B, input_size]

        # Pre-activations of all four gates at once, [B, 4*h_dim].
        gates = x @ w_ih_t + prev_h @ w_hh_t + bias
        i_t, f_t, g_t, o_t = gates.chunk(4, dim=1)  # each [B, h_dim]

        i_t = torch.sigmoid(i_t)  # input gate
        f_t = torch.sigmoid(f_t)  # forget gate
        g_t = torch.tanh(g_t)     # cell (candidate) gate
        o_t = torch.sigmoid(o_t)  # output gate

        # Update the cell state, then the (unprojected) hidden state.
        prev_c = f_t * prev_c + i_t * g_t
        full_h = o_t * torch.tanh(prev_c)  # [B, h_dim]

        # Projection: compress the hidden state from h_dim to proj_size.
        prev_h = full_h @ w_hr_t  # [B, proj_size]

        h_out[:, t, :] = prev_h

    return h_out, (prev_h, prev_c)

### 5.3 结果验证

In [20]:
# Check proj_lstm_forward against nn.LSTM(proj_size=...) using the same weights,
# input and initial states.
output1, (h_final1, c_final1) = proj_lstm_forward(
    input,
    (h0, c0),
    proj_lstm_layer.weight_ih_l0,
    proj_lstm_layer.weight_hh_l0,
    proj_lstm_layer.bias_ih_l0,
    proj_lstm_layer.bias_hh_l0,
    proj_lstm_layer.weight_hr_l0
)

output2, (h_final2, c_final2) = proj_lstm_layer(
    input,
    (h0.unsqueeze(0), c0.unsqueeze(0))  # unsqueeze adds the leading dimension the API expects for the states
)

logger.info(f'自己实现的 LSTMP 的 output:\n{output1}')
logger.info(f'PyTorch 的 LSTMP 的 output:\n{output2}')
logger.info(f'自己实现的 LSTMP 的 h_final:\n{h_final1}')
logger.info(f'PyTorch 的 LSTMP 的 h_final:\n{h_final2}')
logger.info(f'自己实现的 LSTMP 的 c_final:\n{c_final1}')
logger.info(f'PyTorch 的 LSTMP 的 c_final:\n{c_final2}')

2023-01-24 22:32:37.538 | INFO     | __main__:<module>:16 - 自己实现的 LSTMP 的 output:
tensor([[[-0.0081,  0.0343,  0.1460],
         [ 0.0099, -0.1058,  0.0905],
         [ 0.0034, -0.0762,  0.0565]],

        [[-0.1582, -0.1119,  0.0804],
         [ 0.0805, -0.1801, -0.0130],
         [ 0.1176, -0.1727, -0.0920]]], grad_fn=<CopySlices>)
2023-01-24 22:32:37.541 | INFO     | __main__:<module>:17 - PyTorch 的 LSTMP 的 output:
tensor([[[-0.0081,  0.0343,  0.1460],
         [ 0.0099, -0.1058,  0.0905],
         [ 0.0034, -0.0762,  0.0565]],

        [[-0.1582, -0.1119,  0.0804],
         [ 0.0805, -0.1801, -0.0130],
         [ 0.1176, -0.1727, -0.0920]]], grad_fn=<TransposeBackward0>)
2023-01-24 22:32:37.544 | INFO     | __main__:<module>:18 - 自己实现的 LSTMP 的 h_final:
tensor([[ 0.0034, -0.0762,  0.0565],
        [ 0.1176, -0.1727, -0.0920]], grad_fn=<SqueezeBackward1>)
2023-01-24 22:32:37.546 | INFO     | __main__:<module>:19 - PyTorch 的 LSTMP 的 h_final:
tensor([[[ 0.0034, -0.0762,  0.0565],
     

## 6. GRU 手写实现

+ 视频：[31、PyTorch GRU的原理及其手写复现](https://www.bilibili.com/video/BV1jm4y1Q7uh/)
+ [PyTorch GRU 官方文档](https://pytorch.org/docs/stable/generated/torch.nn.GRU.html)

相比于 LSTM，GRU 只有一个初始状态 `h`，而且参数量也少于 LSTM（通过模型内部的计算公式可以看出）。

关于在 PyTorch 中如何计算 model 的参数量，可以参考 [num_of_params.py](https://gist.github.com/yubinCloud/3e09dd71437d1ecfefbd54250a029da1)。

GRU 的计算公式：

+ 重置门：$r_t = \sigma(W_{ir}x_t + b_{ir} + W_{hr}h_{t-1}+b_{hr})$
+ 更新门：$z_t = \sigma(W_{iz}x_t + b_{iz} + W_{hz}h_{t-1}+b_{hz})$
+ 候选状态：$n_t = \tanh(W_{in}x_t + b_{in} + r_t * (W_{hn}h_{(t-1)} + b_{hn}))$
+ 隐藏状态的增量更新：$h_t = (1 - z_t) * n_t + z_t * h_{(t-1)}$


In [21]:
# Constants for the GRU examples.
BATCH_SIZE = 2
SEQ_LEN = 3
INPUT_SIZE = 4
HIDDEN_SIZE = 5

input = torch.randn(BATCH_SIZE, SEQ_LEN, INPUT_SIZE)  # input sequence
h0 = torch.randn(BATCH_SIZE, HIDDEN_SIZE)  # initial hidden state; a plain tensor, not trained

### 6.1 PyTorch 的官方 API


In [22]:
# Call the official nn.GRU API; unlike the LSTM it takes a single state tensor.
gru_layer = nn.GRU(INPUT_SIZE, HIDDEN_SIZE, batch_first=True)
output, h_final = gru_layer(input, h0.unsqueeze(0))

logger.info(f'GRU 的 output shape: {output.shape}')  # [bs, seq_len, h_dim]
logger.info(f'GRU 的 h_final shape: {h_final.shape}') # [1, bs, h_dim]

2023-01-24 22:32:38.655 | INFO     | __main__:<module>:4 - GRU 的 output shape: torch.Size([2, 3, 5])
2023-01-24 22:32:38.656 | INFO     | __main__:<module>:5 - GRU 的 h_final shape: torch.Size([1, 2, 5])


In [23]:
# List the GRU parameter shapes; the leading dimension is 3 * HIDDEN_SIZE
# because the three gates' parameters are stored stacked together.
for k, v in gru_layer.named_parameters():
    print(k, v.shape)

weight_ih_l0 torch.Size([15, 4])
weight_hh_l0 torch.Size([15, 5])
bias_ih_l0 torch.Size([15])
bias_hh_l0 torch.Size([15])


### 6.2 逐行实现 GRU

GRU 的计算公式：

+ 重置门：$r_t = \sigma(W_{ir}x_t + b_{ir} + W_{hr}h_{t-1}+b_{hr})$
+ 更新门：$z_t = \sigma(W_{iz}x_t + b_{iz} + W_{hz}h_{t-1}+b_{hz})$
+ 候选状态：$n_t = \tanh(W_{in}x_t + b_{in} + r_t * (W_{hn}h_{(t-1)} + b_{hn}))$
+ 隐藏状态的增量更新：$h_t = (1 - z_t) * n_t + z_t * h_{(t-1)}$

In [24]:
def gru_forward(
    input: Tensor,  # [bs, seq_len, input_size]
    initial_states: Tensor,  # [bs, h_dim]
    w_ih: Tensor,  # [h_dim*3, input_size]
    w_hh: Tensor,  # [h_dim*3, h_dim]
    b_ih: Tensor,  # [h_dim*3]
    b_hh: Tensor   # [h_dim*3]
) -> Tuple[Tensor, Tensor]:
    """Run a single-layer, unidirectional GRU over a batch-first sequence.

    The stacked parameters hold the gates in the order reset (r), update (z),
    candidate (n). Each step computes::

        r_t = sigmoid(W_ir x_t + b_ir + W_hr h_{t-1} + b_hr)
        z_t = sigmoid(W_iz x_t + b_iz + W_hz h_{t-1} + b_hz)
        n_t = tanh(W_in x_t + b_in + r_t * (W_hn h_{t-1} + b_hn))
        h_t = (1 - z_t) * n_t + z_t * h_{t-1}

    Args:
        input: Input sequence, ``[bs, seq_len, input_size]``.
        initial_states: Initial hidden state, ``[bs, h_dim]``.
        w_ih: Stacked input-to-hidden weights, ``[3 * h_dim, input_size]``.
        w_hh: Stacked hidden-to-hidden weights, ``[3 * h_dim, h_dim]``.
        b_ih: Stacked input-to-hidden biases, ``[3 * h_dim]``.
        b_hh: Stacked hidden-to-hidden biases, ``[3 * h_dim]``.

    Returns:
        ``(h_out, h_n)`` where ``h_out`` is ``[bs, seq_len, h_dim]`` and
        ``h_n`` is the last hidden state, ``[bs, h_dim]`` (no leading layer
        dimension).
    """
    prev_h = initial_states
    bs, T, _ = input.shape
    h_dim = w_ih.shape[0] // 3  # w_ih stacks the three gates' weight matrices

    # Loop-invariant transposes are prepared once.
    w_ih_t = w_ih.t()  # [input_size, 3*h_dim]
    w_hh_t = w_hh.t()  # [h_dim, 3*h_dim]

    # Allocate with the input's dtype/device so results are not silently cast
    # into a default float32 CPU buffer.
    h_out = input.new_zeros(bs, T, h_dim)

    for t in range(T):
        x = input[:, t, :]  # input feature vector of this step, [bs, input_size]

        # The two biases must stay separate: b_hn sits inside the r_t product.
        from_x = x @ w_ih_t + b_ih        # [bs, 3*h_dim]
        from_h = prev_h @ w_hh_t + b_hh   # [bs, 3*h_dim]
        x_r, x_z, x_n = from_x.chunk(3, dim=1)
        h_r, h_z, h_n = from_h.chunk(3, dim=1)

        r_t = torch.sigmoid(x_r + h_r)    # reset gate
        z_t = torch.sigmoid(x_z + h_z)    # update gate
        n_t = torch.tanh(x_n + r_t * h_n) # candidate state

        # Interpolate between the candidate and the previous hidden state.
        prev_h = (1 - z_t) * n_t + z_t * prev_h

        h_out[:, t, :] = prev_h

    return h_out, prev_h

### 6.3 结果验证

In [25]:
# Call the hand-written GRU.
output1, h_final1 = gru_forward(
    input,
    h0,
    gru_layer.weight_ih_l0,
    gru_layer.weight_hh_l0,
    gru_layer.bias_ih_l0,
    gru_layer.bias_hh_l0
)

# Call the official PyTorch API with the same input and initial state.
output2, h_final2 = gru_layer(input, h0.unsqueeze(0))

logger.info(f'自己实现的 GRU 的 output:\n{output1}')
logger.info(f'PyTorch 的 GRU 的 output:\n{output2}')
logger.info(f'自己实现的 GRU 的 h_final:\n{h_final1}')
logger.info(f'PyTorch 的 GRU 的 h_final:\n{h_final2}')

2023-01-24 22:32:39.890 | INFO     | __main__:<module>:14 - 自己实现的 GRU 的 output:
tensor([[[ 0.3792, -0.0641, -0.4533,  0.1585, -0.4716],
         [ 0.4597, -0.0958,  0.2241, -0.0230, -0.2573],
         [ 0.3011,  0.0567, -0.5498,  0.1720,  0.3509]],

        [[ 0.8877,  0.5805, -0.5134,  0.0334,  0.0081],
         [ 0.7638,  0.6401, -0.5307,  0.2876, -0.0665],
         [ 0.5887,  0.2752, -0.6436, -0.2113,  0.2936]]], grad_fn=<CopySlices>)
2023-01-24 22:32:39.894 | INFO     | __main__:<module>:15 - PyTorch 的 GRU 的 output:
tensor([[[ 0.3792, -0.0641, -0.4533,  0.1585, -0.4716],
         [ 0.4597, -0.0958,  0.2241, -0.0230, -0.2573],
         [ 0.3011,  0.0567, -0.5498,  0.1720,  0.3509]],

        [[ 0.8877,  0.5805, -0.5134,  0.0334,  0.0081],
         [ 0.7638,  0.6401, -0.5307,  0.2876, -0.0665],
         [ 0.5887,  0.2752, -0.6436, -0.2113,  0.2936]]],
       grad_fn=<TransposeBackward1>)
2023-01-24 22:32:39.897 | INFO     | __main__:<module>:16 - 自己实现的 GRU 的 h_final:
tensor([[ 0.3011