# 循环神经网络

In [1]:
%matplotlib inline
import torch
import matplotlib.pyplot as plt
from matplotlib_inline import backend_inline
from torch import nn
from torch.nn import functional as F
from typing import Callable
import funcs

backend_inline.set_matplotlib_formats('svg')



隐状态中 $X_{t} W_{xh} + H_{t-1} W_{hh}$ 的计算，等价于先将 $X_{t}$ 与 $H_{t-1}$ 按列（`dim=1`）拼接、将 $W_{xh}$ 与 $W_{hh}$ 按行（`dim=0`）拼接，再对两个拼接结果做一次矩阵乘法。

In [9]:
# Numerically confirm that the two separate products summed together equal a
# single product of the concatenated operands.
# The tensors are drawn in the order X, W_xh, H, W_hh.
X = torch.normal(0, 1, (3, 1))
W_xh = torch.normal(0, 1, (1, 4))
H = torch.normal(0, 1, (3, 4))
W_hh = torch.normal(0, 1, (4, 4))
# Form 1: two matrix products added element-wise.
print(torch.matmul(X, W_xh) + torch.matmul(H, W_hh))
# Form 2: [X | H] (joined along columns) times [W_xh ; W_hh] (joined along rows).
print(torch.matmul(torch.cat([X, H], dim=1), torch.cat([W_xh, W_hh], dim=0)))

tensor([[ 0.9002,  1.7100, -3.0632, -0.0411],
        [ 3.8465, -3.9676, -1.7453, -1.4797],
        [-0.5631, -1.5626,  1.3826,  0.4590]])
tensor([[ 0.9002,  1.7100, -3.0632, -0.0411],
        [ 3.8465, -3.9676, -1.7453, -1.4797],
        [-0.5631, -1.5626,  1.3826,  0.4590]])


数据 `X` 的形状为 (批量大小, 时间步数)。对其做独热（one-hot）编码前先进行转置，使时间步成为第一个轴，方便按时间步访问数据：

In [40]:
# Toy index matrix with 3 rows and 5 columns holding the integers 0..14.
X = torch.arange(15).reshape(3, 5)
# Transpose first so the one-hot result has the 5-long axis in front.
X_hot = F.one_hot(X.t(), num_classes=28)
print(X.shape, X_hot.shape)
# Drop the demo tensors so they do not linger in the notebook namespace.
del X, X_hot

torch.Size([3, 5]) torch.Size([5, 3, 28])


## 开始进行循环神经网络的实现, 定义超参数: 

In [2]:
# Hyperparameters handed to the data loader below.
batch_size = 32
num_steps = 35
# Request tokenisation with token_type="char"; the loader returns the
# training iterator together with its vocabulary.
train_iter, vocab = funcs.load_data_time_machine(
    batch_size,
    num_steps,
    token_type="char",
)
# Display both objects as the cell output.
train_iter, vocab

(<TimeMachineLoader with 10000 corpus: "t", "h", "e", " ", "t", ...>,
 <Vocab with 28 tokens: "<unk>", " ", "e", "t", "a", ...>)

## 初始化循环神经网络的模型参数

In [3]:
param_ = Callable[[int, int, str], list]
def get_rnn_params(vocab_size:int, num_hiddens:int, device:str="cuda:0"):
    """Create the trainable tensors of a single-hidden-layer RNN.

    Args:
        vocab_size: Vocabulary size; used as both the input and output width
            because inputs are one-hot encoded.
        num_hiddens: Width of the hidden state.
        device: Device on which every tensor is allocated.

    Returns:
        ``[W_xh, W_hh, b_h, W_hq, b_q]`` with ``requires_grad`` enabled on
        every entry. Weights are standard-normal samples scaled by 0.01;
        biases are zeros.
    """
    # One-hot inputs and per-token outputs both have vocabulary width.
    num_inputs = num_outputs = vocab_size

    def small_random(*shape):
        # Standard-normal draw scaled down to 0.01.
        return torch.randn(size=shape, device=device) * 0.01

    # Hidden layer: input-to-hidden and hidden-to-hidden weights plus bias.
    W_xh = small_random(num_inputs, num_hiddens)
    W_hh = small_random(num_hiddens, num_hiddens)
    b_h = torch.zeros(num_hiddens, device=device)
    # Output layer: hidden-to-output weights plus bias.
    W_hq = small_random(num_hiddens, num_outputs)
    b_q = torch.zeros(num_outputs, device=device)
    # requires_grad_ works in place and returns the same tensor.
    return [tensor.requires_grad_(True) for tensor in (W_xh, W_hh, b_h, W_hq, b_q)]

# The initializer returns a bare tensor (not a tuple), so the alias says so.
state_ = Callable[[int, int, str], torch.Tensor]
def init_rnn_state(batch_size:int, num_hiddens:int, device:str) -> torch.Tensor:
    """Create the all-zero hidden state used before the first time step.

    Args:
        batch_size: Number of rows in the state.
        num_hiddens: Width of the hidden state.
        device: Device on which the tensor is allocated.

    Returns:
        A single zero tensor of shape ``(batch_size, num_hiddens)``. It is a
        bare tensor, not a one-element tuple.
    """
    return torch.zeros((batch_size, num_hiddens), device=device)


$RNN$ 的输出由隐状态得出:
- $H_{t} = \tanh(X_{t} W_{xh} + H_{t-1} W_{hh} + b_{h})$
- $O_{t} = H_{t} W_{hq} + b_{q}$

In [4]:
forward_ = Callable[[torch.Tensor, torch.Tensor, list], tuple]
def rnn_forward(inputs:torch.Tensor, state:torch.Tensor, params:list):
    """Run the RNN over every time step of ``inputs``.

    Args:
        inputs: Time-major batch, ``[time_steps, batch, embedding]``.
        state: Initial hidden state, either a bare ``[batch, hidden]`` tensor
            or the one-element tuple this function returns. Accepting both
            lets the returned state be passed straight into the next call.
        params: ``[W_xh, W_hh, b_h, W_hq, b_q]``.

    Returns:
        ``(outputs, (state,))``: the per-step outputs concatenated along
        dim 0, and the final hidden state wrapped in a one-element tuple.
    """
    W_xh, W_hh, b_h, W_hq, b_q = params
    # Unwrap a previously returned ``(state,)`` so ``state @ W_hh`` below
    # always operates on a tensor.
    if isinstance(state, (tuple, list)):
        (state,) = state
    out_puts = []
    # inputs : [time_steps, b, embedding]; iterating yields one step at a time.
    for X in inputs:
        # H_t = tanh(X_t W_xh + H_{t-1} W_hh + b_h)
        state = torch.tanh(X @ W_xh + state @ W_hh + b_h)
        # O_t = H_t W_hq + b_q
        Y = state @ W_hq + b_q
        out_puts.append(Y)
    return torch.cat(out_puts, dim=0), (state, )


class RNN(nn.Module):
    """Minimal recurrent network assembled from three injected callables.

    Parameter creation, state initialisation and the forward computation are
    all supplied by the caller; this class stores them and one-hot encodes
    token indices before delegating to the forward function.
    """
    def __init__(self, vocab_size:int, num_hiddens:int, device:str, 
                    get_params:param_, init_state:state_, 
                    forward_fn:forward_) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.num_hiddens = num_hiddens
        self.init_state = init_state
        self.forward_fn = forward_fn
        # Stored exactly as returned by get_params.
        self.params = get_params(vocab_size, num_hiddens, device)

    def forward(self, x:torch.Tensor, state:torch.Tensor):
        """One-hot encode the transposed ``x`` and delegate to ``forward_fn``."""
        transposed = x.T
        encoded = F.one_hot(transposed, self.vocab_size)
        return self.forward_fn(encoded.to(torch.float32), state, self.params)

    def begin_state(self, batch_size:int, device:str):
        """Build an initial state via the injected ``init_state`` callable."""
        return self.init_state(batch_size, self.num_hiddens, device)

    def __repr__(self) -> str:
        return f"<RNN_Module with {self.num_hiddens} hiddens>"


检查模型输出形状是否正确

In [9]:
num_hiddens = 512
# Fall back to the CPU so this shape check also runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
rnn = RNN(len(vocab), num_hiddens, device, get_rnn_params, init_rnn_state, rnn_forward)
x, y = next(iter(train_iter))
state = rnn.begin_state(train_iter.batch_size, device)
# as_tensor accepts lists, arrays and tensors alike, and avoids the
# copy-construct warning torch.tensor() raises when x is already a tensor.
Y, new_state = rnn(torch.as_tensor(x, device=device), state)
# The final state comes back as a one-element tuple, hence the [0].
Y.shape, new_state[0].shape

(torch.Size([1120, 28]), torch.Size([32, 512]))