In [11]:
%matplotlib inline
from d2l import torch as d2l
import math
import torch
from torch import nn
from torch.nn import functional as F

# Minibatch size and number of time steps per sequence; both are handed to the
# d2l data loader on the next line.
batch_size, num_steps = 32, 35
# Binds the training-data iterator and the vocabulary object used by later cells.
train_iter , vocab = d2l.load_data_time_machine(batch_size, num_steps)

### One-Hot Encoding

In [12]:
# One-hot encode the token indices 0 and 2; each output row has len(vocab) entries.
F.one_hot(torch.tensor([0,2]), len(vocab))

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

In [13]:
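# X holds token indices with shape (batch size, number of time steps).
# One-hot encoding a minibatch adds a trailing axis: (batch size, number of time steps, len(vocab)).
# Transposing X first yields (number of time steps, batch size, vocabulary size),
# so iterating over the result visits one time step at a time.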
# Toy minibatch of token indices: 2 sequences of 5 time steps each.
# X is kept as a notebook global because a later cell feeds it to the model.
X = torch.arange(10).reshape((2,5))
# 28 is the one-hot width; it matches the 28-entry rows produced by
# F.one_hot(..., len(vocab)) in the earlier demo cell.
inputs = F.one_hot(X.T, 28)
# Number of time steps, then the (batch size, one-hot width) shape of one step.
print(len(inputs), inputs[0].shape)

5 torch.Size([2, 28])


### Initializing the Model Parameters (初始化模型参数)

In [14]:
def get_params(vocab_size, num_hiddens, device):
    """Create the trainable parameters of a single-layer RNN language model.

    Inputs and outputs are both one-hot/logit vectors over the vocabulary, so
    the input and output widths equal ``vocab_size``.

    Args:
        vocab_size: Number of tokens in the vocabulary.
        num_hiddens: Number of hidden units.
        device: Device on which the parameter tensors are allocated.

    Returns:
        ``[W_xh, W_hh, b_h, W_hq, b_q]``; every tensor has
        ``requires_grad=True``.
    """
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        # Small initial weights (std 0.01) keep the tanh pre-activations away
        # from saturation; unscaled N(0, 1) weights make early gradients
        # vanish or explode.
        return torch.randn(size=shape, device=device) * 0.01

    # Hidden-layer parameters
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    b_h = torch.zeros(num_hiddens, device=device)

    # Output-layer parameters
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)

    # Enable gradient tracking on every parameter
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params

### RNN model
$$\mathbf{H}_t = \phi(\mathbf{X}_t \mathbf{W}_{xh} + \mathbf{H}_{t-1} \mathbf{W}_{hh}  + \mathbf{b}_h).$$

$$\mathbf{O}_t = \mathbf{H}_t \mathbf{W}_{hq} + \mathbf{b}_q.$$

In [15]:
def init_rnn_state(batch_size, num_hiddens, device):
    """Build the initial RNN state.

    Returns:
        A one-element tuple holding an all-zero hidden state of shape
        ``(batch_size, num_hiddens)`` allocated on ``device``.
    """
    initial_hidden = torch.zeros((batch_size, num_hiddens), device=device)
    return (initial_hidden,)


def rnn(inputs, state, params):
    """Run the RNN over every time step of ``inputs``.

    Args:
        inputs: Iterable of per-step matrices; per the notebook's convention
            the whole tensor is (num_steps, batch_size, vocab_size).
        state: One-element tuple holding the current hidden state.
        params: ``(W_xh, W_hh, b_h, W_hq, b_q)``.

    Returns:
        A pair ``(outputs, (H,))``: the per-step outputs concatenated along
        dim 0, and the final hidden state wrapped in a one-element tuple.
    """
    W_xh, W_hh, b_h, W_hq, b_q = params
    (hidden,) = state
    step_outputs = []
    # Each step_input is one time step: (batch_size, vocab_size).
    for step_input in inputs:
        pre_activation = torch.mm(step_input, W_xh) + torch.mm(hidden, W_hh) + b_h
        hidden = torch.tanh(pre_activation)
        step_outputs.append(torch.mm(hidden, W_hq) + b_q)
    stacked = torch.cat(step_outputs, dim=0)
    return stacked, (hidden,)
'''
>>> x = torch.randn(2, 3)
>>> x

 0.5983 -0.0341  2.4918
 1.5981 -0.5265 -0.8735
[torch.FloatTensor of size 2x3]

>>> torch.cat((x, x, x), 0)

 0.5983 -0.0341  2.4918
 1.5981 -0.5265 -0.8735
 0.5983 -0.0341  2.4918
 1.5981 -0.5265 -0.8735
 0.5983 -0.0341  2.4918
 1.5981 -0.5265 -0.8735
'''

'\n>>> x = torch.randn(2, 3)\n>>> x\n\n 0.5983 -0.0341  2.4918\n 1.5981 -0.5265 -0.8735\n[torch.FloatTensor of size 2x3]\n\n>>> torch.cat((x, x, x), 0)\n\n 0.5983 -0.0341  2.4918\n 1.5981 -0.5265 -0.8735\n 0.5983 -0.0341  2.4918\n 1.5981 -0.5265 -0.8735\n 0.5983 -0.0341  2.4918\n 1.5981 -0.5265 -0.8735\n'

In [16]:
class RNNModelScratch:
    """RNN model implemented from scratch.

    Bundles the parameter list with user-supplied callables for creating the
    initial state and for running the forward pass.
    """

    def __init__(self, vocab_size, num_hiddens, device, get_params,
                 init_state, forward_fn):
        """Store the sizes and callables and create the parameters.

        ``get_params(vocab_size, num_hiddens, device)`` is called once here
        and its result is kept in ``self.params``.
        """
        self.vocab_size = vocab_size
        self.num_hiddens = num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state = init_state
        self.forward_fn = forward_fn

    def __call__(self, X, state):
        """One-hot encode the transposed index tensor ``X`` as float32 and
        return ``forward_fn(encoded, state, params)``."""
        one_hot_steps = F.one_hot(X.T, self.vocab_size).to(torch.float32)
        return self.forward_fn(one_hot_steps, state, self.params)

    def begin_state(self, batch_size, device):
        """Return ``init_state(batch_size, num_hiddens, device)``."""
        return self.init_state(batch_size, self.num_hiddens, device)

In [17]:
# Build the from-scratch model with 512 hidden units on the device chosen by d2l.try_gpu().
num_hiddens = 512
net = RNNModelScratch(len(vocab), num_hiddens, d2l.try_gpu(), get_params,
                      init_rnn_state, rnn)

# X is the (2, 5) index tensor from the earlier one-hot demo cell, so the batch size is X.shape[0].
state = net.begin_state(X.shape[0], d2l.try_gpu())

# Shape check: Y stacks the per-step outputs along dim 0, and the state is a 1-tuple.
Y, new_state = net(X.to(d2l.try_gpu()), state)
X.shape, Y.shape, len(new_state), new_state[0].shape

(torch.Size([2, 5]), torch.Size([10, 28]), 1, torch.Size([2, 512]))

### Prediction (预测)

In [18]:
def predict_ch8(prefix, num_preds, net, vocab, device):
    """Generate ``num_preds`` tokens that continue ``prefix``.

    Args:
        prefix: Non-empty sequence of tokens used to warm up the hidden state.
        num_preds: Number of tokens to generate after the prefix.
        net: Model exposing ``begin_state(batch_size=..., device=...)`` and
            callable as ``net(X, state) -> (output, state)``.
        vocab: Maps a token to its index via ``vocab[token]`` and an index
            back to its token via ``vocab.idx_to_token``.
        device: Device on which the input tensors are created.

    Returns:
        The prefix followed by the generated tokens, joined into one string.
    """
    def get_input():
        # Most recent token index as a (1, 1) tensor: batch size 1, one time step.
        return torch.tensor([outputs[-1]], device=device).reshape((1, 1))

    # Start from a fresh state for a single sequence.
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    # Inference only: without no_grad the autograd graph chained through
    # `state` would grow with every step.
    with torch.no_grad():
        # Warm-up period: feed the known prefix and keep only the state.
        for y in prefix[1:]:
            _, state = net(get_input(), state)
            outputs.append(vocab[y])
        # Generation: feed back the highest-scoring index of each row.
        for _ in range(num_preds):
            y, state = net(get_input(), state)
            outputs.append(int(y.argmax(dim=1).reshape(1)))

    return ''.join([vocab.idx_to_token[i] for i in outputs])

In [19]:
# Generate 10 characters after the prefix; no training step has run in the cells
# above, so the continuation is not expected to be meaningful.
predict_ch8('time traveller ', 10, net, vocab, d2l.try_gpu())

'time traveller micijw arc'

### 梯度裁剪(Gradient Clipping)
$$\mathbf{g} \leftarrow \min\left(1, \frac{\theta}{\|\mathbf{g}\|}\right) \mathbf{g}$$

In [20]:
def grad_clipping(net, theta):
    """Rescale gradients in place so their global L2 norm is at most ``theta``.

    Args:
        net: Either an ``nn.Module`` (its parameters with
            ``requires_grad=True`` are used) or any object exposing a
            ``params`` list of tensors.
        theta: Maximum allowed L2 norm of all gradients taken together.

    Parameters whose ``.grad`` is ``None`` (for example, ones that did not
    take part in the last backward pass) are skipped; if no gradient exists
    at all the function returns without doing anything.
    """
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    grads = [p.grad for p in params if p.grad is not None]
    if not grads:
        return
    # Global L2 norm over every gradient tensor
    norm = torch.sqrt(sum(torch.sum(g ** 2) for g in grads))
    if norm > theta:
        scale = theta / norm
        for g in grads:
            g.mul_(scale)