In [378]:
%matplotlib inline
import math
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
import time

In [379]:
batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

In [380]:
F.one_hot(torch.tensor([0, 2]), len(vocab))

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

In [381]:
X = torch.arange(10).reshape((2, 5))
F.one_hot(X.T, 28).shape

torch.Size([5, 2, 28])

In [382]:
def get_params(vocab_size, num_hiddens, device):
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        return torch.randn(size=shape, device=device) * 0.01

    # 隐藏层参数
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    b_h = torch.zeros(num_hiddens, device=device)
    # 输出层参数
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    # 附加梯度
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params

In [383]:
def init_rnn_state(batch_size, num_hiddens, device=torch.device('cpu')):
    return (torch.zeros((batch_size, num_hiddens), device=device), )

# state is a tuple!!!
state=init_rnn_state(2, 10, d2l.try_gpu())
#print(type(state))
state

(tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),)

In [384]:
def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    
    for X in inputs: # X.shape: (batch_size, vocab_size)
        # H.shape: (batch_size, hidden_size)
        H = torch.tanh(torch.matmul(H, W_hh) + torch.matmul(X, W_xh) + b_h)
        # Y.shape: (batch_size, vocab_size)
        Y = torch.matmul(H, W_hq) + b_q
        outputs.append(Y)
    #print("torch.cat(outputs, dim=0).shape: ", torch.cat(outputs, dim=0).shape)
    # torch.cat(outputs, dim=0).shape: (sequence_size * batch_size, vocab_size)
    return torch.cat(outputs, dim=0), (H,)

In [385]:
class MyRNN:
    def __init__(self, vocab_size, num_hiddens, device,
                 get_params, init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn
    
    def __call__(self, X, state): # X.shape: (batch_size, sequence_size)
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        # X.shape: (sequence_size, batch_size, vocab_size)
        return self.forward_fn(X, state, self.params)
    
    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)

In [386]:
num_hiddens = 512
net = MyRNN(len(vocab), num_hiddens, d2l.try_gpu(), get_params,
                      init_rnn_state, rnn)
state = net.begin_state(X.shape[0], d2l.try_gpu())
Y, new_state = net(X.to(d2l.try_gpu()), state)
Y.shape, len(new_state), new_state[0].shape

(torch.Size([10, 28]), 1, torch.Size([2, 512]))

In [387]:
def predict_rnn(prefix, num_preds, net, vocab, device):
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda : torch.tensor([outputs[-1]], device=device)
    for y in prefix[1: ]:
        # warm-up exist prefix for hidden state.
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):
        y, state = net(get_input(), state)
        outputs.append(y.argmax(dim=1).item())
        #print(_, " is ", y.argmax(dim=1).item())
    return ''.join([vocab.idx_to_token[i] for i in outputs])

In [388]:
predict_rnn('time traveller ', 10, net, vocab, d2l.try_gpu())

'time traveller vpdrssssss'

In [389]:
def grad_clipping(net, theta):
    params=[]
    if isinstance(net, nn.Module):
        params=[p for p in net.parameters() if p.requires_grad]
    else:
        params=net.params

    norm_sum = torch.sqrt(sum(torch.sum(p.grad ** 2) for p in params))
    if norm_sum > theta:
        for param in params:
            param.grad /= norm_sum / theta

In [390]:
def train_epoch_rnn(net, train_iter, loss, optimizer, device, use_random_iter):
    state, start = None, time.time()
    metric = d2l.Accumulator(2)
    for X, y in train_iter:
        if state is None or use_random_iter:
            # if use_random_iter is True, which means for each batch
            # of train data is not sequential, so we couldn't use the
            # previous hidden state for next batch, we have to
            # reinitialize it.
            state=net.begin_state(batch_size=len(X), device=device)
        else:
            # it seems we should split this case into two kinds for
            # GRU and LSTM. but why should it do this, I'm not clear
            # yet.
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                # state对于nn.GRU是个张量
                state.detach_()
            else:
                # state对于nn.LSTM或对于我们从零开始实现的模型是个张量
                for s in state:
                    s.detach_()
        y = F.one_hot(y.T.reshape(-1), len(vocab))
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)
        y_hat = F.softmax(y_hat, dim=1)
        l = loss(y_hat, y.float()).mean()
        if  isinstance(optimizer, torch.optim.Optimizer):
            optimizer.zero_grad()
            l.backward()
            grad_clipping(net, 1)
            optimizer.step()
        else:
            #print("before: ", net.params)
            l.backward()
            grad_clipping(net, 10)
            # 因为已经调用了mean函数
            optimizer(batch_size=1)
            #print("after: ", net.params)
        metric.add(l * y.numel(), y.numel())
    return math.exp(metric[0] / metric[1]), metric[1] / (time.time() - start)

In [391]:
def train_rnn(net, train_iter, vocab, lr, num_epochs, device,
              use_random_iter=False):
    loss=nn.CrossEntropyLoss()
    if isinstance(net, nn.Module):
        optimizer=torch.optim.SGD(net.parameters(), lr)
    else:
        optimizer = lambda batch_size: d2l.sgd(net.params, lr, batch_size)
    
    predict = lambda prefix: predict_rnn(prefix, 50, net, vocab, device)
    
    for epoch in range(num_epochs):
        ppl, speed = train_epoch_rnn(net, train_iter, loss, optimizer, device, use_random_iter)
        if (epoch + 1) % 10 == 0:
            print(predict('time traveller'))
    print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec {str(device)}')
    print(predict('time traveller'))
    print(predict('traveller'))

In [392]:
num_epochs, lr = 30, 1
train_rnn(net, train_iter, vocab, lr, num_epochs, d2l.try_gpu())

time traveller                                                  
time traveller                                                  
time traveller                                                  
perplexity 25.1, 1913769.4 tokens/sec cpu
time traveller                                                  
traveller                                                  
