In [3]:
import torch
from d2l import torch as d2l

In [None]:
class RNN:
    def __init__(self, vocab_size, num_hiddens):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = self.init_params()
        
    def init_params(self):
        num_inputs = num_outputs = self.vocab_size; num_hiddens = self.num_hiddens
        def normal(shape):
            return torch.randn(size=shape, device=device) * 0.01
        def three():
            return (normal((num_inputs, num_hiddens)), normal((num_hiddens, num_hiddens)), torch.zeros(num_hiddens, device=device))
        W_xi, W_hi, b_i = three(); W_xf, W_hf, b_f = three(); W_xo, W_ho, b_o = three(); W_xc, W_hc, b_c = three();
        W_hq, b_q = normal((num_hiddens, num_outputs)), torch.zeros(num_outputs, device=device)
        params = [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q]
        for param in params:
            param.requires_grad_(True)
        return params
    
    def init_state(self, batch_size):
        return (torch.zeros((batch_size, self.num_hiddens), device=device), torch.zeros((batch_size, self.num_hiddens), device=device))
    
    def forward(self, inputs, state):
        C, H = state; outputs = []
        W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q = self.params
        for X in inputs:
            I = torch.sigmoid(X @ W_xi + H @ W_hi + b_i)
            F = torch.sigmoid(X @ W_xf + H @ W_hf + b_f)
            O = torch.sigmoid(X @ W_xo + H @ W_ho + b_o)
            C_hat = torch.tanh(X @ W_xc + H @ W_hc + b_c)
            C = F * C + I * C_hat
            H = O * torch.tanh(C)
            Y = H @ W_hq + b_q
            outputs.append(Y)
        return torch.cat(outputs, dim=0), (C, H)
    
    def __call__(self, X, state):
        inputs = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward(inputs, state)
    
    def grad_clipping(self, theta):
        norm = torch.sqrt(sum([torch.sum(p) for p in self.params]))
        if norm > theta:
            for p in self.params:
                p.grad[:] *= theta / norm

In [4]:
def get_params(vocab_size, num_hiddens, device):
    """获取参数"""
    num_inputs = num_outputs = vocab_size
    def normal(shape):
        return randn(size=shape, device=device) * 0.01
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    b_h = zeros(num_hiddens, device=device, requires_grad=True)
    W_hq = normal((num_hiddens, num_outputs))
    b_q = zeros(num_outputs, device=device, requires_grad=True)
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    
    for param in params:
        param.requires_grad_(True)
    return params

In [5]:
def init_rnn_state(batch_size, num_hiddens, device):
    """初始化隐藏状态"""
    return (zeros((batch_size, num_hiddens), device=device), )

In [6]:
def rnn(inputs, state, params):
    """
        forward 函数
        inputs: step * batch_size * vocab_size
    """
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        # X: batch_size * vocab_size
        # H: batch_size * num_hiddens
        # W_xh: vocab_size * num_hiddens
        # W_hh: num_hiddens * num_hiddens
        H = tanh(mm(X, W_xh) + mm(H, W_hh) + b_h)
        # W_hq: num_hiddens * vocab_size
        # Y: batch_size * vocab_size
        Y = mm(H, W_hq) + b_q
        outputs.append(Y)
    # dim=0: 按行拼接
    # torch.cat(outputs, dim=0): (num_steps * batch_size) * vocab_size
    return torch.cat(outputs, dim=0), (H, )

In [7]:
class RNN:
    def __init__(self, vocab_size, num_hiddens, device, get_params, init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn
    
    def __call__(self, X, state):
        """像函数一样被调用 RNN(X)"""
        # X: batch_size * num_steps [string]
        inputs = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        # inputs: num_steps * batch_size * vocab_size
        return self.forward_fn(inputs, state, self.params)
    
    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)

In [8]:
def grad_clipping(net, theta):
    """梯度剪裁"""
    params = net.params
    norm = torch.sqrt(sum(torch.sum(p.grad**2) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm

In [9]:
def train_epochs(net, train_iter, loss, updater, device, use_random_iter):
    state = None
    metric = d2l.Accumulator(2)
    for X, Y in train_iter:
        if state is None or use_random_iter:
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            for s in state: s.detach_()
        y_hat, state = net(X, state)
        y = Y.T.reshape(-1)
        l = loss(y_hat, y.long()).mean()
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            grad_clipping(net, 1)
            updater.step()
        else:
            l.backward()
            grad_clipping(net, 1)
            updater(batch_size=1)
        metric.add(l * y.numel(), y.numel())
    return math.exp(metric[0] / metric[1])

In [10]:
def train(net, train_iter, vocab, lr, num_epochs, device, use_random_iter=False):
    loss = nn.CrossEntropyLoss()
    if isinstance(net, nn.Module):
        updater = torch.optim.SGD(net.parameters(), lr)
    else:
        updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size)
    for epoch in tqdm(range(num_epochs), ncols=100):
        ppl = train_epochs(net, train_iter, loss, updater, device, use_random_iter)
    print('困惑度: %f' % ppl)

In [11]:
def seq_data_iter_random(corpus, batch_size, num_steps):  #@save
    """使用随机抽样生成一个小批量子序列"""
    # 从随机偏移量开始对序列进行分区，随机范围包括num_steps-1
    corpus = corpus[random.randint(0, num_steps - 1):]
    # 减去1，是因为我们需要考虑标签
    num_subseqs = (len(corpus) - 1) // num_steps
    # 长度为num_steps的子序列的起始索引
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # 在随机抽样的迭代过程中，
    # 来自两个相邻的、随机的、小批量中的子序列不一定在原始序列上相邻
    random.shuffle(initial_indices)

    def data(pos):
        # 返回从pos位置开始的长度为num_steps的序列
        return corpus[pos: pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        # 在这里，initial_indices包含子序列的随机起始索引
        initial_indices_per_batch = initial_indices[i: i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)
        
def seq_data_iter_sequential(corpus, batch_size, num_steps):  #@save
    """使用顺序分区生成一个小批量子序列"""
    # 从随机偏移量开始划分序列
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = torch.tensor(corpus[offset: offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1: offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i: i + num_steps]
        Y = Ys[:, i: i + num_steps]
        yield X, Y

In [12]:
def tokenize(lines, token='word'): 
    """将文本行拆分为单词或字符词元"""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('错误：未知词元类型：' + token)

def load_corpus_vocab(max_tokens=-1): 
    """词元索引列表和词表"""
    lines = texts
    tokens = d2l.tokenize(lines)
    vocab = d2l.Vocab(tokens)
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens:
        corpus = corpus[:max_tokens]
    return corpus, vocab

class SeqDataLoader: 
    """加载序列数据的迭代器"""
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_sequential
        self.corpus, self.vocab = load_corpus_vocab(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)
    
def load_data(batch_size, num_steps,  #@save
                           use_random_iter=False, max_tokens=10000):
    """返回迭代器和词表"""
    data_iter = SeqDataLoader(
        batch_size, num_steps, use_random_iter, max_tokens)
    return data_iter, data_iter.vocab

In [13]:
batch_size, num_steps = 32, 5
texts = ['I am a cat'] * 1000
train_iter, vocab = load_data(batch_size, num_steps, max_tokens=10000)

In [14]:
num_epochs, lr = 10, 1
net = RNN(len(vocab), num_hiddens, device, get_params, init_rnn_state, rnn)
train(net, train_iter, vocab, lr, num_epochs, device)

NameError: name 'num_hiddens' is not defined

In [201]:
def predict(prefix, num_preds, net, vocab, device):
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: tensor([outputs[-1]], device=device).reshape(1, 1)
    for y in prefix[1:]:
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1, 1)))
    return ' '.join([vocab.idx_to_token[i] for i in outputs])

In [202]:
predict('I am a'.split(' '), 1, net, vocab, device)

'I am a cat'