# RNN 简介

# 时间序列模型概念简介
简介时序模型的不同之处

# RNN网络结构图
![rnn](images/rnn.png)

RNN公式：
![rnn_rule](images/rnn_rule.png)

# 观察torch.nn.RNN的输入输出

In [1]:
import torch
import torch.nn as nn

In [3]:
# Single-layer, unidirectional RNN.
# batch_first=True means the input is laid out as [batch_size, seq_len, input_size].
single_rnn = nn.RNN(
    input_size=4,
    hidden_size=3,
    num_layers=1,
    batch_first=True,
)

# Random input of shape [batch_size=1, seq_len=5, input_size=4].
input = torch.randn((1, 5, 4))

# output: [batch_size, seq_len, hidden_size=3]
# h_n:    [num_layers=1, batch_size, hidden_size=3]
output, h_n = single_rnn(input)

print(output, output.shape, h_n, h_n.shape, sep='\n')

tensor([[[ 0.1926, -0.5641, -0.1246],
         [ 0.3857, -0.5942,  0.3756],
         [-0.7565, -0.9860, -0.6089],
         [ 0.1879, -0.8991, -0.3685],
         [ 0.4113, -0.8877, -0.5903]]], grad_fn=<TransposeBackward1>)
torch.Size([1, 5, 3])
tensor([[[ 0.4113, -0.8877, -0.5903]]], grad_fn=<StackBackward0>)
torch.Size([1, 1, 3])


In [4]:
# Hidden state at time-step index 2 (the third step) for every sequence in the batch.
output[:, 2, :]

tensor([[-0.7565, -0.9860, -0.6089]], grad_fn=<SliceBackward0>)

In [6]:
# Single-layer, bidirectional RNN applied to the same input.
bi_rnn = nn.RNN(
    input_size=4,
    hidden_size=3,
    num_layers=1,
    batch_first=True,
    bidirectional=True,
)

# bi_output holds both directions side by side on the last axis ([1, 5, 6] here);
# bi_h_n holds one final state per direction ([2, 1, 3] here).
bi_output, bi_h_n = bi_rnn(input)

print(bi_output, bi_output.shape, bi_h_n, bi_h_n.shape, sep='\n')

tensor([[[ 0.9352, -0.1535,  0.3298,  0.8335, -0.9024, -0.6281],
         [ 0.3034, -0.5223, -0.9183,  0.8817, -0.4630, -0.6553],
         [ 0.9745, -0.4444,  0.7889,  0.9376, -0.6616, -0.8148],
         [ 0.7716, -0.2623, -0.8482,  0.7856, -0.1788, -0.9494],
         [ 0.7237, -0.5549, -0.1000,  0.7960, -0.4034, -0.1305]]],
       grad_fn=<TransposeBackward1>)
torch.Size([1, 5, 6])
tensor([[[ 0.7237, -0.5549, -0.1000]],

        [[ 0.8335, -0.9024, -0.6281]]], grad_fn=<StackBackward0>)
torch.Size([2, 1, 3])


# 从零手搓 RNN 

## forward

In [8]:
import torch
import torch.nn as nn

In [9]:
# Problem dimensions.
batch_size = 2   # sequences per batch
seq_len = 3      # time steps per sequence
input_size = 2   # features per time step
hidden_size = 3  # width of the hidden state
num_layers = 1   # number of stacked RNN layers

# Random input batch and an all-zero initial hidden state.
input = torch.randn((batch_size, seq_len, input_size))
h_prev = torch.zeros((batch_size, hidden_size))

In [10]:
# Reference implementation: PyTorch's built-in RNN.
rnn = nn.RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
)

# The initial state is passed with a leading layer axis:
# [batch_size, hidden_size] -> [1, batch_size, hidden_size].
rnn_output, h_n = rnn(input, h_prev[None])

print(rnn_output, rnn_output.shape, h_n, h_n.shape, sep='\n')

tensor([[[ 0.8950,  0.4833, -0.5547],
         [ 0.1424,  0.4373, -0.6090],
         [-0.5399,  0.7522, -0.0882]],

        [[ 0.2022,  0.8119, -0.3363],
         [ 0.0075,  0.5236, -0.5589],
         [ 0.6017,  0.2810, -0.6089]]], grad_fn=<TransposeBackward1>)
torch.Size([2, 3, 3])
tensor([[[-0.5399,  0.7522, -0.0882],
         [ 0.6017,  0.2810, -0.6089]]], grad_fn=<StackBackward0>)
torch.Size([1, 2, 3])


In [11]:
# List the RNN's parameter tensors: weight_ih_l0, weight_hh_l0, bias_ih_l0 and bias_hh_l0.
rnn.state_dict()

OrderedDict([('weight_ih_l0',
              tensor([[ 0.5479,  0.4916],
                      [-0.2934, -0.2234],
                      [-0.2745, -0.0150]])),
             ('weight_hh_l0',
              tensor([[ 0.1761, -0.3001,  0.5395],
                      [-0.2634, -0.2903,  0.3202],
                      [-0.4855,  0.2617, -0.0028]])),
             ('bias_ih_l0', tensor([ 0.4256,  0.4981, -0.3173])),
             ('bias_hh_l0', tensor([ 0.1950,  0.4163, -0.2147]))])

In [12]:
class RNN:
    """Hyper-parameter holder for the hand-written RNN.

    The forward computation itself lives in the module-level function
    ``rnn_forward``; this class only records the configuration.
    """

    def __init__(self, input_size=4, hidden_size=3, num_layers=1, batch_first=True):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first


def rnn_forward(input, W_ih, W_hh, b_ih, b_hh, h_prev):
    """Run a single-layer, unidirectional tanh RNN over a batch-first input.

    At every time step ``t`` the hidden state is updated as
    ``h_t = tanh(x_t @ W_ih^T + b_ih + h_{t-1} @ W_hh^T + b_hh)``.

    Args:
        input: tensor of shape (batch_size, seq_len, input_size).
        W_ih: input-to-hidden weight, shape (hidden_size, input_size).
        W_hh: hidden-to-hidden weight, shape (hidden_size, hidden_size).
        b_ih: input-to-hidden bias, shape (hidden_size,).
        b_hh: hidden-to-hidden bias, shape (hidden_size,).
        h_prev: initial hidden state, shape (batch_size, hidden_size).

    Returns:
        A tuple ``(h_output, h_n)``: ``h_output`` has shape
        (batch_size, seq_len, hidden_size) and holds the hidden state of every
        time step; ``h_n`` has shape (1, batch_size, hidden_size) and is the
        state after the last step (or ``h_prev`` itself when seq_len is 0).
    """
    batch_size, seq_len, _ = input.shape
    # W_ih is (hidden_size, input_size), so its first axis is the hidden size.
    hidden_size = W_ih.shape[0]

    # Allocate the output with the input's dtype and device so float64 or
    # non-CPU inputs are not silently converted.
    h_output = input.new_zeros(batch_size, seq_len, hidden_size)

    # The transposed weights are the same for every step; take them once.
    W_ih_t = W_ih.t()  # (input_size, hidden_size)
    W_hh_t = W_hh.t()  # (hidden_size, hidden_size)

    for t in range(seq_len):
        x_t = input[:, t, :]             # (batch_size, input_size)
        w_ih_times_x = x_t @ W_ih_t      # (batch_size, hidden_size)
        w_hh_times_h = h_prev @ W_hh_t   # (batch_size, hidden_size)
        h_prev = torch.tanh(w_ih_times_x + b_ih + w_hh_times_h + b_hh)
        h_output[:, t, :] = h_prev

    # Add a leading axis so the final state is (1, batch_size, hidden_size),
    # the same layout the built-in RNN uses for a single unidirectional layer.
    return h_output, h_prev.unsqueeze(0)

In [13]:
# Run the built-in nn.RNN and the hand-written rnn_forward on the same input,
# weights and initial state, then print both results for comparison.
rnn_output, h_n = rnn(input, h_prev.unsqueeze(0))
custom_output, custom_hn = rnn_forward(input, rnn.weight_ih_l0, rnn.weight_hh_l0, rnn.bias_ih_l0, rnn.bias_hh_l0, h_prev)
# Each label is printed with the tensors it actually describes.
print('custom', custom_output, custom_output.shape, custom_hn, custom_hn.shape, sep='\n')
print('torch api', rnn_output, rnn_output.shape, h_n, h_n.shape, sep='\n')

custom
tensor([[[ 0.8950,  0.4833, -0.5547],
         [ 0.1424,  0.4373, -0.6090],
         [-0.5399,  0.7522, -0.0882]],

        [[ 0.2022,  0.8119, -0.3363],
         [ 0.0075,  0.5236, -0.5589],
         [ 0.6017,  0.2810, -0.6089]]], grad_fn=<TransposeBackward1>)
torch.Size([2, 3, 3])
tensor([[[-0.5399,  0.7522, -0.0882],
         [ 0.6017,  0.2810, -0.6089]]], grad_fn=<StackBackward0>)
torch.Size([1, 2, 3])
torch api
tensor([[[ 0.8950,  0.4833, -0.5547],
         [ 0.1424,  0.4373, -0.6090],
         [-0.5399,  0.7522, -0.0882]],

        [[ 0.2022,  0.8119, -0.3363],
         [ 0.0075,  0.5236, -0.5589],
         [ 0.6017,  0.2810, -0.6089]]], grad_fn=<CopySlices>)
torch.Size([2, 3, 3])
tensor([[[-0.5399,  0.7522, -0.0882],
         [ 0.6017,  0.2810, -0.6089]]], grad_fn=<UnsqueezeBackward0>)
torch.Size([1, 2, 3])


## 训练

### 数据预处理
- 读取数据集：
  首先需要获取周杰伦的歌词数据集，可以从网络上搜索整理其歌词文本，将所有歌词保存到一个文本文件中，如jaychou_lyrics.txt。使用 Python 的open()函数读取文件内容，并进行必要的字符编码转换 。
- 建立字符索引：
将歌词中的每个字符映射为一个从 0 开始的连续整数索引，构建字符到索引的字典char_to_idx以及索引到字符的字典idx_to_char。通过遍历歌词文本，找出所有不同的字符，然后为每个字符分配一个唯一的索引。同时，可以得到词典大小vocab_size，即不同字符的数量 。
- 数据采样:

  对处理后的数据进行采样，以便生成训练所需的小批量数据。常见的采样方式有随机采样和相邻采样两种 ：
- 随机采样：
  每次从数据中随机选择一定长度的连续字符序列作为一个样本，同时对应的下一个字符作为该样本的标签。例如，若设定时间步数为num_steps，则每次随机选取num_steps个连续字符作为输入样本，其后面的一个字符作为输出标签。
- 相邻采样：
  按照顺序依次选取连续的字符序列作为样本和标签，即第i个样本的输入是从i到i + num_steps - 1的字符序列，其标签则是从i + 1到i + num_steps的字符序列。

### 模型训练
- 参数初始化：初始化模型的参数，如词嵌入维度embedding_dim、隐藏层维度hidden_dim等，并定义损失函数和优化器。例如，可以使用交叉熵损失函数nn.CrossEntropyLoss()和随机梯度下降优化器torch.optim.SGD()。
- 训练循环：在训练循环中，按照设定的批次大小和采样方式获取训练数据，将数据输入到模型中进行前向传播，计算损失值，然后进行反向传播并使用优化器更新模型参数。在每个训练周期结束时，可以打印出当前的损失值，以观察模型的训练进度。

In [1]:
def _detach_hidden(hidden):
    """Detach a hidden state (a tensor, or a tuple/list of tensors) from its graph."""
    if hidden is None:
        return None
    if isinstance(hidden, (tuple, list)):
        return tuple(_detach_hidden(h) for h in hidden)
    return hidden.detach()


def train(model, data_loader, criterion, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``data_loader``.

    Args:
        model: callable as ``model(batch_x, hidden)`` returning
            ``(output, hidden)``; ``hidden`` is ``None`` for the first batch
            of every epoch.
        data_loader: iterable of ``(batch_x, batch_y)`` pairs. It does not
            need to support ``len()``.
        criterion: loss called as ``criterion(output, batch_y.reshape(-1))``.
        optimizer: optimizer over ``model.parameters()``.
        num_epochs: number of passes over ``data_loader``.

    Prints the mean batch loss after every epoch (``nan`` when the loader
    yielded no batches). Returns ``None``.

    NOTE(review): the hidden state is carried from one batch to the next, so
    all batches are presumably expected to share one batch size — confirm
    against the data loader.
    """
    # Make sure layers such as dropout are in training mode, even if the
    # model was previously switched to eval mode.
    model.train()
    for epoch in range(num_epochs):
        hidden = None
        total_loss = 0.0
        num_batches = 0
        for batch_x, batch_y in data_loader:
            optimizer.zero_grad()
            output, hidden = model(batch_x, hidden)
            # reshape (rather than view) also accepts non-contiguous targets.
            loss = criterion(output, batch_y.reshape(-1))
            loss.backward()
            # Clip the global gradient norm to 5 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()
            total_loss += loss.item()
            # Keep the state values but drop their history, so the next
            # backward pass does not reach into this batch's graph.
            hidden = _detach_hidden(hidden)
            num_batches += 1
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f'Epoch {epoch + 1}, Loss: {mean_loss}')

### 模型测试与效果评估
- 生成歌词：训练完成后，可以使用训练好的模型来生成周杰伦风格的歌词。给定一个起始字符或字符序列，通过模型预测下一个可能的字符，然后将预测的字符作为新的输入，继续预测下一个字符，以此类推，生成一段歌词 。

In [2]:
def generate_text(model, char_to_idx, idx_to_char, start_text, length):
    """Greedily generate ``length`` characters that continue ``start_text``.

    Args:
        model: callable as ``model(input, hidden)`` returning
            ``(output, hidden)``. ``output`` is assumed to carry the
            vocabulary scores on its last axis, with the last position of the
            sequence in its last row — TODO confirm against the model
            definition, which is not part of this function.
        char_to_idx: mapping from character to integer index.
        idx_to_char: mapping (or sequence) from integer index to character.
        start_text: non-empty seed text; every character must be a key of
            ``char_to_idx``.
        length: number of characters to append.

    Returns:
        ``start_text`` followed by the generated characters.

    Raises:
        ValueError: if ``start_text`` is empty.
        KeyError: if ``start_text`` contains a character missing from
            ``char_to_idx``.
    """
    if not start_text:
        raise ValueError('start_text must contain at least one character')

    model.eval()
    pieces = [start_text]
    with torch.no_grad():
        # Shape (1, len(start_text)): a batch holding the whole seed.
        input_text = torch.tensor([[char_to_idx[char] for char in start_text]])
        hidden = None
        for _ in range(length):
            output, hidden = model(input_text, hidden)
            # Flatten all leading axes and keep only the scores for the last
            # position, so seeds longer than one character work and both
            # (N, vocab) and (batch, seq, vocab) outputs are accepted.
            last_scores = output.reshape(-1, output.shape[-1])[-1]
            # The highest score wins; softmax would not change the winner.
            next_idx = int(last_scores.argmax())
            pieces.append(idx_to_char[next_idx])
            # Feed the prediction back in as a (1, 1) batch.
            input_text = torch.tensor([[next_idx]])
    return ''.join(pieces)

- 效果评估：可以从多个角度评估生成歌词的效果，如歌词的通顺性、连贯性、是否符合周杰伦的风格等。一种简单的方法是人工观察和评价生成的歌词，判断其是否具有一定的合理性和艺术感。也可以使用困惑度（Perplexity）等自动评估指标来定量地评估模型的性能，但困惑度并不能完全准确地反映生成文本的质量，仅供参考。

In [3]:
def calculate_perplexity(model, data_loader, criterion):
    """Return the perplexity of ``model`` over ``data_loader`` as a 0-dim tensor.

    The model is put in eval mode and called as ``model(batch_x, None)`` for
    every batch with gradients disabled. Each batch loss is weighted by the
    number of target elements, and the result is ``exp`` of the weighted mean.

    NOTE(review): weighting by ``numel()`` assumes ``criterion`` returns a
    mean over the target elements — confirm the criterion's reduction.
    """
    model.eval()
    weighted_loss_sum = 0
    token_count = 0
    with torch.no_grad():
        for inputs, targets in data_loader:
            n_tokens = targets.numel()
            scores, _hidden = model(inputs, None)
            batch_loss = criterion(scores, targets.view(-1)).item()
            weighted_loss_sum += batch_loss * n_tokens
            token_count += n_tokens
    mean_loss = weighted_loss_sum / token_count
    return torch.tensor(mean_loss).exp()

通过以上步骤，就可以利用周杰伦的歌词训练 PyTorch RNN 模型，并对生成歌词的效果进行测试和评估 。需要注意的是，由于歌词的生成具有一定的主观性和创造性，模型的表现可能会因多种因素而有所不同，可通过调整模型结构、参数、训练数据等方式来进一步优化模型的性能 。

In [None]:
s