## 简介
指定一个时间步骤的长度M,将数字序列按这个长度等分N个子序列,每个子序列(时间步骤)的数字(词元)计算历史状态和当前值计算当前状态,当前状态预测下一个数字,当前状态传递给下一个数字
    H_cur=activation(X*W_xh + H_pre*W_hh + b_h)  #当前状态计算
    Y=H_cur*W_hy    #根据当前状态计算Y
其中参数:     
    W_xh:当前值->当前状态的权重
    W_hh:历史状态->当前状态的权重
    b_h: 当前状态的偏置
    W_hy:当前状态->预测值


### 模型质量评估方式: 
损失函数是预测数字和实际值的交叉熵损失. 但评估的是一段生成内容的连续性和准确性,用perlexity(困惑度)表示,计算: exp(这段内容的每个数字的交叉熵损失的平均值)

### 数据格式:
一篇文章->一段顺序长数字序列,截取[0:-2]->输入值: 等分batchsize个batch->每个batch等分M个子序列,每个子序列视为一个时间步骤
                            截取[1:-1]->真实值,就是输入值的下一个数字       
### 注:     
普通意义的batchsize表示一个epoch中的一次迭代取多少数据样本(比如一次取32张图像),迭代次数是样本总量/batchsize
在这里表示batchsize=2表示原始序列分成2个长序列,每次迭代长序列中顺序取一个子序列,等于2*nums_input的数据样本,迭代次数是样本总量/batchize/nums_input


## 模型定义

In [8]:
import torch
import pytorch_lightning as pl

class Lit_RNNModel2(pl.LightningModule):
    def __init__(self, input_size, hidden_size, output_size, lr=0.001):
        super(Lit_RNNModel2, self).__init__()
        self.save_hyperparameters()
        self.rnn = torch.nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, output_size)
        self.lr = lr

    def forward(self, x, h):
        out, h = self.rnn(x, h)
        out = self.fc(out)
        return out, h

    def training_step(self, batch, batch_idx):
        x, y = batch
        h = torch.zeros(1, x.size(0), self.hparams.hidden_size, device=x.device)
        y_pred, _ = self(x, h)
        loss = torch.nn.functional.cross_entropy(y_pred.view(-1, y_pred.size(-1)), y.view(-1))
        self.log('train_loss', loss)
        self.log('train_perplexity', torch.exp(loss))
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        h = torch.zeros(1, x.size(0), self.hparams.hidden_size, device=x.device)
        y_pred, _ = self(x, h)
        loss = torch.nn.functional.cross_entropy(y_pred.view(-1, y_pred.size(-1)), y.view(-1))
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

class Lit_RNNModel(pl.LightningModule):
    #nums_input和nums_outputs相等,值为字典大小,即one-hot向量的长度
    #nums_hidden是神经元个数,可以自己设定
    def __init__(self, nums_input, nums_hidden,sigma=0.01,lr=0.01):
        super(Lit_RNNModel, self).__init__()
        self.save_hyperparameters()
        self.W_xh = torch.nn.Parameter(torch.randn(nums_input, nums_hidden)*sigma)
        self.W_hh = torch.nn.Parameter(torch.randn(nums_hidden, nums_hidden)*sigma)
        self.W_hy = torch.nn.Parameter(torch.randn(nums_hidden, nums_input)* sigma)
        self.b_h = torch.nn.Parameter(torch.zeros(nums_hidden))

    def __one_hot(self, x):
        #x.shape: (batch_size, seq_len), seq_len是子序列长度
        x = torch.nn.functional.one_hot(x, num_classes=self.hparams.nums_input).float()
        return x #shape: (batch_size, seq_len, input_size)

    def forward(self, x,h): 
        outputs=[]
        x = self.__one_hot(x)
        #x.shape: (batch_size, seq_len, input_size), seq_len是子序列长度, input_size是字典大小
        for i in range(x.size(1)): #子序列的词元作循环
            h= torch.tanh(x[:,i,:]@self.W_xh + h@self.W_hh + self.b_h) 
            y = h@self.W_hy
            outputs.append(y)
        return torch.stack(outputs, dim=1),h
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        h= torch.zeros(x.size(0), self.hparams.nums_hidden,device=x.device)
        y_pred,_ = self(x,h) #y_pred.shape: (batch_size, seq_len, output_size)
        loss = torch.nn.functional.cross_entropy(y_pred, y) #shape: (batch_size, seq_len, output_size)
        self.log('train_loss', loss, prog_bar=True, logger=True, on_epoch=True,on_step=True)
        self.log('train_perplexity', torch.exp(loss), prog_bar=True, logger=True, on_epoch=True,on_step=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        h = torch.zeros(x.size(0), self.hparams.nums_hidden,device=x.device)
        y_pred,_ = self(x,h)
        print(y_pred.shape)
        print(y.shape)
        loss = torch.nn.functional.cross_entropy(y_pred, y)
        self.log('val_loss', loss, prog_bar=True, logger=True, on_epoch=True,on_step=True)
        self.log('val_perplexity', torch.exp(loss), prog_bar=True, logger=True, on_epoch=True,on_step=True)
        return loss
    
    
    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=self.hparams.lr)
    

## 数据集加载

In [14]:
import requests
import os
import re
class LitLoadData_timeMachine(pl.LightningDataModule):
    def __init__(self, batch_size=32,seq_length=5,pin_memory=True,nums_train=10000,nums_val=5000):
        super().__init__()
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.pin_memory = pin_memory
        self.nums_train = nums_train
        self.nums_val = nums_val
        self.prepare_data()
        self.corpus_indices, self.char_to_idx, self.idx_to_char, self.vocab_size = self.load_data_time_machine()
        
    def prepare_data(self):
        url = 'http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt'
        #文件是否存在
        if os.path.exists('../data/timemachine.txt'):
            return
        #下载文件
        r = requests.get(url, stream=True)
        with open('../data/timemachine.txt', 'wb') as f:
            f.write(r.content)

    def load_data_time_machine(self):
        with open('../data/timemachine.txt') as f:
            corpus_chars = f.read()
        #非字母替换为空格,并转为小写
        corpus_chars = re.sub('[^A-Za-z]+', ' ', corpus_chars).lower()
        corpus_chars = corpus_chars[0:10000] # 只取前1万个字符
        #corpus_chars统计字符集,共26个字母+1个空格
        char_set=set(corpus_chars) 
        #增加'<unknown>'字符,防止用户输入非上述字母内容
        char_set.add('<unknown>')
        #索引到字符的映射
        idx_to_char = list(char_set) 
        #字符到索引的映射
        char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
        vocab_size = len(char_to_idx)  #28个字符
        corpus_indices = [char_to_idx[char] for char in corpus_chars] # 将每个字符转化为索引
        return corpus_indices, char_to_idx, idx_to_char, vocab_size #返回索引列表,字符到索引的映射,索引到字符的映射,字典大小

    def setup(self, stage=None):
        self.corpus_indices, self.char_to_idx, self.idx_to_char, self.vocab_size = self.load_data_time_machine()
        #self.corpus_indices = torch.tensor(self.corpus_indices) 
        #self.train_indices = self.corpus_indices[0: int(len(self.corpus_indices) * 0.8)] #前80%作为训练集
        #self.valid_indices = self.corpus_indices[int(len(self.corpus_indices) * 0.8):] #后20%作为验证集
        
        #d2l
        array=torch.tensor([self.corpus_indices[i:i+self.seq_length+1] for i in range(len(self.corpus_indices)-self.seq_length)])
        self.train_indices = array[0: self.nums_train] 
        self.valid_indices = array[self.nums_train: self.nums_train + self.nums_val]

    def train_dataloader(self):
        train_dataset = self.__dateset_d2l(self.train_indices)
        return torch.utils.data.DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4, pin_memory=self.pin_memory)
    
    def val_dataloader(self):
        valid_dataset = self.__dateset_d2l(self.valid_indices)
        return torch.utils.data.DataLoader(valid_dataset, batch_size=self.batch_size, shuffle=False, num_workers=4, pin_memory=self.pin_memory)

    
    def __dateset_d2l(self, data_indices):
        return torch.utils.data.TensorDataset(data_indices[:, :-1], data_indices[:, 1:])

    #用于创建数据集对象。它根据序列长度将数据索引分割成多个样本，并将每个样本的输入和目标数据分别返回
    def __dataset(self, data_indices):
        num_samples = (len(data_indices) - 1) // self.seq_length #样本个数
        data_indices = data_indices[:num_samples * self.seq_length] #只取前num_samples * self.seq_length个字符
        data_indices = data_indices.reshape((num_samples, self.seq_length)) 
        return torch.utils.data.TensorDataset(data_indices[:, :-1], data_indices[:, 1:]) #每个样本的输入是前seq_length-1个字符,输出是后seq_length-1个字符

## 数据集加载-test

原文内容

In [None]:
with open('../data/timemachine.txt') as f:
    corpus_chars = f.read()
corpus_chars[:60]

'The Time Machine, by H. G. Wells [1898]\n\n\n\n\nI\n\n\nThe Time Tra'

大写字母转小写字母
非字母的字符替换为空格

In [None]:
import re
corpus_chars_=re.sub('[^A-Za-z]+', ' ', corpus_chars).lower() #非字母替换为空格
corpus_chars_[:60]

'the time machine by h g wells i the time traveller for so it'

统计原文中的所有字符,共26个字母+1个空格
增加'<unknown>'字符,防止用户输入非上述字母内容

In [None]:
tmp = set(corpus_chars_)
tmp.add('<unknown>')
len(tmp)


28

原文格式化后字符个数

In [None]:
print(len(corpus_chars_))
print(len(corpus_chars))

173428
178979


batch数据

In [15]:
data_module = LitLoadData_timeMachine(batch_size=1024,seq_length=32,pin_memory=False)
data_module.setup()
for x,y in data_module.train_dataloader():
    print(x.shape)
    print(y.shape)
    break

torch.Size([1024, 32])
torch.Size([1024, 32])


## 工作流程

In [10]:
if __name__ == '__main__':
    data_module = LitLoadData_timeMachine(batch_size=1024,seq_length=32,pin_memory=False)
    model= Lit_RNNModel(nums_input=data_module.vocab_size, nums_hidden= 32,lr=1)
    model2= Lit_RNNModel2(input_size=data_module.vocab_size, hidden_size=32, output_size=data_module.vocab_size, lr=1)
    checkpoint_callback=pl.callbacks.ModelCheckpoint(
        monitor='perplexity',
        dirpath='checkPoint-logs/RNNModel_v1',
        filename='RNNModel_v1_{epoch:02d}_{perplexity:.2f}',
        #save_top_k=3, # save the top 3 models
        mode='min',
    )
    trainer = pl.Trainer(
        max_epochs=100,
        accelerator='cpu',
        #devices=1,
        logger=TensorBoardLogger('tensorBoard-logs/', name='RNNModel_v1'),
        callbacks=[checkpoint_callback]
                        )
    
    # Move model to the appropriate device

    
    trainer.fit(model2, data_module)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\zncyxiong\AppData\Local\anaconda3\envs\pytorch_python3128\Lib\site-packages\pytorch_lightning\trainer\setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.

  | Name | Type   | Params | Mode 
----------------------------------------
0 | rnn  | RNN    | 2.0 K  | train
1 | fc   | Linear | 924    | train
----------------------------------------
2.9 K     Trainable params
0         Non-trainable params
2.9 K     Total params
0.012     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\zncyxiong\AppData\Local\anaconda3\envs\pytorch_python3128\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: For unbatched 2-D input, hx should also be 2-D but got 3-D tensor