## 简介
LSTM层: 在RNN上增加4个单元:3个控制单元+1个记忆单元,实现对状态H管控
### 记忆单元:C(memory cell)
用于控制状态H的记忆C,通过更新记忆C来控制状态H
计算方法: 结果有用tanh转为[-1,1]
    C=tanh(X*W_xc+H_pre*W_hc+b_c)
### 控制单元
输入门,输出门,遗忘门
计算方法相同:结果用sigma转为[0,1]
    I=sig(X*W_xi+H_pre*W_hi+b_i)
    F=sig(X*W_xf+H_pre*W_hf+b_f)
    O=sig(X*W_xo+H_pre*W_ho+b_o)
#### input gate(输入门:I)
控制当前输入对当前记忆的影响, 比如当前输入是冗余信息时当前记忆不受当前输入影响,生成的当前状态保留了更多历史状态的信息
#### output gate(输出门:O)
控制记忆对状态H的影响, 比如当前记忆有时效性不应该对当前状态有影响 
#### forget gate(遗忘门:F)  
控制历史记忆对当前记忆的影响, 比如历史记忆无参考价值
### 工作原理
1. 用当前输入X和历史状态H_pre得到F,I,C_1,O
2. C_2=tanh(F.*C_2_pre+I.*C_1) #当前原生记忆用输入格式化,历史加工记忆用遗忘格式化,两者相加得到当前加工记忆
3. H=O.*C_2 #当前加工记忆用输出格式化后生成当前状态


## 模型定义
LSTM层原始实现

In [1]:
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

class Lit_LSTMModel(pl.LightningModule):
    def __init__(self, vocab_size, nums_hidden, nums_layers, lr,sigma=0.01):
        super(Lit_LSTMModel, self).__init__()
        self.save_hyperparameters()
        #input gate, forget gate, cell gate, output gate
        self.w_xi=torch.nn.Parameter(torch.randn(vocab_size, nums_hidden)*sigma)
        self.w_hi=torch.nn.Parameter(torch.randn(nums_hidden, nums_hidden)*sigma)
        self.b_i=torch.nn.Parameter(torch.zeros(nums_hidden))
        self.w_xf=torch.nn.Parameter(torch.randn(vocab_size, nums_hidden)*sigma)
        self.w_hf=torch.nn.Parameter(torch.randn(nums_hidden, nums_hidden)*sigma)
        self.b_f=torch.nn.Parameter(torch.zeros(nums_hidden))
        self.w_xc=torch.nn.Parameter(torch.randn(vocab_size, nums_hidden)*sigma)
        self.w_hc=torch.nn.Parameter(torch.randn(nums_hidden, nums_hidden)*sigma)
        self.b_c=torch.nn.Parameter(torch.zeros(nums_hidden))
        self.w_xo=torch.nn.Parameter(torch.randn(vocab_size, nums_hidden)*sigma)
        self.w_ho=torch.nn.Parameter(torch.randn(nums_hidden, nums_hidden)*sigma)
        self.b_o=torch.nn.Parameter(torch.zeros(nums_hidden))
        #y
        self.w_hy=torch.nn.Parameter(torch.randn(nums_hidden, vocab_size)*sigma)
        self.b_y=torch.nn.Parameter(torch.zeros(vocab_size))

    def forward(self, x, h_c=None):
        if h_c is None:
            h=torch.zeros(x.size(0), self.hparams.nums_hidden,device=x.device)
            c=torch.zeros(x.size(0), self.hparams.nums_hidden,device=x.device)
        else:
            h,c=h_c
        x=torch.nn.functional.one_hot(x, num_classes=self.hparams.vocab_size).float()
        output=[]
        for i in range(x.size(1)):
            i_gate=torch.sigmoid(x[:,i,:]@self.w_xi+self.b_i+h@self.w_hi)
            f_gate=torch.sigmoid(x[:,i,:]@self.w_xf+self.b_f+h@self.w_hf)
            c_gate=torch.tanh(x[:,i,:]@self.w_xc+self.b_c+h@self.w_hc)
            c=f_gate*c+i_gate*c_gate
            o_gate=torch.sigmoid(x[:,i,:]@self.w_xo+self.b_o+h@self.w_ho)
            h=o_gate*torch.tanh(c)
            output.append(h@self.w_hy+self.b_y)
        return torch.stack(output, dim=1), (h,c)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_pred,_ = self(x) #y_pred.shape: (batch_size, seq_len, output_size)
        loss= torch.nn.functional.cross_entropy(y_pred.view(-1, y_pred.size(-1)), y.view(-1)) #输入是(batchsize* seq_len, vocab_size)和(batcsize* seq_len),拉平子序列计算单字符损失
        self.log('train_loss', loss, prog_bar=True, logger=True, on_epoch=True,on_step=True) 
        #perplexeity用于评估大段文本的好坏,单字符loss不适合评估大段文本
        self.log('train_perplexity', torch.exp(loss), prog_bar=True, logger=True, on_epoch=True,on_step=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred,_ = self(x)
        loss= torch.nn.functional.cross_entropy(y_pred.view(-1, y_pred.size(-1)), y.view(-1))
        self.log('val_loss', loss, prog_bar=True, logger=True,on_epoch=True)
        self.log('val_perplexity', torch.exp(loss), prog_bar=True, logger=True,on_epoch=True)
        return loss
    
    
    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=self.hparams.lr)
    

## 模型定义(api)
和torch.RNN相同LSTM也因为有num_layers参数,h需要转为3D:(num_layers,batch_size,hidden_size)


In [None]:
import torch
import pytorch_lightning as pl

import torch.nn as nn

class LstmModel_api(Lit_LSTMModel):
    def __init__(self, vocab_size, nums_hidden, nums_layers, lr,sigma=0.01):
        super(LstmModel_api, self).__init__(vocab_size, nums_hidden, nums_layers, lr,sigma)
        self.lstm=nn.LSTM(vocab_size, nums_hidden, nums_layers, batch_first=True)
        self.fc=nn.Linear(nums_hidden, vocab_size)
        
    def forward(self, x, h_c=None):
        if h_c is None:
            h_c=(torch.zeros(self.hparams.nums_layers, x.size(0), self.hparams.nums_hidden,device=x.device),
                  torch.zeros(self.hparams.nums_layers, x.size(0), self.hparams.nums_hidden,device=x.device))
        x=torch.nn.functional.one_hot(x, num_classes=self.hparams.vocab_size).float()
        x,h_c=self.lstm(x,h_c)
        x=self.fc(x)
        return x,h_c
    
    
   
    



## 数据集加载

In [3]:
import requests
import os
import re
class LitLoadData_timeMachine(pl.LightningDataModule):
    def __init__(self, batch_size=32,seq_length=5,pin_memory=True,nums_train=10000,nums_val=5000):
        super().__init__()
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.pin_memory = pin_memory
        self.nums_train = nums_train
        self.nums_val = nums_val
        self.prepare_data()
        self.corpus_indices, self.char_to_idx, self.idx_to_char, self.vocab_size = self.load_data_time_machine()
        
    def prepare_data(self):
        url = 'http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt'
        #文件是否存在
        if os.path.exists('../data/timemachine.txt'):
            return
        #下载文件
        r = requests.get(url, stream=True)
        with open('../data/timemachine.txt', 'wb') as f:
            f.write(r.content)

    def load_data_time_machine(self):
        with open('../data/timemachine.txt') as f:
            corpus_chars = f.read()
        #非字母替换为空格,并转为小写
        corpus_chars = re.sub('[^A-Za-z]+', ' ', corpus_chars).lower()
        #corpus_chars统计字符集,共26个字母+1个空格
        char_set=set(corpus_chars) 
        #增加'<unknown>'字符,防止用户输入非上述字母内容
        char_set.add('<unknown>')
        #索引到字符的映射
        idx_to_char = list(char_set) 
        #字符到索引的映射
        char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
        vocab_size = len(char_to_idx)  #28个字符
        corpus_indices = [char_to_idx[char] for char in corpus_chars] # 将每个字符转化为索引
        return corpus_indices, char_to_idx, idx_to_char, vocab_size #返回索引列表,字符到索引的映射,索引到字符的映射,字典大小

    def setup(self, stage=None):
        self.corpus_indices, self.char_to_idx, self.idx_to_char, self.vocab_size = self.load_data_time_machine()
        #self.corpus_indices = torch.tensor(self.corpus_indices) 
        #self.train_indices = self.corpus_indices[0: int(len(self.corpus_indices) * 0.8)] #前80%作为训练集
        #self.valid_indices = self.corpus_indices[int(len(self.corpus_indices) * 0.8):] #后20%作为验证集
        
        #d2l: step=1提取子序列,子序列个数=字符总数-子序列长度; 常规方法是等分,子序列个数=字符总数/子序列长度
        array=torch.tensor([self.corpus_indices[i:i+self.seq_length+1] for i in range(len(self.corpus_indices)-self.seq_length)])
        self.train_indices = array[0: self.nums_train] 
        self.valid_indices = array[self.nums_train: self.nums_train + self.nums_val]

    def train_dataloader(self):
        train_dataset = self.__dateset_d2l(self.train_indices)
        return torch.utils.data.DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4, pin_memory=self.pin_memory)
    
    def val_dataloader(self):
        valid_dataset = self.__dateset_d2l(self.valid_indices)
        return torch.utils.data.DataLoader(valid_dataset, batch_size=self.batch_size, shuffle=False, num_workers=4, pin_memory=self.pin_memory)

    #子序列个数=N,输入取1:N-1,输出取2:N
    def __dateset_d2l(self, data_indices):
        return torch.utils.data.TensorDataset(data_indices[:, :-1], data_indices[:, 1:])

    #用于创建数据集对象。它根据序列长度将数据索引分割成多个样本，并将每个样本的输入和目标数据分别返回
    def __dataset(self, data_indices):
        num_samples = (len(data_indices) - 1) // self.seq_length #样本个数
        data_indices = data_indices[:num_samples * self.seq_length] #只取前num_samples * self.seq_length个字符
        data_indices = data_indices.reshape((num_samples, self.seq_length)) 
        return torch.utils.data.TensorDataset(data_indices[:, :-1], data_indices[:, 1:]) #每个样本的输入是前seq_length-1个字符,输出是后seq_length-1个字符

## 工作流程

In [4]:
if __name__ == '__main__':
    data_module = LitLoadData_timeMachine(batch_size=1024,seq_length=32,pin_memory=False)
    data_module.setup()

    ##############RNN模型训练################
    model=Lit_LSTMModel(
        vocab_size=data_module.vocab_size,
        nums_hidden=32,
        nums_layers=1,
        lr=4
    )
    model_api=LstmModel_api(
        vocab_size=data_module.vocab_size,
        nums_hidden=32,
        nums_layers=1,
        lr=4
    )

    checkpoint_callback=pl.callbacks.ModelCheckpoint(
        monitor='val_perplexity',
        dirpath='checkPoint-logs/RNNModel_v2',
        filename='RNNModel_v2_{epoch:02d}_{val_perplexity:.2f}',
        #save_top_k=3, # save the top 3 models
        mode='min',
    )
    trainer = pl.Trainer(
        max_epochs=100,
        gradient_clip_algorithm='norm', #梯度裁剪算法,等同clip_gradients(self, grad_clip_val, model)
        gradient_clip_val=1,
        accelerator='cpu',
        #devices=1,
        logger=TensorBoardLogger('tensorBoard-logs/', name='RNNModel_v2'),
        callbacks=[checkpoint_callback]
                        )
    #trainer.fit(model, data_module) 
    trainer.fit(model_api, data_module)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\zncyxiong\AppData\Local\anaconda3\envs\pytorch_python3128\Lib\site-packages\pytorch_lightning\trainer\setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
c:\Users\zncyxiong\AppData\Local\anaconda3\envs\pytorch_python3128\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:654: Checkpoint directory D:\algorithm\deeplearning_zh.d2l.ai\pytorch\checkPoint-logs\RNNModel_v2 exists and is not empty.

  | Name         | Type   | Params | Mode 
------------------------------------------------
0 | lstm         | LSTM   | 7.9 K  | train
1 | fc           | Linear | 924    | train
  | other params | n/a    | 8.7 K  | n/a  
------------------------------------------------
17.6 K    Trainable params
0         Non-trainable params
17.6 K    Total params
0.070     Total estimated model params size (MB)
2         Modules i

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\zncyxiong\AppData\Local\anaconda3\envs\pytorch_python3128\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

c:\Users\zncyxiong\AppData\Local\anaconda3\envs\pytorch_python3128\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.
c:\Users\zncyxiong\AppData\Local\anaconda3\envs\pytorch_python3128\Lib\site-packages\pytorch_lightning\loops\fit_loop.py:310: The number of training batches (10) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 99: 100%|██████████| 10/10 [00:09<00:00,  1.07it/s, v_num=7, train_loss_step=1.490, train_perplexity_step=4.420, val_loss=1.870, val_perplexity=6.540, train_loss_epoch=1.470, train_perplexity_epoch=4.350]

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 10/10 [00:09<00:00,  1.07it/s, v_num=7, train_loss_step=1.490, train_perplexity_step=4.420, val_loss=1.870, val_perplexity=6.540, train_loss_epoch=1.470, train_perplexity_epoch=4.350]


# tensorboard

In [23]:
%load_ext tensorboard
%tensorboard --logdir pytorch/tensorBoard-logs/RNNModel_v1

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
