## Vanilla-LSTM-代码

### Numpy实现

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time
sns.set()

def get_vocab(file, lower = False):
    with open(file, 'r') as fopen:
        data = fopen.read() # 将文件中的所有数据读取进来。
    if lower:
        data = data.lower()
    
    vocab = list(set(data))
    return data, vocab

def embed_to_control(data, vocab):
    onehot = np.zeros((len(data), len(vocab)), dtype = np.float32)
    for i in range(len(data)):
        onehot[i, vocab.index(data[i])] = 1.0
    return onehot


text, text_vocab = get_vocab('../consumer.h', lower = False)
one_hot = embed_to_control(text, text_vocab)
print('len text: ', len(text))
print('len text_vocab: ', len(text_vocab))
print('one_hot shape: ', one_hot.shape)

epoch = 1000
learning_rate = 0.0001
batch_size = 64
sequence_length = int(12)
dimension = one_hot.shape[1]
print('dimension is :', dimension)
possible_batch_id = range(len(text) - sequence_length - 1)
hidden_dim = 128

len text:  15294
len text_vocab:  75
one_hot shape:  (15294, 75)
dimension is : 75


&emsp;&emsp;`tanh`激活函数为:

$$
\tanh x=\frac{\sinh x}{\cosh x}=\frac{e^{x}-e^{-x}}{e^{x}+e^{-x}}
$$

&emsp;&emsp;导数为:

$$
(\tanh x)^{\prime}=\operatorname{sech}^{2} x=1-\tanh ^{2} x
$$

In [2]:
def tanh(x, grad=False):
    if grad:
        output = np.tanh(x)
        return (1.0 - np.square(output))
    else:
        return np.tanh(x)

In [3]:
def softmax(x):
    """
    x: np.max(x)取的是二维数组x中的最大值。
    """
    exp_scores = np.exp(x - np.max(x))
    return exp_scores / (np.sum(exp_scores, axis=1, keepdims=True) + 1e-8)

&emsp;&emsp;多分类的交叉熵损失如下:

$$
L=\frac{1}{N} \sum_{i} L_{i}=-\frac{1}{N} \sum_{i} \sum_{c=1}^{M} y_{i c} \log \left(p_{i c}\right)
$$

&emsp;&emsp;其中$M$表示类别的数量，$y_{ic}$是符号函数(0或者1), 如果样本$i$的真实类别等于$c$取1，否者取0。$p_{ic}$表示观测样本$i$属于类别$c$的概率。

In [4]:
def cross_entropy(Y_hat, Y, epsilon=1e-12):
    Y_hat = np.clip(Y_hat, epsilon, 1. - epsilon)
    N = Y_hat.shape[0]
    return -np.sum(np.sum(Y * np.log(Y_hat + 1e-9))) / N

In [5]:
def backward_multiply_gate(w, x, dz):
    """
    w shape = (75, 128)
    x shape = (64, 128)
    dz shape = (64, 75)
    """
    dw = np.dot(dz.T, x) # shape = (75, 128)
    dx = np.dot(w.T, dz.T) # shape = (128, 64)
    return dw, dx


def backward_add_gate(x1, x2, dz):
    dx1 = dz * np.ones_like(x1)
    dx2 = dz * np.ones_like(x2)
    return dx1, dx2

&emsp;&emsp;首先依据公式创建三个门的权重:

$$
\begin{aligned}
\boldsymbol{I}_{t} &=\sigma\left(\boldsymbol{X}_{t} \boldsymbol{W}_{x i}+\boldsymbol{H}_{t-1} \boldsymbol{W}_{h i}+\boldsymbol{b}_{i}\right) \\
\boldsymbol{F}_{t} &=\sigma\left(\boldsymbol{X}_{t} \boldsymbol{W}_{x f}+\boldsymbol{H}_{t-1} \boldsymbol{W}_{h f}+\boldsymbol{b}_{f}\right) \\
\boldsymbol{O}_{t} &=\sigma\left(\boldsymbol{X}_{t} \boldsymbol{W}_{x o}+\boldsymbol{H}_{t-1} \boldsymbol{W}_{h o}+\boldsymbol{b}_{o}\right)
\end{aligned}
$$

In [6]:
W_hi = np.random.randn(hidden_dim, hidden_dim) / np.sqrt(hidden_dim)
W_hf = np.random.randn(hidden_dim, hidden_dim) / np.sqrt(hidden_dim)
W_ho = np.random.randn(hidden_dim, hidden_dim) / np.sqrt(hidden_dim)

&emsp;&emsp;之后需要产生**候选记忆单元**：

$$
\tilde{\boldsymbol{C}}_{t}=\tanh \left(\boldsymbol{X}_{t} \boldsymbol{W}_{x c}+\boldsymbol{H}_{t-1} \boldsymbol{W}_{h c}+\boldsymbol{b}_{c}\right)
$$

&emsp;&emsp;**记忆单元**:

$$
\boldsymbol{C}_{t}=\boldsymbol{F}_{t} \odot \boldsymbol{C}_{t-1}+\boldsymbol{I}_{t} \odot \tilde{\boldsymbol{C}}_{t}
$$

&emsp;&emsp;`LSTM`与`RNN、GRU`的区别是里面所含的状态有两个，一个是$C$，另外一个是$H$。

In [7]:
W_hc = np.random.randn(hidden_dim, hidden_dim) / np.sqrt(hidden_dim)

&emsp;&emsp;**隐状态**:

$$
\boldsymbol{H}_{t}=\boldsymbol{O}_{t} \odot \tanh \left(\boldsymbol{C}_{t}\right)
$$

&emsp;&emsp;$\tanh$是为了保证值是在`+1`到`-1`之间的。

&emsp;&emsp;之后我们再创建一个处理输入的权重矩阵，本来是有四个的$W_{xi}, W_{xf}, W_{xo}, W_{xc}$，这里我们为了方便起见，统一为一个$U$

In [8]:
U = np.random.randn(hidden_dim, dimension)

&emsp;&emsp;之后还有一个输出层的权重，也将其全部设置为$V$:

In [9]:
V = np.random.randn(dimension, hidden_dim) / np.sqrt(hidden_dim)

In [10]:
def sigmoid(x, grad=False):
    if grad:
        return sigmoid(x) * (1 - sigmoid(x))
    else:
        return 1 / (1 + np.exp(-x))

def forward_lstm_recurrent(x, c_state, h_state, U, W_hf, W_hi, W_hc, W_ho, V):
    
    # 计算输入的处理单元。
    mul_u = np.dot(x, U.T)
    
    # 计算遗忘门
    mul_Wf = np.dot(h_state, W_hf.T)
    add_Wf = mul_u + mul_Wf
    f = sigmoid(add_Wf)
    
    # 计算输入门
    mul_Wi = np.dot(h_state, W_hi.T)
    add_Wi = mul_u + mul_Wi
    i = sigmoid(add_Wi)
    
    # 计算候选记忆单元
    mul_Wc = np.dot(h_state, W_hc.T)
    add_Wc = mul_u + mul_Wc
    c_hat = tanh(add_Wc)
    
    # 记忆单元选择需要从之前的c_state中遗忘多少，从输入门中提取多少候选记忆单元。
    C = c_state * f + i * c_hat
    
    # 输出门
    mul_Wo = np.dot(h_state, W_ho.T)
    add_Wo = mul_u + mul_Wo
    o = sigmoid(add_Wo)
    
    # 计算隐藏状态。
    h = o * tanh(C)
    
    mul_v = np.dot(h, V.T)
    return (mul_u, mul_Wf, add_Wf, mul_Wi, add_Wi, mul_Wc, add_Wc, C, mul_Wo, add_Wo, h, mul_v, i, o, c_hat)

In [11]:
def backward_recurrent(x, c_state, h_state, U, Wf, Wi, Wc, Wo, V, d_mul_v, saved_graph):
    mul_u, mul_Wf, add_Wf, mul_Wi, add_Wi, mul_Wc, add_Wc, C, mul_Wo, add_Wo, h, mul_v, i, o, c_hat = saved_graph
    dV, dh = backward_multiply_gate(V, h, d_mul_v)
    dC = tanh(C, True) * o * dh.T
    do = tanh(C) * dh.T
    dadd_Wo = sigmoid(add_Wo, True) * do
    dmul_u1, dmul_Wo = backward_add_gate(mul_u, mul_Wo, dadd_Wo)
    dWo, dprev_state = backward_multiply_gate(Wo, h_state, dmul_Wo)
    dc_hat = dC * i
    dadd_Wc = tanh(add_Wc, True) * dc_hat
    dmul_u2, dmul_Wc = backward_add_gate(mul_u, mul_Wc, dadd_Wc)
    dWc, dprev_state = backward_multiply_gate(Wc, h_state, dmul_Wc)
    di = dC * c_hat
    dadd_Wi = sigmoid(add_Wi, True) * di
    dmul_u3, dmul_Wi = backward_add_gate(mul_u, mul_Wi, dadd_Wi)
    dWi, dprev_state = backward_multiply_gate(Wi, h_state, dmul_Wi)
    df = dC * c_state
    dadd_Wf = sigmoid(add_Wf, True) * df
    dmul_u4, dmul_Wf = backward_add_gate(mul_u, mul_Wf, dadd_Wf)
    dWf, dprev_state = backward_multiply_gate(Wf, h_state, dmul_Wf)
    dU, dx = backward_multiply_gate(U, x, dmul_u4)
    return (dU, dWf, dWi, dWc, dWo, dV)

In [12]:
for i in range(epoch):
    batch_x = np.zeros((batch_size, sequence_length, dimension))
    batch_y = np.zeros((batch_size, sequence_length, dimension))
    batch_id = random.sample(possible_batch_id, batch_size)
    
    prev_c = np.zeros((batch_size, hidden_dim))
    prev_h = np.zeros((batch_size, hidden_dim))
    
    for n in range(sequence_length):
        id1 = [k + n for k in batch_id]
        id2 = [k + n + 1 for k in batch_id]
        batch_x[:, n, :] = one_hot[id1]
        batch_y[:, n, :] = one_hot[id2]
        
    layers = []
    out_logits = np.zeros((batch_size, sequence_length, dimension))
    
    for n in range(sequence_length):
        layers.append(forward_lstm_recurrent(batch_x[:, n, :], prev_c, prev_h, U, W_hf, W_hi, W_hc, W_ho, V))
        
        prev_c = layers[-1][7]
        prev_h = layers[-1][10]
        
        out_logits[:, n, :] = layers[-1][-4]
        
    probs = softmax(out_logits.reshape((-1, dimension)))
    y = np.argmax(batch_y.reshape((-1, dimension)), axis=1)
    accuracy = np.mean(np.argmax(probs, axis=1) == y)
    loss = cross_entropy(probs, batch_y.reshape((-1, dimension)))
    
    delta = probs
    delta[range(y.shape[0]), y] -= 1
    delta = delta.reshape((batch_size, sequence_length, dimension))
    
    dU = np.zeros(U.shape)
    dV = np.zeros(V.shape)
    dW_hf = np.zeros(W_hf.shape)
    dW_hi = np.zeros(W_hi.shape)
    dW_hc = np.zeros(W_hc.shape)
    dW_ho = np.zeros(W_ho.shape)
    
    prev_c = np.zeros((batch_size, hidden_dim))
    prev_h = np.zeros((batch_size, hidden_dim))
    
    for n in range(sequence_length):
        d_mul_v = delta[:, n, :]
        dU_t, dWf_t, dWi_t, dWc_t, dWo_t, dV_t = backward_recurrent(batch_x[:,n,:], prev_c, prev_h, U, W_hf, W_hi, 
                                                                    W_hc, W_ho, V, d_mul_v, layers[n])
        prev_c = layers[n][7]
        prev_h = layers[n][10]
        dU += dU_t
        dV += dV_t
        dW_hf += dWf_t
        dW_hi += dWi_t
        dW_hc += dWc_t
        dW_ho += dWo_t
    U -= learning_rate * dU
    V -= learning_rate * dV
    W_hf -= learning_rate * dW_hf
    W_hi -= learning_rate * dW_hi
    W_hc -= learning_rate * dW_hc
    W_ho -= learning_rate * dW_ho
    if (i+1) % 50 == 0:
        print('epoch {}, loss {}, accuracy {}'.format(i+1, loss, accuracy))

epoch 50, loss 3.869480435620895, accuracy 0.1328125
epoch 100, loss 3.5236123521197826, accuracy 0.17838541666666666
epoch 150, loss 3.300833054781156, accuracy 0.20963541666666666
epoch 200, loss 3.059171981690467, accuracy 0.2760416666666667
epoch 250, loss 2.9879058625218793, accuracy 0.2799479166666667
epoch 300, loss 2.7253541986952823, accuracy 0.3567708333333333
epoch 350, loss 2.8991301010145087, accuracy 0.3111979166666667
epoch 400, loss 2.739520752820132, accuracy 0.3567708333333333
epoch 450, loss 2.7594340279988354, accuracy 0.3203125
epoch 500, loss 2.550837708695022, accuracy 0.4010416666666667
epoch 550, loss 2.564432655698529, accuracy 0.35546875
epoch 600, loss 2.636274814861799, accuracy 0.3346354166666667
epoch 650, loss 2.451738966036372, accuracy 0.3736979166666667
epoch 700, loss 2.4445363968472624, accuracy 0.3489583333333333
epoch 750, loss 2.3753177022147187, accuracy 0.4231770833333333
epoch 800, loss 2.3726650853133333, accuracy 0.40625
epoch 850, loss 2.43

### Pytorch实现

In [13]:
import torch
import torch.nn as nn
from torch.autograd import Variable

class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        """
        """
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        super(LSTM, self).__init__()
        self.LSTM = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_dim))
        c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_dim))
        # out, (hn, hc) = self.LSTM(x, (h0, c0))
        out, (hn, hc) = self.LSTM(x, None)  # 得到所有时间序列的输出。
        
        # out = self.fc(out[:, -1, :])  # 取最后一个时间步的输出。
        out = self.fc(out)
        
        return out
lstm = LSTM(input_dim = dimension, hidden_dim = hidden_dim, num_layers=1, output_dim = dimension)
print(lstm)


# 分开定义softmax运算和交叉熵损失函数会造成数值不稳定。
# 因此PyTorch提供了一个具有良好数值稳定性且包括softmax运算和交叉熵计算的函数。
criterion = nn.CrossEntropyLoss()

learning_rate = 0.01
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

for i in range(epoch):
    batch_x = np.zeros((batch_size, sequence_length, dimension), dtype=np.float32)
    batch_y = np.zeros((batch_size, sequence_length, dimension), dtype=np.float32)
    batch_id = random.sample(possible_batch_id, batch_size)  # 随机采样，选择batch_id。
    # prev_s = np.zeros((batch_size, hidden_dim))
    for n in range(sequence_length):
        id1 = [k + n for k in batch_id]
        id2 = [k + n + 1 for k in batch_id]
        
        batch_x[:, n, :] = one_hot[id1]
        batch_y[:, n, :] = one_hot[id2]
    
    # 从Numpy转成torch之后送入神经网络中去。
    output = lstm(torch.from_numpy(batch_x))  # torch.Size([64, 12, 75])
    label = torch.argmax(torch.from_numpy(batch_y).view(-1, dimension), dim=1) # shape = 786
    
    accuracy = np.mean(torch.argmax(output.view(-1, dimension), axis=1).numpy() == label.numpy())
    
    optimizer.zero_grad()
    loss = criterion(output.view(-1, dimension), label)
    loss.backward()
    optimizer.step()
    
    if(i + 1) % 50 == 0:
        print("epoch {}, loss {}, accuracy {}".format(i+1, loss.item(), accuracy)) 

LSTM(
  (LSTM): LSTM(75, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=75, bias=True)
)
epoch 50, loss 2.037531852722168, accuracy 0.4661458333333333
epoch 100, loss 1.0574300289154053, accuracy 0.7330729166666666
epoch 150, loss 0.7793861031532288, accuracy 0.7682291666666666
epoch 200, loss 0.6055523753166199, accuracy 0.83203125
epoch 250, loss 0.67572021484375, accuracy 0.7955729166666666
epoch 300, loss 0.5834397673606873, accuracy 0.8203125
epoch 350, loss 0.5546103119850159, accuracy 0.8177083333333334
epoch 400, loss 0.5669826865196228, accuracy 0.828125
epoch 450, loss 0.544745147228241, accuracy 0.8359375
epoch 500, loss 0.49985525012016296, accuracy 0.8268229166666666
epoch 550, loss 0.4533112049102783, accuracy 0.8489583333333334
epoch 600, loss 0.4929572343826294, accuracy 0.83203125
epoch 650, loss 0.4681812524795532, accuracy 0.8450520833333334
epoch 700, loss 0.4950845241546631, accuracy 0.8333333333333334
epoch 750, loss 0.5215380787849426, accura