In [1]:
#实现LSTM和LSTMP源码
import torch
import torch.nn as nn
rnn = nn.LSTM(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
c0 = torch.randn(2, 3, 20)
output, (hn, cn) = rnn(input, (h0, c0))

In [6]:
#定义常量
bs, T, i_size, h_size = 2, 3, 4, 5
proj_size = 3#比hidden_size小
input = torch.randn(bs, T, i_size) #输入序列
c0 = torch.randn(bs, h_size) #初始值不参与训练，api里面维度(D∗num_layers,N,Hout)
h0 = torch.randn(bs, h_size)

#调用官方api
lstm_layer = nn.LSTM(i_size, h_size, batch_first = True)
output, (h_final, c_final) = lstm_layer(input, (h0.unsqueeze(0), c0.unsqueeze(0)))
# print(output)
# print(h_final)
'''
weight_ih_l0 torch.Size([20, 4]) 4个W拼起来 4个5*4
weight_hh_l0 torch.Size([20, 5]) 4个W拼起来 4个5*5
bias_ih_l0 torch.Size([20])
bias_hh_l0 torch.Size([20])
'''
for k, v in lstm_layer.named_parameters():
    print(k, v.shape)


weight_ih_l0 torch.Size([20, 4])
weight_hh_l0 torch.Size([20, 5])
bias_ih_l0 torch.Size([20])
bias_hh_l0 torch.Size([20])


In [4]:
#自己写一个LSTM模型
def lstm_forward(input, initial_states, w_ih, w_hh, b_ih, b_hh):
    h0, c0 = initial_states #初始状态
    bs, T, i_size = input.shape
    h_size = w_ih.shape[0] // 4

    prev_h = h0
    prev_c = c0
    batch_w_ih = w_ih.unsqueeze(0).tile(bs, 1, 1) #bs,4*hidden_size,i_size
    batch_w_hh = w_hh.unsqueeze(0).tile(bs, 1, 1)#bs,4*size, h_size

    output_size = h_size
    output = torch.zeros(bs, T, output_size) #输出序列

    for t in range(T):
        x = input[:,t, :] #当前时刻的输入向量,[bs, i_size],矩阵相乘后面加一个维度
        w_times_x = torch.bmm(batch_w_ih, x.unsqueeze(-1))  #bs, 4*h_size,1
        w_times_x = w_times_x.squeeze(-1) #bs, 4*h_size

        w_times_h_prev = torch.bmm(batch_w_hh, prev_h.unsqueeze(-1))  #bs, 4*h_size,1
        w_times_h_prev = w_times_h_prev.squeeze(-1) #bs, 4*h_size

        #分别计算输入们(i)\遗忘门(f)\cell门(g)\输出门(o)
        i_t = torch.sigmoid(w_times_x[:, :h_size] + \
            w_times_h_prev[:, :h_size] +b_ih[:h_size] + b_hh[:h_size])
        f_t = torch.sigmoid(w_times_x[:, h_size:2*h_size] + \
            w_times_h_prev[:, h_size:2*h_size] +b_ih[h_size:2*h_size] + b_hh[h_size:2*h_size])
        g_t = torch.tanh(w_times_x[:, 2*h_size:3*h_size] + w_times_h_prev[:,2*h_size:3*h_size]\
             +b_ih[2*h_size:3*h_size] + b_hh[2*h_size:3*h_size])
        o_t = torch.sigmoid(w_times_x[:, 3*h_size:] + \
            w_times_h_prev[:, 3*h_size:] +b_ih[3*h_size:] + b_hh[3*h_size:])
        
        #然后算记忆元ct,迭代实现
        prev_c = f_t * prev_c + i_t * g_t
        prev_h = o_t * torch.tanh(prev_c)
        output[:, t, :] = prev_h
    return output, (prev_h, prev_c)

output_custom, (h_final_custom, c_final_custom) = lstm_forward(input, (h0,c0), \
    lstm_layer.weight_ih_l0,lstm_layer.weight_hh_l0,lstm_layer.bias_ih_l0, lstm_layer.bias_hh_l0)

print(output)
print(output_custom)





tensor([[[ 0.1173, -0.1761,  0.0655, -0.0596, -0.1059],
         [ 0.3345, -0.1519,  0.0752, -0.2560,  0.0303],
         [ 0.2519, -0.1415,  0.1104, -0.2484, -0.0150]],

        [[-0.1127,  0.5996, -0.3856, -0.1163,  0.0772],
         [ 0.1967,  0.3045, -0.1864, -0.2168,  0.1104],
         [ 0.0886,  0.2985, -0.0083, -0.0245,  0.0345]]],
       grad_fn=<TransposeBackward0>)
tensor([[[ 0.1173, -0.1761,  0.0655, -0.0596, -0.1059],
         [ 0.3345, -0.1519,  0.0752, -0.2560,  0.0303],
         [ 0.2519, -0.1415,  0.1104, -0.2484, -0.0150]],

        [[-0.1127,  0.5996, -0.3856, -0.1163,  0.0772],
         [ 0.1967,  0.3045, -0.1864, -0.2168,  0.1104],
         [ 0.0886,  0.2985, -0.0083, -0.0245,  0.0345]]], grad_fn=<CopySlices>)


In [8]:
#加入proj_size
#定义常量
bs, T, i_size, h_size = 2, 3, 4, 5
proj_size = 3#比hidden_size小
input = torch.randn(bs, T, i_size) #输入序列
c0 = torch.randn(bs, h_size) #初始值不参与训练，api里面维度(D∗num_layers,N,Hout)
h0 = torch.randn(bs, proj_size) #修改输出

#调用官方api
lstm_layer = nn.LSTM(i_size, h_size, batch_first = True, proj_size=proj_size)
output, (h_final, c_final) = lstm_layer(input, (h0.unsqueeze(0), c0.unsqueeze(0)))
# print(output)
# print(h_final)
print(output.shape, h_final.shape, c_final.shape)#记忆元的输出大小不变，最终hidden_states变小
'''
weight_ih_l0 torch.Size([20, 4])
weight_hh_l0 torch.Size([20, 3]) 这里维度减少了
bias_ih_l0 torch.Size([20])
bias_hh_l0 torch.Size([20])
weight_hr_l0 torch.Size([3, 5]) 实现对hidden_states的压缩
'''
for k, v in lstm_layer.named_parameters():
    print(k, v.shape)


torch.Size([2, 3, 3]) torch.Size([1, 2, 3]) torch.Size([1, 2, 5])
weight_ih_l0 torch.Size([20, 4])
weight_hh_l0 torch.Size([20, 3])
bias_ih_l0 torch.Size([20])
bias_hh_l0 torch.Size([20])
weight_hr_l0 torch.Size([3, 5])


In [12]:
#自己写一个LSTM模型
def lstm_forward(input, initial_states, w_ih, w_hh, b_ih, b_hh, w_hr = None):
    h0, c0 = initial_states #初始状态
    bs, T, i_size = input.shape
    h_size = w_ih.shape[0] // 4

    prev_h = h0
    prev_c = c0
    batch_w_ih = w_ih.unsqueeze(0).tile(bs, 1, 1) #bs,4*hidden_size,i_size
    batch_w_hh = w_hh.unsqueeze(0).tile(bs, 1, 1)#bs,4*size, h_size

    if w_hr is not None:
        p_size = w_hr.shape[0]
        output_size = p_size
        batch_w_hr = w_hr.unsqueeze(0).tile(bs, 1, 1) #[bs, p_size, h_size]
    else:
        output_size = h_size

    output = torch.zeros(bs, T, output_size) #输出序列

    for t in range(T):
        x = input[:,t, :] #当前时刻的输入向量,[bs, i_size],矩阵相乘后面加一个维度
        
        w_times_x = torch.bmm(batch_w_ih, x.unsqueeze(-1))  #bs, 4*h_size,1,
        w_times_x = w_times_x.squeeze(-1) #bs, 4*h_size

#4*hidden_size , p_size   @   p_size * 1,有prev_h的地方计算量会减少
        w_times_h_prev = torch.bmm(batch_w_hh, prev_h.unsqueeze(-1))  #bs, 4*h_size,1，加入proj这里计算量小了
        w_times_h_prev = w_times_h_prev.squeeze(-1) #bs, 4*h_size

        #分别计算输入们(i)\遗忘门(f)\cell门(g)\输出门(o)
        i_t = torch.sigmoid(w_times_x[:, :h_size] + \
            w_times_h_prev[:, :h_size] +b_ih[:h_size] + b_hh[:h_size])
        f_t = torch.sigmoid(w_times_x[:, h_size:2*h_size] + \
            w_times_h_prev[:, h_size:2*h_size] +b_ih[h_size:2*h_size] + b_hh[h_size:2*h_size])
        g_t = torch.tanh(w_times_x[:, 2*h_size:3*h_size] + w_times_h_prev[:,2*h_size:3*h_size]\
             +b_ih[2*h_size:3*h_size] + b_hh[2*h_size:3*h_size])
        o_t = torch.sigmoid(w_times_x[:, 3*h_size:] + \
            w_times_h_prev[:, 3*h_size:] +b_ih[3*h_size:] + b_hh[3*h_size:])
        
        #然后算记忆元ct,迭代实现
        prev_c = f_t * prev_c + i_t * g_t
        prev_h = o_t * torch.tanh(prev_c) #[bs, h_size]
        if w_hr is not None:  #做proj
            prev_h = torch.bmm(batch_w_hr, prev_h.unsqueeze(-1)) #[bs, p_size, 1]
            prev_h = prev_h.squeeze(-1)  #[bs, p_size]

        output[:, t, :] = prev_h
    return output, (prev_h, prev_c)

output_custom, (h_final_custom, c_final_custom) = lstm_forward(input, (h0,c0), \
    lstm_layer.weight_ih_l0,lstm_layer.weight_hh_l0,lstm_layer.bias_ih_l0, lstm_layer.bias_hh_l0\
        ,lstm_layer.weight_hr_l0)

print(output)
print(output_custom)

tensor([[[ 0.1222, -0.2479,  0.0743],
         [ 0.0369, -0.1440, -0.1019],
         [ 0.1141, -0.0750, -0.0232]],

        [[ 0.0354,  0.0718, -0.0084],
         [ 0.1311,  0.0020, -0.0178],
         [ 0.1000,  0.0042, -0.1350]]], grad_fn=<TransposeBackward0>)
tensor([[[ 0.1222, -0.2479,  0.0743],
         [ 0.0369, -0.1440, -0.1019],
         [ 0.1141, -0.0750, -0.0232]],

        [[ 0.0354,  0.0718, -0.0084],
         [ 0.1311,  0.0020, -0.0178],
         [ 0.1000,  0.0042, -0.1350]]], grad_fn=<CopySlices>)


In [16]:
#step5 逐步实现GRU网络

#对比GRU和LSTM大小
import torch
import torch.nn as nn
lstm_layer = nn.LSTM(3, 5)
gru_layer = nn.GRU(3,5)
#在此强调parameters()是一个函数！
sum(p.numel() for p in lstm_layer.parameters()) #200
sum(p.numel() for p in gru_layer.parameters()) #150


150

In [24]:
#开始实现
def gru_forward(input, initial_states,w_ih,w_hh,b_ih, b_hh):
    prev_h = initial_states
    bs, T, i_size = input.shape
    h_size = w_ih.shape[0] // 3
    #对权重扩维，变成batch_size倍
    batch_w_ih = w_ih.unsqueeze(0).tile(bs, 1, 1)
    batch_w_hh = w_hh.unsqueeze(0).tile(bs, 1, 1)

    output = torch.zeros(bs, T, h_size)  #GRU网络输出序列

    for t in range(T):
        x = input[:,t,:]   #t时刻输入 [bs, i_size]
        w_times_x = torch.bmm(batch_w_ih, x.unsqueeze(-1)) #[bs,3*hs,1]
        w_times_x = w_times_x.squeeze(-1)

        w_times_h_prev= torch.bmm(batch_w_hh, prev_h.unsqueeze(-1)) #[bs,3*hs,1]
        w_times_h_prev = w_times_h_prev.squeeze(-1)

        r_t = torch.sigmoid(w_times_x[:,:h_size] + w_times_h_prev[:,:h_size] + b_ih[:h_size] + b_hh[:h_size])#重置门
        z_t = torch.sigmoid(w_times_x[:,h_size:2*h_size] + w_times_h_prev[:,h_size:2*h_size] \
            + b_ih[h_size:2*h_size] + b_hh[h_size:2*h_size])#更新门
        
        n_t = torch.tanh(w_times_x[:,2*h_size:3*h_size] + b_ih[2*h_size:3*h_size] +\
            r_t * (w_times_h_prev[:,2*h_size:3*h_size] + b_hh[2*h_size:3*h_size])) #候选状态

        prev_h = (1-z_t) * n_t + z_t * prev_h  #更新隐藏状态
        output[:,t,:] = prev_h
    
    return output,prev_h


#测试函数正确性
bs, T, i_size, h_size = 2, 3, 4, 5
input = torch.randn(bs, T, i_size) #输入序列
h0 = torch.randn(bs, h_size) #初始值不参与训练，api里面维度(D∗num_layers,N,Hout)

gru_layer = nn.GRU(i_size, h_size, batch_first = True)
output, h_final = gru_layer(input,h0.unsqueeze(0))
print(output)
'''
weight_ih_l0 torch.Size([15, 4])
weight_hh_l0 torch.Size([15, 5])
bias_ih_l0 torch.Size([15])
bias_hh_l0 torch.Size([15])
'''
for k, v in gru_layer.named_parameters():
    print(k, v.shape)

output_custom, h_final_custom = gru_forward(input, h0, gru_layer.weight_ih_l0, \
    gru_layer.weight_hh_l0, gru_layer.bias_ih_l0, gru_layer.bias_hh_l0)

print(output_custom)


    
    
    


tensor([[[ 0.6181,  0.6273, -1.5152, -0.1698,  1.3558],
         [-0.0634,  0.5651, -0.8531, -0.2970,  0.5613],
         [-0.1826,  0.4736, -0.4083, -0.5083,  0.3924]],

        [[-0.6443,  0.0289, -0.1807, -0.3848, -0.3714],
         [-0.3887,  0.0804, -0.3085, -0.4390, -0.1755],
         [-0.0996,  0.0738, -0.4211, -0.4593, -0.0508]]],
       grad_fn=<TransposeBackward1>)
weight_ih_l0 torch.Size([15, 4])
weight_hh_l0 torch.Size([15, 5])
bias_ih_l0 torch.Size([15])
bias_hh_l0 torch.Size([15])
tensor([[[ 0.6181,  0.6273, -1.5152, -0.1698,  1.3558],
         [-0.0634,  0.5651, -0.8531, -0.2970,  0.5613],
         [-0.1826,  0.4736, -0.4083, -0.5083,  0.3924]],

        [[-0.6443,  0.0289, -0.1807, -0.3848, -0.3714],
         [-0.3887,  0.0804, -0.3085, -0.4390, -0.1755],
         [-0.0996,  0.0738, -0.4211, -0.4593, -0.0508]]], grad_fn=<CopySlices>)
