In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# 1. LSTM with single data
## len(seq) = 3, dim(input)= 5

### (1) LSTM with numpy (from scratch)

$$
\begin{aligned}
i_t &= \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
f_t &= \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
g_t &= \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
o_t &= \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
c_t &= f_t \odot c_{t-1} + i_t \odot g_t \\
h_t &= o_t \odot \tanh(c_t)
\end{aligned}
$$

All bias terms $b_{\cdot}$ are zero in this notebook: the NumPy code omits them and the PyTorch model is created with `bias=False`.

In [2]:
# Input-to-gate weights, each of shape (5, 2) = (input dim, hidden dim).
# All four gates use the same values.
W_ii = np.array([
    [1, 0],
    [0, 1],
    [1, 0],
    [0, 1],
    [1, 1],
])                  # x -> input gate (i)
W_if = W_ii.copy()  # x -> forget gate (f)
W_ig = W_ii.copy()  # x -> candidate cell values (g)
W_io = W_ii.copy()  # x -> output gate (o)

# Hidden-to-gate weights, each a 2x2 identity matrix.
W_hi = np.array([[1, 0], [0, 1]])  # h -> input gate (i)
W_hf = W_hi.copy()                 # h -> forget gate (f)
W_hg = W_hi.copy()                 # h -> candidate cell values (g)
W_ho = W_hi.copy()                 # h -> output gate (o)

# Initial hidden and cell states: integer zeros of shape (1, 1, 2).
h0 = np.zeros((1, 1, 2), dtype=int)
c0 = np.zeros((1, 1, 2), dtype=int)

In [3]:
def sigmoid(x):
    """
    Return the element-wise logistic sigmoid, 1 / (1 + exp(-x)).

    The value is computed from exp(-|x|), which always lies in (0, 1], so
    large-magnitude inputs cannot overflow. For x >= 0 this is exactly the
    textbook expression; for x < 0 the algebraically equivalent form
    exp(x) / (1 + exp(x)) is used.

    Parameters
    ----------
    x : scalar or array-like of numbers

    Returns
    -------
    NumPy scalar for scalar input, otherwise an ndarray shaped like ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]

def tanh(x):
    """
    Return the element-wise hyperbolic tangent of ``x``.

    Delegates to ``np.tanh``. The explicit formula
    (e^x - e^-x) / (e^x + e^-x) evaluates to inf / inf = nan once |x| is
    large enough for e^|x| to overflow, whereas ``np.tanh`` saturates at
    +/-1 as expected.

    Parameters
    ----------
    x : scalar or array-like of numbers

    Returns
    -------
    NumPy scalar or ndarray shaped like ``x``.
    """
    return np.tanh(x)

In [4]:
# One sample with 3 time steps and 5 features; time step t holds the value t
# in every feature, giving an array of shape (1, 3, 5).
numpy_x = np.array([[[step] * 5 for step in (1, 2, 3)]])
print(numpy_x)

[[[1 1 1 1 1]
  [2 2 2 2 2]
  [3 3 3 3 3]]]


In [5]:
# The three time steps of the single sample, each a vector of 5 features.
x1, x2, x3 = numpy_x[0]

# time 1: gates are driven by the first input and the initial states (h0, c0)
i1 = sigmoid(x1 @ W_ii + h0 @ W_hi)
f1 = sigmoid(x1 @ W_if + h0 @ W_hf)
g1 = tanh(x1 @ W_ig + h0 @ W_hg)
o1 = sigmoid(x1 @ W_io + h0 @ W_ho)

c1 = f1 * c0 + i1 * g1
h1 = o1 * tanh(c1)

# time 2: same equations, now fed with the states produced at time 1
i2 = sigmoid(x2 @ W_ii + h1 @ W_hi)
f2 = sigmoid(x2 @ W_if + h1 @ W_hf)
g2 = tanh(x2 @ W_ig + h1 @ W_hg)
o2 = sigmoid(x2 @ W_io + h1 @ W_ho)

c2 = f2 * c1 + i2 * g2
h2 = o2 * tanh(c2)

# time 3: fed with the states produced at time 2
i3 = sigmoid(x3 @ W_ii + h2 @ W_hi)
f3 = sigmoid(x3 @ W_if + h2 @ W_hf)
g3 = tanh(x3 @ W_ig + h2 @ W_hg)
o3 = sigmoid(x3 @ W_io + h2 @ W_ho)

c3 = f3 * c2 + i3 * g3
h3 = o3 * tanh(c3)

In [6]:
# Hidden states after each time step, a blank line, then the cell states.
print(h1, h2, h3, sep="\n")
print()
print(c1, c2, c3, sep="\n")

[[[0.70377533 0.70377533]]]
[[[0.95879011 0.95879011]]]
[[[0.99443796 0.99443796]]]

[[[0.94786341 0.94786341]]]
[[[1.94547472 1.94547472]]]
[[[2.94533537 2.94533537]]]


In [7]:
# Join the per-step hidden states along axis 1 (the time axis).
numpy_outs = np.concatenate([h1, h2, h3], axis=1)
# The final states are the states after the last time step.
numpy_hn, numpy_cn = h3, c3
# Print the three results separated by blank lines.
print(numpy_outs, numpy_hn, numpy_cn, sep="\n\n")

[[[0.70377533 0.70377533]
  [0.95879011 0.95879011]
  [0.99443796 0.99443796]]]

[[[0.99443796 0.99443796]]]

[[[2.94533537 2.94533537]]]


In [8]:
# Display the cell state after the last time step (the same array as numpy_cn).
c3

array([[[2.94533537, 2.94533537]]])

### (2) LSTM with pytorch (with higher-level LSTM class)

In [9]:
# Single-layer LSTM without bias terms: 5 input features, 2 hidden units,
# inputs laid out as (batch, seq, feature).
rnn = nn.LSTM(
    input_size=5,
    hidden_size=2,
    num_layers=1,
    bias=False,
    batch_first=True,
)

### [KOR]
- <span style = 'font-size:1.2em;line-height:1.5em'>rnn의 현재 W_xh, W_hh값은 random하게 정해져있는 상태</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>위에서 정한 W_xh, W_hh로 이를 대체하자</span>

### [ENG]
- <span style = 'font-size:1.2em;line-height:1.5em'>The initial value of each parameter (W_xh, W_hh) is randomly selected.</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>Replace these values with previously declared ones</span>

- <span style = 'font-size:1.1em;line-height:1.5em'><b>[KOR]</b> rnn에 어떤 parameter들이 있는지 확인하기</span>
- <span style = 'font-size:1.1em;line-height:1.5em'><b>[ENG]</b> Check which parameters the LSTM has (their names and shapes)</span>

In [10]:
# List every entry of the LSTM's state dict: its name, its shape, then a blank line.
for layer, weights in rnn.state_dict().items():
    print(layer, weights.size(), "", sep="\n")

weight_ih_l0
torch.Size([8, 5])

weight_hh_l0
torch.Size([8, 2])



In [11]:
# Put the four input-to-gate matrices side by side in the order i, f, g, o,
# giving one (5, 8) matrix.
W_xh = np.concatenate([W_ii, W_if, W_ig, W_io], axis=-1)
print(W_xh)

[[1 0 1 0 1 0 1 0]
 [0 1 0 1 0 1 0 1]
 [1 0 1 0 1 0 1 0]
 [0 1 0 1 0 1 0 1]
 [1 1 1 1 1 1 1 1]]


In [12]:
# Put the four hidden-to-gate matrices side by side in the order i, f, g, o,
# giving one (2, 8) matrix.
W_hh = np.concatenate([W_hi, W_hf, W_hg, W_ho], axis=-1)
print(W_hh)

[[1 0 1 0 1 0 1 0]
 [0 1 0 1 0 1 0 1]]


- <span style = 'font-size:1.1em;line-height:1.5em'><b>[KOR]</b> rnn의 parameter에 들어있던 값들을 미리 지정한 값으로 대체하기</span>
- <span style = 'font-size:1.1em;line-height:1.5em'><b>[ENG]</b> Replace parameter values of RNN with previously declared ones</span>

In [13]:
# The LSTM stores weight_ih_l0 / weight_hh_l0 with shapes (8, 5) / (8, 2),
# i.e. the transpose of the (5, 8) / (2, 8) matrices built above.
#
# The values are copied into the existing parameters in place:
#   * W_xh / W_hh stay NumPy arrays in their original layout, so this cell
#     can be re-run without transposing already-transposed data;
#   * copy_ casts the integer values to the parameters' float dtype and
#     raises if the source cannot be broadcast to the parameter's shape;
#   * no_grad is required because the parameters are leaves that require
#     grad, and an in-place write to them must not be tracked by autograd.
with torch.no_grad():
    rnn.weight_ih_l0.copy_(torch.from_numpy(np.transpose(W_xh)))
    rnn.weight_hh_l0.copy_(torch.from_numpy(np.transpose(W_hh)))

### [KOR]
- <span style = 'font-size:1.2em;line-height:1.5em'>Forward Propagation을 실행</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>torch_outs: 각 시점의 hidden_state 값</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>torch_hn, torch_cn: 최종 시점의 hidden_state, cell_state 값</span>

### [ENG]
- <span style = 'font-size:1.2em;line-height:1.5em'>Calculate the output value (Forward Propagation)</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>torch_outs: the hidden state at every time step.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>torch_hn, torch_cn: the hidden state and the cell state after the final time step.</span>

In [15]:
# Single-sample input of shape (1, 3, 5), converted to a float32 tensor.
numpy_x = np.array([[[1, 1, 1, 1, 1],
                     [2, 2, 2, 2, 2],
                     [3, 3, 3, 3, 3]]])
torch_x = torch.tensor(numpy_x, dtype=torch.float32)
print(torch_x)

tensor([[[1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3.]]])


In [16]:
# Run the whole sequence through the LSTM. It returns the hidden state at
# every time step plus a (final hidden state, final cell state) pair.
torch_outs, (torch_hn, torch_cn) = rnn(torch_x)
print(torch_outs, torch_hn, torch_cn, sep="\n")

tensor([[[0.7038, 0.7038],
         [0.9588, 0.9588],
         [0.9944, 0.9944]]], grad_fn=<TransposeBackward0>)
tensor([[[0.9944, 0.9944]]], grad_fn=<StackBackward0>)
tensor([[[2.9453, 2.9453]]], grad_fn=<StackBackward0>)


### Compare between results

In [20]:
print(numpy_hn)
print(torch_hn)
# Check the match numerically instead of by eye. The tolerance leaves room
# for float32 (PyTorch) versus float64 (NumPy) rounding.
np.testing.assert_allclose(torch_hn.detach().numpy(), numpy_hn, rtol=1e-4)

[[[0.99443796 0.99443796]]]
tensor([[[0.9944, 0.9944]]], grad_fn=<StackBackward0>)


In [21]:
print(numpy_cn)
print(torch_cn)
# Check the match numerically instead of by eye. The tolerance leaves room
# for float32 (PyTorch) versus float64 (NumPy) rounding.
np.testing.assert_allclose(torch_cn.detach().numpy(), numpy_cn, rtol=1e-4)

[[[2.94533537 2.94533537]]]
tensor([[[2.9453, 2.9453]]], grad_fn=<StackBackward0>)


In [22]:
print(numpy_outs)
print(torch_outs)
# Check the match numerically instead of by eye. The tolerance leaves room
# for float32 (PyTorch) versus float64 (NumPy) rounding.
np.testing.assert_allclose(torch_outs.detach().numpy(), numpy_outs, rtol=1e-4)

[[[0.70377533 0.70377533]
  [0.95879011 0.95879011]
  [0.99443796 0.99443796]]]
tensor([[[0.7038, 0.7038],
         [0.9588, 0.9588],
         [0.9944, 0.9944]]], grad_fn=<TransposeBackward0>)


# 2. LSTM with mini-batch data
## n_data = 2, len(seq) = 3, dim(input)= 5

$$
\begin{aligned}
i_t &= \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
f_t &= \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
g_t &= \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
o_t &= \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
c_t &= f_t \odot c_{t-1} + i_t \odot g_t \\
h_t &= o_t \odot \tanh(c_t)
\end{aligned}
$$

All bias terms $b_{\cdot}$ are zero in this notebook: the NumPy code omits them and the PyTorch model is created with `bias=False`.

### (1) LSTM with numpy (from scratch)

In [23]:
# Input-to-gate weights, each of shape (5, 2) = (input dim, hidden dim).
# All four gates use the same values.
W_ii = np.array([
    [1, 0],
    [0, 1],
    [1, 0],
    [0, 1],
    [1, 1],
])                  # x -> input gate (i)
W_if = W_ii.copy()  # x -> forget gate (f)
W_ig = W_ii.copy()  # x -> candidate cell values (g)
W_io = W_ii.copy()  # x -> output gate (o)

# Hidden-to-gate weights, each a 2x2 identity matrix.
W_hi = np.array([[1, 0], [0, 1]])  # h -> input gate (i)
W_hf = W_hi.copy()                 # h -> forget gate (f)
W_hg = W_hi.copy()                 # h -> candidate cell values (g)
W_ho = W_hi.copy()                 # h -> output gate (o)

# Initial hidden and cell states: integer zeros of shape (1, 1, 2).
h0 = np.zeros((1, 1, 2), dtype=int)
c0 = np.zeros((1, 1, 2), dtype=int)

In [24]:
def sigmoid(x):
    """
    Return the element-wise logistic sigmoid, 1 / (1 + exp(-x)).

    The value is computed from exp(-|x|), which always lies in (0, 1], so
    large-magnitude inputs cannot overflow. For x >= 0 this is exactly the
    textbook expression; for x < 0 the algebraically equivalent form
    exp(x) / (1 + exp(x)) is used.

    Parameters
    ----------
    x : scalar or array-like of numbers

    Returns
    -------
    NumPy scalar for scalar input, otherwise an ndarray shaped like ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]

def tanh(x):
    """
    Return the element-wise hyperbolic tangent of ``x``.

    Delegates to ``np.tanh``. The explicit formula
    (e^x - e^-x) / (e^x + e^-x) evaluates to inf / inf = nan once |x| is
    large enough for e^|x| to overflow, whereas ``np.tanh`` saturates at
    +/-1 as expected.

    Parameters
    ----------
    x : scalar or array-like of numbers

    Returns
    -------
    NumPy scalar or ndarray shaped like ``x``.
    """
    return np.tanh(x)

In [25]:
# Mini-batch of two samples, each with 3 time steps and 5 features:
# the first counts 1, 2, 3 over time and the second counts 3, 2, 1.
numpy_x = np.array([
    [[step] * 5 for step in (1, 2, 3)],
    [[step] * 5 for step in (3, 2, 1)],
])
print(numpy_x)

[[[1 1 1 1 1]
  [2 2 2 2 2]
  [3 3 3 3 3]]

 [[3 3 3 3 3]
  [2 2 2 2 2]
  [1 1 1 1 1]]]


In [26]:
# Per-sample results are collected in lists and merged into arrays after the loop.
numpy_cn = []
numpy_hn = []
numpy_outs = []

# Run forward propagation separately for each sample of the mini-batch.
for i in range(len(numpy_x)):
    # The three time steps of sample i.
    x1, x2, x3 = numpy_x[i]

    # time 1: driven by the first input and the initial states (h0, c0)
    i1 = sigmoid(x1 @ W_ii + h0 @ W_hi)
    f1 = sigmoid(x1 @ W_if + h0 @ W_hf)
    g1 = tanh(x1 @ W_ig + h0 @ W_hg)
    o1 = sigmoid(x1 @ W_io + h0 @ W_ho)
    c1 = f1 * c0 + i1 * g1
    h1 = o1 * tanh(c1)

    # time 2: fed with the states produced at time 1
    i2 = sigmoid(x2 @ W_ii + h1 @ W_hi)
    f2 = sigmoid(x2 @ W_if + h1 @ W_hf)
    g2 = tanh(x2 @ W_ig + h1 @ W_hg)
    o2 = sigmoid(x2 @ W_io + h1 @ W_ho)
    c2 = f2 * c1 + i2 * g2
    h2 = o2 * tanh(c2)

    # time 3: fed with the states produced at time 2
    i3 = sigmoid(x3 @ W_ii + h2 @ W_hi)
    f3 = sigmoid(x3 @ W_if + h2 @ W_hf)
    g3 = tanh(x3 @ W_ig + h2 @ W_hg)
    o3 = sigmoid(x3 @ W_io + h2 @ W_ho)
    c3 = f3 * c2 + i3 * g3
    h3 = o3 * tanh(c3)

    # Join the hidden states of all time steps along axis 1 (the time axis).
    hs = np.concatenate([h1, h2, h3], axis=1)
    numpy_outs.append(hs)
    numpy_cn.append(c3)
    numpy_hn.append(h3)

# Final hidden state of every sample, joined along axis 1.
numpy_hn = np.concatenate(numpy_hn, axis=1)
# Final cell state of every sample, joined along axis 1.
numpy_cn = np.concatenate(numpy_cn, axis=1)
# All hidden states of every sample, stacked along axis 0 (the batch axis).
numpy_outs = np.concatenate(numpy_outs, axis=0)


# Print the three results separated by blank lines.
print(numpy_hn, numpy_cn, numpy_outs, sep="\n\n")

[[[0.99443796 0.99443796]
  [0.97588385 0.97588385]]]

[[[2.94533537 2.94533537]
  [2.9409292  2.9409292 ]]]

[[[0.70377533 0.70377533]
  [0.95879011 0.95879011]
  [0.99443796 0.99443796]]

 [[0.76144835 0.76144835]
  [0.96274048 0.96274048]
  [0.97588385 0.97588385]]]


### (2) LSTM with pytorch (with higher-level LSTM class)

In [27]:
# Single-layer LSTM without bias terms: 5 input features, 2 hidden units,
# inputs laid out as (batch, seq, feature).
rnn = nn.LSTM(
    input_size=5,
    hidden_size=2,
    num_layers=1,
    bias=False,
    batch_first=True,
)

### [KOR]
- <span style = 'font-size:1.2em;line-height:1.5em'>rnn의 현재 W_xh, W_hh값은 random하게 정해져있는 상태</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>위에서 정한 W_xh, W_hh로 이를 대체하자</span>

### [ENG]
- <span style = 'font-size:1.2em;line-height:1.5em'>The initial value of each parameter (W_xh, W_hh) is randomly selected.</span>
- <span style = 'font-size:1.2em;line-height:1.5em'>Replace these values with previously declared ones</span>

- <span style = 'font-size:1.1em;line-height:1.5em'><b>[KOR]</b> rnn에 어떤 parameter들이 있는지 확인하기</span>
- <span style = 'font-size:1.1em;line-height:1.5em'><b>[ENG]</b> Check which parameters the LSTM has (their names and shapes)</span>

In [28]:
# List every entry of the LSTM's state dict: its name, its shape, then a blank line.
for layer, weights in rnn.state_dict().items():
    print(layer, weights.size(), "", sep="\n")

weight_ih_l0
torch.Size([8, 5])

weight_hh_l0
torch.Size([8, 2])



In [29]:
# Put the four input-to-gate matrices side by side in the order i, f, g, o,
# giving one (5, 8) matrix.
W_xh = np.concatenate([W_ii, W_if, W_ig, W_io], axis=-1)
print(W_xh)

[[1 0 1 0 1 0 1 0]
 [0 1 0 1 0 1 0 1]
 [1 0 1 0 1 0 1 0]
 [0 1 0 1 0 1 0 1]
 [1 1 1 1 1 1 1 1]]


In [30]:
# Put the four hidden-to-gate matrices side by side in the order i, f, g, o,
# giving one (2, 8) matrix.
W_hh = np.concatenate([W_hi, W_hf, W_hg, W_ho], axis=-1)
print(W_hh)

[[1 0 1 0 1 0 1 0]
 [0 1 0 1 0 1 0 1]]


- <span style = 'font-size:1.1em;line-height:1.5em'><b>[KOR]</b> rnn의 parameter에 들어있던 값들을 미리 지정한 값으로 대체하기</span>
- <span style = 'font-size:1.1em;line-height:1.5em'><b>[ENG]</b> Replace parameter values of RNN with previously declared ones</span>

In [31]:
# The LSTM stores weight_ih_l0 / weight_hh_l0 with shapes (8, 5) / (8, 2),
# i.e. the transpose of the (5, 8) / (2, 8) matrices built above.
#
# The values are copied into the existing parameters in place rather than
# swapping in new nn.Parameter objects:
#   * copy_ casts the integer values to the parameters' float dtype and
#     raises if the source cannot be broadcast to the parameter's shape;
#   * no_grad is required because the parameters are leaves that require
#     grad, and an in-place write to them must not be tracked by autograd.
with torch.no_grad():
    rnn.weight_ih_l0.copy_(torch.from_numpy(np.transpose(W_xh)))
    rnn.weight_hh_l0.copy_(torch.from_numpy(np.transpose(W_hh)))

### [KOR]
- <span style = 'font-size:1.2em;line-height:1.5em'>Forward Propagation을 실행</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>torch_outs: 각 시점의 hidden_state 값</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>torch_hn, torch_cn: 최종 시점의 hidden_state, cell_state 값</span>

### [ENG]
- <span style = 'font-size:1.2em;line-height:1.5em'>Calculate the output value (Forward Propagation)</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>torch_outs: the hidden state at every time step.</span>
    - <span style = 'font-size:1.1em;line-height:1.5em'>torch_hn, torch_cn: the hidden state and the cell state after the final time step.</span>

In [32]:
# Two-sample mini-batch of shape (2, 3, 5), converted to a float32 tensor.
numpy_x = np.array([
    [[1, 1, 1, 1, 1], [2, 2, 2, 2, 2], [3, 3, 3, 3, 3]],
    [[3, 3, 3, 3, 3], [2, 2, 2, 2, 2], [1, 1, 1, 1, 1]],
])
torch_x = torch.tensor(numpy_x, dtype=torch.float32)
print(torch_x)

tensor([[[1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3.]],

        [[3., 3., 3., 3., 3.],
         [2., 2., 2., 2., 2.],
         [1., 1., 1., 1., 1.]]])


In [37]:
# Display the shape of the input tensor; the LSTM was created with
# batch_first=True, so the axes are (batch, seq, feature).
torch_x.shape

torch.Size([2, 3, 5])

In [33]:
# Run the whole mini-batch through the LSTM. It returns the hidden state at
# every time step plus a (final hidden state, final cell state) pair.
torch_outs, (torch_hn, torch_cn) = rnn(torch_x)
print(torch_hn, torch_cn, torch_outs, sep="\n")

tensor([[[0.9944, 0.9944],
         [0.9759, 0.9759]]], grad_fn=<StackBackward0>)
tensor([[[2.9453, 2.9453],
         [2.9409, 2.9409]]], grad_fn=<StackBackward0>)
tensor([[[0.7038, 0.7038],
         [0.9588, 0.9588],
         [0.9944, 0.9944]],

        [[0.7614, 0.7614],
         [0.9627, 0.9627],
         [0.9759, 0.9759]]], grad_fn=<TransposeBackward0>)


### Compare between results

In [34]:
print(numpy_hn)
print(torch_hn)
# Check the match numerically instead of by eye. The tolerance leaves room
# for float32 (PyTorch) versus float64 (NumPy) rounding.
np.testing.assert_allclose(torch_hn.detach().numpy(), numpy_hn, rtol=1e-4)

[[[0.99443796 0.99443796]
  [0.97588385 0.97588385]]]
tensor([[[0.9944, 0.9944],
         [0.9759, 0.9759]]], grad_fn=<StackBackward0>)


In [35]:
print(numpy_cn)
print(torch_cn)
# Check the match numerically instead of by eye. The tolerance leaves room
# for float32 (PyTorch) versus float64 (NumPy) rounding.
np.testing.assert_allclose(torch_cn.detach().numpy(), numpy_cn, rtol=1e-4)

[[[2.94533537 2.94533537]
  [2.9409292  2.9409292 ]]]
tensor([[[2.9453, 2.9453],
         [2.9409, 2.9409]]], grad_fn=<StackBackward0>)


In [36]:
print(numpy_outs)
print(torch_outs)
# Check the match numerically instead of by eye. The tolerance leaves room
# for float32 (PyTorch) versus float64 (NumPy) rounding.
np.testing.assert_allclose(torch_outs.detach().numpy(), numpy_outs, rtol=1e-4)

[[[0.70377533 0.70377533]
  [0.95879011 0.95879011]
  [0.99443796 0.99443796]]

 [[0.76144835 0.76144835]
  [0.96274048 0.96274048]
  [0.97588385 0.97588385]]]
tensor([[[0.7038, 0.7038],
         [0.9588, 0.9588],
         [0.9944, 0.9944]],

        [[0.7614, 0.7614],
         [0.9627, 0.9627],
         [0.9759, 0.9759]]], grad_fn=<TransposeBackward0>)
