# sequence to sequence model with MNIST

paper : [Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation(2014)](https://arxiv.org/pdf/1406.1078.pdf)

- The encoder model takes an input and produces an encoding or a representation (ϕ) of the input, which is usually a vector.

- The goal of the encoder is to capture important properties of the input with respect to the task at hand.

- The goal of the decoder is to take the encoded input and produce a desired output.

- From this understanding of encoders and decoders, we define S2S models as encoder–decoder models in which the encoder and decoder are sequence models and the inputs and outputs are both sequences, possibly of different lengths.

모든 파라미터 capitalize

pytorch 함수 argument 설명 추가

In [2]:
import numpy as np
import torch
import torch.nn as nn
# from torch.autograd import Variable #this is deprecated - https://pytorch.org/docs/stable/autograd.html#variable-deprecated
dtype = torch.FloatTensor

# Digit recognition이 다 되었다고 가정하고 시작

The base code was adapted from an English-to-French translation model project.

In [3]:
# S: symbol that marks the start of the decoder input
# E: symbol that marks the end of the decoder target (written right after the answer)
# P: padding symbol that fills a sequence shorter than the number of time steps

# Vocabulary: the three control symbols followed by every character that can
# appear in an arithmetic expression or its result.
char_arr = list('SEP.1234567890(){}[]+-*/')
# Character -> class index (position in char_arr).
num_dic = {n: i for i, n in enumerate(char_arr)}

# Vocabulary size. make_batch reads it as n_class while the model cells read it
# as N_CLASS, so both spellings are bound to the same value.
N_CLASS = n_class = len(num_dic)

# [expression, expected result] pairs used as the whole training set.
seq_data = [['9-4-(3*5+9)', '-19'],
            ['0*(3*3-0)-((0))+4', '4'],
            ['(9)-5-(4*(5*5))', '-96'],
            ['5*(0-(4/1+9)*2-(1-3))*(1)', '-120.0'],
            ['5-0/4/(1+9)+2-1', '6.0'],
            ['(5-(0)-(4+(1-9)*2)/2)', '9.0']]

# The data set is tiny, so one batch holds all of it. The training loop refers
# to this value as BATCH_SIZE; the lowercase name is kept for existing uses.
BATCH_SIZE = batch_size = len(seq_data)

## 1) Parameter 세팅하기

In [4]:
# Seq2Seq hyperparameters
N_STEP = 25 # maximum number of time steps; make_batch pads every sequence with 'P' up to this length
N_HIDDEN= 128 # hidden_size of the encoder/decoder nn.RNN cells and input width of the final linear layer

In [5]:
print(N_STEP, N_HIDDEN)

25 128


## 2) Preprocess the data

In [8]:
def make_batch(seq_data):
    """Convert [expression, answer] string pairs into padded model tensors.

    Every string of a pair is right-padded with 'P' up to N_STEP characters and
    the first 'P' of the answer is replaced by the end marker 'E'. The decoder
    input is that marked answer shifted right by one step with 'S' in front;
    the target is the marked answer itself.

    The caller's lists are never modified, so the function can be called
    repeatedly on the same data (padding in place would insert one more 'E'
    into every answer on each call).

    Args:
        seq_data: sequence of [expression, answer] pairs whose characters are
            all keys of num_dic.

    Returns:
        Tuple (input_batch, output_batch, target_batch). The first two are
        one-hot float tensors for the encoder and decoder inputs, shaped
        [len(seq_data), N_STEP, n_class] when no string exceeds N_STEP.
        target_batch is a LongTensor of class indices (not one-hot), shaped
        [len(seq_data), N_STEP] under the same condition.

    Raises:
        ValueError: if an answer has no padding position left for the end
            marker, or if the sequences of the batch differ in length.
        KeyError: if a character is not in num_dic.
    """
    one_hot = np.eye(n_class)  # row i is the one-hot vector of class i
    input_batch, output_batch, target_batch = [], [], []

    for idx, seq in enumerate(seq_data):
        # Pad copies of the strings; the caller's pair stays as it was.
        padded = [text + 'P' * (N_STEP - len(text)) for text in seq]

        # Mark the end of the answer at its first padding position.
        end_pos = padded[1].find('P')
        if end_pos < 0:
            raise ValueError(
                "answer {!r} leaves no padding position for the end marker "
                "(N_STEP={})".format(seq[1], N_STEP))
        padded[1] = padded[1][:end_pos] + 'E' + padded[1][end_pos + 1:]

        input_data = [num_dic[n] for n in padded[0]]
        output_data = [num_dic[n] for n in ('S' + padded[1][:-1])]
        target_data = [num_dic[n] for n in padded[1]]

        # Print the second pair as a worked example of the three encodings.
        if idx == 1:
            print("calculation: {}\n"
                  "encoding_input(=one of input_batch) : {}\n"
                  "decoding_output(=one of output_batch) :{}\n"
                  "target(=one of target_batch) : {}".format(
                      padded, input_data, output_data, target_data))

        input_batch.append(one_hot[input_data])
        output_batch.append(one_hot[output_data])
        # The target stays as class indices; it is not one-hot encoded.
        target_batch.append(target_data)

    # A string longer than N_STEP is not padded, which would make the batch ragged.
    for label, rows in (('expression', input_batch), ('answer', target_batch)):
        if len({len(row) for row in rows}) > 1:
            raise ValueError(
                "{} sequences differ in length; every string must fit in "
                "N_STEP={} characters".format(label, N_STEP))

    # Stack into one ndarray first: building a tensor from a list of ndarrays
    # converts element by element and is very slow.
    return (torch.tensor(np.array(input_batch), dtype=torch.float32),
            torch.tensor(np.array(output_batch), dtype=torch.float32),
            torch.LongTensor(target_batch))

In [9]:
input_batch, output_batch, target_batch = make_batch(seq_data)

calculation: ['0*(3*3-0)-((0))+4PPPPPPPP', '4EPPPPPPPPPPPPPPPPPPPPPPP']
encoding_input(=one of input_batch) : [13, 22, 14, 6, 22, 6, 21, 13, 15, 21, 14, 14, 13, 15, 15, 20, 7, 2, 2, 2, 2, 2, 2, 2, 2]
decoding_output(=one of output_batch) :[0, 7, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
target(=one of target_batch) : [7, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


S: Symbol that marks the start of the decoding input

E: Symbol that marks the end of the decoding output (target)

P: Symbol that fills the rest of a sequence when it is shorter than the number of time steps

---

대략적인 data size에 대한 감을 익히기 위한 안내

- len(input_batch) : data_pair_size

- len(input_batch[0]) : N_STEP, number of step

- len(input_batch[0][0]) : |vocab|

In [10]:
# Report the three dimensions of the encoder input batch:
# number of pairs, padded sequence length, vocabulary size.
first_sequence = input_batch[0]
first_step = first_sequence[0]
print("data size : {}".format(len(input_batch)))
print("padded sentence size : {}".format(len(first_sequence)))
print("vocabulary size : {}".format(len(first_step)))

data size : 6
padded sentence size : 25
vocabulary size : 24


## 3) Model 구성하기

In [0]:
class Seq2Seq(nn.Module):
    """Encoder-decoder built from two single-layer Elman (vanilla) RNNs.

    The encoder reads the one-hot input sequence; its final hidden state is
    handed to the decoder as the initial hidden state, i.e. it plays the role
    of the context vector. A linear layer turns every decoder output into one
    score per class.

    nn.RNN arguments used here:
        input_size: the number of expected features in the input x
        hidden_size: the number of features in the hidden state h

    nn.RNN's dropout argument is deliberately not passed. It adds dropout on
    the outputs of every RNN layer except the last one, so on these
    single-layer cells it does nothing other than raise a UserWarning.

    Args:
        n_class: width of the one-hot inputs and number of output classes.
            Defaults to the module-level N_CLASS.
        n_hidden: size of the RNN hidden state. Defaults to the module-level
            N_HIDDEN.
    """

    def __init__(self, n_class=None, n_hidden=None):
        super().__init__()
        # Fall back to the notebook-level hyperparameters when not given.
        n_class = N_CLASS if n_class is None else n_class
        n_hidden = N_HIDDEN if n_hidden is None else n_hidden

        self.enc_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden)
        self.dec_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden)
        self.fc = nn.Linear(n_hidden, n_class)

    def forward(self, enc_input, enc_hidden, dec_input):
        """Return class scores for every decoder time step.

        Args:
            enc_input: one-hot encoder input, [batch, enc_steps, n_class].
            enc_hidden: initial encoder hidden state, [1, batch, n_hidden].
            dec_input: one-hot decoder input, [batch, dec_steps, n_class].

        Returns:
            Tensor of shape [dec_steps, batch, n_class] (time-major).
        """
        # nn.RNN is time-major by default: [batch, steps, n_class] -> [steps, batch, n_class]
        enc_input = enc_input.transpose(0, 1)
        dec_input = dec_input.transpose(0, 1)

        # Only the encoder's final hidden state h_n is kept: it is the context vector.
        _, enc_states = self.enc_cell(enc_input, enc_hidden)
        # The decoder starts from the context vector; its final state is not needed.
        # outputs: [dec_steps, batch, n_hidden]
        outputs, _ = self.dec_cell(dec_input, enc_states)

        return self.fc(outputs)  # [dec_steps, batch, n_class]

In [9]:
# Stand-alone RNN cell created with the same arguments as the model's encoder; it is inspected with help() in the next cell.
# NOTE(review): dropout=0.5 presumably has no effect here. The nn.RNN documentation says dropout is applied to
# every layer except the last, and num_layers is left at its default; the warning fragment in this cell's output
# points the same way. Confirm before relying on it.
encoding_cell = nn.RNN(input_size = N_CLASS, hidden_size = N_HIDDEN, dropout=0.5)

  "num_layers={}".format(dropout, num_layers))


In [10]:
help(encoding_cell)

Help on RNN in module torch.nn.modules.rnn object:

class RNN(RNNBase)
 |  Applies a multi-layer Elman RNN with :math:`tanh` or :math:`ReLU` non-linearity to an
 |  input sequence.
 |  
 |  
 |  For each element in the input sequence, each layer computes the following
 |  function:
 |  
 |  .. math::
 |      h_t = \text{tanh}(w_{ih} x_t + b_{ih} + w_{hh} h_{(t-1)} + b_{hh})
 |  
 |  where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
 |  the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the
 |  previous layer at time `t-1` or the initial hidden state at time `0`.
 |  If :attr:`nonlinearity` is `'relu'`, then `ReLU` is used instead of `tanh`.
 |  
 |  Args:
 |      input_size: The number of expected features in the input `x`
 |      hidden_size: The number of features in the hidden state `h`
 |      num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
 |          would mean stacking two RNNs together to form a `stacked RNN`,
 |          

## 4) 학습 시키기

여기서 hidden은 context vector라 생각하면 편함.

In [17]:
model = Seq2Seq()
# Cross-entropy between the class scores of each time step and the target class index.
criterion = nn.CrossEntropyLoss()
# Plain SGD is the optimizer actually in effect. Adam(lr=0.01) and SGD with
# momentum=0.9 were tried earlier; the momentum variant used to be created
# here and then immediately overwritten, so it never took part in training.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1000):
    # Fresh all-zero encoder state each epoch: [num_layers * num_directions, batch, N_HIDDEN].
    # The batch size is read from the data so it cannot drift from the tensors.
    hidden = torch.zeros(1, input_batch.size(0), N_HIDDEN)
    # Clear gradients accumulated by the previous iteration.
    optimizer.zero_grad()
    # Forward pass: encoder input, initial hidden state, decoder input.
    output = model(input_batch, hidden, output_batch)
    output = output.transpose(0, 1)  # [N_STEP, batch, N_CLASS] -> [batch, N_STEP, N_CLASS]
    # Sum the per-sequence losses; each term compares [N_STEP, N_CLASS] scores
    # with the [N_STEP] target indices of one pair.
    loss = 0
    for scores, target in zip(output, target_batch):
        loss += criterion(scores, target)
    if (epoch + 1) % 100 == 0:
        print('Epoch:', '%4d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    # Back-propagate the loss through the network.
    loss.backward()
    # Update the parameters in the direction that reduces the loss.
    optimizer.step()

  "num_layers={}".format(dropout, num_layers))


Epoch:  100 cost = 2.887515
Epoch:  200 cost = 2.013270
Epoch:  300 cost = 1.462841
Epoch:  400 cost = 1.145112
Epoch:  500 cost = 0.978528
Epoch:  600 cost = 0.870961
Epoch:  700 cost = 0.797935
Epoch:  800 cost = 0.745006
Epoch:  900 cost = 0.703568
Epoch: 1000 cost = 0.668529


In [12]:
help(model)

Help on Seq2Seq in module __main__ object:

class Seq2Seq(torch.nn.modules.module.Module)
 |  Base class for all neural network modules.
 |  
 |  Your models should also subclass this class.
 |  
 |  Modules can also contain other Modules, allowing to nest them in
 |  a tree structure. You can assign the submodules as regular attributes::
 |  
 |      import torch.nn as nn
 |      import torch.nn.functional as F
 |  
 |      class Model(nn.Module):
 |          def __init__(self):
 |              super(Model, self).__init__()
 |              self.conv1 = nn.Conv2d(1, 20, 5)
 |              self.conv2 = nn.Conv2d(20, 20, 5)
 |  
 |          def forward(self, x):
 |             x = F.relu(self.conv1(x))
 |             return F.relu(self.conv2(x))
 |  
 |  Submodules assigned in this way will be registered, and will have their
 |  parameters converted too when you call :meth:`to`, etc.
 |  
 |  Method resolution order:
 |      Seq2Seq
 |      torch.nn.modules.module.Module
 |      builtins

In [13]:
help(criterion)

Help on CrossEntropyLoss in module torch.nn.modules.loss object:

class CrossEntropyLoss(_WeightedLoss)
 |  This criterion combines :func:`nn.LogSoftmax` and :func:`nn.NLLLoss` in one single class.
 |  
 |  It is useful when training a classification problem with `C` classes.
 |  If provided, the optional argument :attr:`weight` should be a 1D `Tensor`
 |  assigning weight to each of the classes.
 |  This is particularly useful when you have an unbalanced training set.
 |  
 |  The `input` is expected to contain scores for each class.
 |  
 |  `input` has to be a Tensor of size either :math:`(minibatch, C)` or
 |  :math:`(minibatch, C, d_1, d_2, ..., d_K)`
 |  with :math:`K \geq 2` for the `K`-dimensional case (described later).
 |  
 |  This criterion expects a class index (0 to `C-1`) as the
 |  `target` for each value of a 1D tensor of size `minibatch`
 |  
 |  The loss can be described as:
 |  
 |  .. math::
 |      \text{loss}(x, class) = -\log\left(\frac{\exp(x[class])}{\sum_j \e

## 5) Test하기

In [0]:
def translate(word):
    """Run the trained model on one expression and return its decoded answer.

    The per-step predictions are printed as a list of characters, then cut at
    the first end marker 'E' (or, if there is none, at the first padding 'P').
    If neither symbol was predicted, the whole prediction is kept.

    Args:
        word: expression string made of characters known to num_dic.

    Returns:
        The predicted answer as a string, without end marker or padding.
    """
    # The answer is unknown at test time, so its slot is filled with padding only.
    # NOTE(review): the decoder therefore receives 'S', 'E' and padding rather
    # than its own previous predictions, unlike the teacher-forced training
    # input. Step-by-step greedy decoding would remove that mismatch.
    input_batch, output_batch, _ = make_batch([[word, 'P' * len(word)]])

    # Initial hidden state: [num_layers * num_directions, BATCH_SIZE(=1), N_HIDDEN]
    hidden = torch.zeros(1, 1, N_HIDDEN)
    with torch.no_grad():  # inference only, no gradient bookkeeping needed
        output = model(input_batch, hidden, output_batch)
    # output: [steps, BATCH_SIZE(=1), N_CLASS]

    # Most likely class per step, as a flat list of Python ints.
    predict = output.argmax(dim=2).squeeze(1).tolist()
    decoded = [char_arr[i] for i in predict]
    print(decoded)

    if 'E' in decoded:
        end = decoded.index('E')
    elif 'P' in decoded:
        end = decoded.index('P')
    else:
        # Neither symbol was predicted: keep every step instead of raising ValueError.
        end = len(decoded)

    translated = ''.join(decoded[:end])

    return translated.replace('P', '')


In [19]:
print('test')
# Expressions to run through the model; each is printed as "<expression> -> <prediction>".
for expression in (
    '9-4-(3*5+9)',
    '0*(3*3-0)-((0))+4',
    '(9)-5-(4*(5*5))',
    '5*(0-(4/1+9)*2-(1-3))*(1)',
    '5-0/4/(1+9)+2-1',
    '5-(0)-(4+(1-9))',
):
    print(expression + ' ->', translate(expression))

test
['-', '.', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P']
9-4-(3*5+9) -> -.
['-', '.', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P']
0*(3*3-0)-((0))+4 -> -.
['-', '.', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P']
(9)-5-(4*(5*5)) -> -.
['-', '1', '2', '0', '.', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P']
5*(0-(4/1+9)*2-(1-3))*(1) -> -120.
['-', '.', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P']
5-0/4/(1+9)+2-1 -> -.
['-', '.', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', 'P']
5-(0)-(4+(1-9)) -> -.


In [16]:
### Ground-truth answers
import re

# Remove any 'E' end markers and 'P' padding characters from each answer string.
marker_or_pad = re.compile('[EP]')
[marker_or_pad.sub('', pair[1]) for pair in seq_data]

['-19', '4', '-96', '-120.0', '6.0', '9.0']