# sequence to sequence model with MNIST

paper : [Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation(2014)](https://arxiv.org/pdf/1406.1078.pdf)

In [0]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable

dtype = torch.FloatTensor

# Data encoding이 다 되었다고 가정하고 시작

the basic code was the project of translation model from english to french

In [0]:
# S: Symbol that shows starting of decoding input
# E: Symbol that shows starting of decoding output
# P: Symbol that will fill in blank sequence if current batch data size is short than time steps

char_arr = [c for c in 'SEP.1234567890(){}[]+-*/']
num_dic = {n: i for i, n in enumerate(char_arr)}

seq_data = [['9-4-(3*5+9)', '-19'], ['0*(3*3-0)-((0))+4', '4'], ['(9)-5-(4*(5*5))', '-96'], ['5*(0-(4/1+9)*2-(1-3))*(1)', '-120.0'], ['5-0/4/(1+9)+2-1', '6.0'], ['5-(0)-(4+(1-9))', '9.0']]

## 1) Parameter 세팅하기

In [0]:
# Seq2Seq Parameter
n_step = 25 # maxlen+1을 의미함
n_hidden = 128
n_class = len(num_dic)
batch_size = len(seq_data)

## 2) Preprocess the data

In [0]:
def make_batch(seq_data):
    input_batch, output_batch, target_batch = [], [], []

    for idx, seq in enumerate(seq_data):
      
      # input은 패딩, target은 한 후 padding 지점 앞 marking
      for i in range(len(seq)):
        seq[i] = seq[i] + 'P' * (n_step - len(seq[i]))
#       print(seq)
      start_output=seq[1].index('P')
      seq[1] = seq[1][:start_output] + 'E' + seq[1][(start_output+1):]

      input = [num_dic[n] for n in seq[0]]
      output = [num_dic[n] for n in ('S' + seq[1][:-1])]
      target = [num_dic[n] for n in seq[1]]

      #예시 input, output, target 프린트
      if idx==1: print("calculation: {}\ninput : {}\noutput :{}\ntarget : {}".format(seq,input, output, target))


      input_batch.append(np.eye(n_class)[input])
      output_batch.append(np.eye(n_class)[output])
      # one-hot으로 들어가는 것 아님
      target_batch.append(target)

    # make tensor
    return Variable(torch.Tensor(input_batch)), Variable(torch.Tensor(output_batch)), Variable(torch.LongTensor(target_batch))

  

In [5]:
input_batch, output_batch, target_batch = make_batch(seq_data)

calculation: ['0*(3*3-0)-((0))+4PPPPPPPP', '4EPPPPPPPPPPPPPPPPPPPPPPP']
input : [13, 22, 14, 6, 22, 6, 21, 13, 15, 21, 14, 14, 13, 15, 15, 20, 7, 2, 2, 2, 2, 2, 2, 2, 2]
output :[0, 7, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
target : [7, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [6]:
len(input_batch[0])

25

In [7]:
len(output_batch[0])

25

S: Symbol that shows starting of decoding input

E: Symbol that shows starting of decoding output

P: Symbol that will fill in blank sequence if current batch data size is short than time steps 

---

대략적인 data size에 대한 감을 익히기 위한 안내

- len(input_batch) : data_pair_size

- len(input_batch[0]) : n_step, number of step

- len(input_batch[0][0]) : |vocab|

In [8]:
print("INPUT BATCH - {}, \nINPUT BATCH LENGTH - {}".format(input_batch, len(input_batch)))
print("OUTPUT BATCH - {}, \nOUTPUT BATCH LENGTH - {}".format(output_batch, len(output_batch)))
print("TARGET BATCH - {}, \nOUTPUT BATCH LENGTH - {}".format(target_batch, len(target_batch)))

INPUT BATCH - tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 1., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 1., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 1., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [9]:
print("data size : {}".format(len(input_batch)))
print("padded sentence size : {}".format(len(input_batch[0])))
print("vocabulary size : {}".format(len(input_batch[0][0])))

data size : 6
padded sentence size : 25
vocabulary size : 24


## 3) Model 구성하기

In [0]:
class Seq2Seq(nn.Module):
    def __init__(self):
        super(Seq2Seq, self).__init__()

        self.enc_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5)
        self.dec_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden, dropout=0.5)
        self.fc = nn.Linear(n_hidden, n_class)

    def forward(self, enc_input, enc_hidden, dec_input):
        enc_input = enc_input.transpose(0, 1) # enc_input: [max_len(=n_step, time step), batch_size, n_hidden]
        dec_input = dec_input.transpose(0, 1) # dec_input: [max_len(=n_step, time step), batch_size, n_hidden]

        # enc_states : [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
        _, enc_states = self.enc_cell(enc_input, enc_hidden)
        # outputs : [max_len+1(=6), batch_size, num_directions(=1) * n_hidden(=128)]
        outputs, _ = self.dec_cell(dec_input, enc_states)

        model = self.fc(outputs) # model : [max_len+1(=6), batch_size, n_class]
        return model



In [11]:
model = Seq2Seq()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


  "num_layers={}".format(dropout, num_layers))


## 4) 학습 시키기

In [12]:
for epoch in range(5000):
    hidden = Variable(torch.zeros(1, batch_size, n_hidden))

    optimizer.zero_grad()
    output = model(input_batch, hidden, output_batch)
    output = output.transpose(0, 1)
    loss = 0
    for i in range(0, len(target_batch)):
        loss += criterion(output[i], target_batch[i])
    if (epoch + 1) % 1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()



Epoch: 1000 cost = 0.359929
Epoch: 2000 cost = 0.234778
Epoch: 3000 cost = 0.003447
Epoch: 4000 cost = 0.000744
Epoch: 5000 cost = 0.000318


## 5) Test하기

In [0]:
def translate(word):
    input_batch, output_batch, _ = make_batch([[word, 'P' * len(word)]])

    # make hidden shape [num_layers * num_directions, batch_size, n_hidden]
    hidden = Variable(torch.zeros(1, 1, n_hidden))
    output = model(input_batch, hidden, output_batch)
    # output : [max_len+1, batch_size(=1), n_class]

    predict = output.data.max(2, keepdim=True)[1] # select n_class dimension
    decoded = [char_arr[i] for i in predict]
    end = decoded.index('E') if 'E' in decoded else decoded.index('P')
      
     
    translated = ''.join(decoded[:end])

    return translated.replace('P', '')


In [14]:
print('test')
print('9-4-(3*5+9) ->', translate('9-4-(3*5+9)'))
print('0*(3*3-0)-((0))+4 ->', translate('0*(3*3-0)-((0))+4'))
print('(9)-5-(4*(5*5)) ->', translate('(9)-5-(4*(5*5))'))
print('5*(0-(4/1+9)*2-(1-3))*(1) ->', translate('5*(0-(4/1+9)*2-(1-3))*(1)'))
print('5-0/4/(1+9)+2-1 ->', translate('5-0/4/(1+9)+2-1'))
print('5-(0)-(4+(1-9)) ->', translate('5-(0)-(4+(1-9))'))

test
9-4-(3*5+9) -> -19
0*(3*3-0)-((0))+4 -> 4.
(9)-5-(4*(5*5)) -> -96
5*(0-(4/1+9)*2-(1-3))*(1) -> -12
5-0/4/(1+9)+2-1 -> 6.
5-(0)-(4+(1-9)) -> 9.0


In [15]:
### 정답
import re
[re.sub('[EP]', '',
        i[1]) for i in seq_data]

['-19', '4', '-96', '-120.0', '6.0', '9.0']