# Solve addition as Question-Answer problem using Encoder-Decoder

## Create datasets

In [1]:
import numpy as np

In [2]:
def n(digits=3):
    number = ''
    for i in range(np.random.randint(1, digits+1)):
        number += np.random.choice(list('0123456789'))
    return int(number)

In [3]:
n()

468

In [4]:
a, b = n(), n()
question = '{}+{}'.format(a, b)

In [5]:
question

'8+877'

Use padding to get fixed length.

In [6]:
def padding(chars, maxlen):
    return chars + ' ' * (maxlen - len(chars))

In [7]:
digits = 3

In [8]:
input_digits = digits * 2 + 1 #3-digit '+' 3-digit

question = '{}+{}'.format(a, b)
question = padding(question, input_digits)

In [9]:
question

'8+877  '

Generate questions and answers.

In [10]:
digits = 3
input_digits = digits * 2 + 1
output_digits = digits + 1 #maximum 4-digits

added = set()
questions = []
answers = []

N = 20000

for i in range(N):
    a, b = n(), n()
    pair = tuple(sorted((a, b))) #To eliminate redundancy
    
    #Pick up only non-existing question
    if pair in added:
        continue
    
    question = '{}+{}'.format(a, b)
    question = padding(question, input_digits)
    answer = str(a+b)
    answer = padding(answer, output_digits)
    
    added.add(pair)
    questions.append(question)
    answers.append(answer)

In [11]:
questions[0:5]

['902+46 ', '409+439', '7+123  ', '45+6   ', '30+575 ']

In [12]:
answers[0:5]

['948 ', '848 ', '130 ', '51  ', '605 ']

One hot encoding.

In [13]:
chars = '0123456789+ '
char_indices = dict((c,i) for i,c in enumerate(chars))
indices_char = dict((i,c) for i,c in enumerate(chars))

In [14]:
char_indices

{' ': 11,
 '+': 10,
 '0': 0,
 '1': 1,
 '2': 2,
 '3': 3,
 '4': 4,
 '5': 5,
 '6': 6,
 '7': 7,
 '8': 8,
 '9': 9}

In [15]:
X = np.zeros((len(questions), input_digits, len(chars)), dtype=np.integer)
Y = np.zeros((len(questions), output_digits, len(chars)), dtype=np.integer)

In [16]:
X.shape

(11750, 7, 12)

In [17]:
for i in range(len(questions)):
    for t, char in enumerate(questions[i]):
        X[i, t, char_indices[char]] = 1
    for t, char in enumerate(answers[i]):
        Y[i, t, char_indices[char]] = 1

In [18]:
X[0,...]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [19]:
Y[0,...]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

Split total data into training and testing.

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, train_size=10000)



In [22]:
X_train.shape

(10000, 7, 12)

In [23]:
X_validation.shape

(1750, 7, 12)

## Encoder-Decoder Tensorflow implementation

In [24]:
import tensorflow as tf
from tensorflow.contrib import rnn

In [25]:
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

In [26]:
def inference(x, y, n_batch, is_training, input_digits=None, output_digits=None, n_hidden=None, n_out=None):
    #Encoder
    encoder = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
    state = encoder.zero_state(n_batch, tf.float32)
    
    encoder_outputs =[]
    encoder_states = []
    
    with tf.variable_scope('Encoder'):
        for t in range(input_digits):
            if t > 0:
                tf.get_variable_scope().reuse_variables() #reuse variables for t > 0 steps
            (output, state) = encoder(x[:, t, :], state) #encoder(batch, input, hidden)
            encoder_outputs.append(output)
            encoder_states.append(state)
    
    #Decoder
    decoder = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
    state = encoder_states[-1] #Last state of encoder: this is for the initial input for decoder
    decoder_outputs = [encoder_outputs[-1]] #Last output of encoder: this is for the initial input for decoder
    
    V = weight_variable([n_hidden, n_out])
    c = bias_variable([n_out])
    outputs = [] #List for storing predictions
    
    with tf.variable_scope('Decoder'):
        for t in range(1, output_digits):
            if t > 1:
                tf.get_variable_scope().reuse_variables()
            
            if is_training is True:
                (output, state) = decoder(y[:, t-1, :], state) #decoder's input is the previous time step
            else:
                linear = tf.matmul(decoder_outputs[-1], V) + c
                out = tf.nn.softmax(linear)
                outputs.append(out)
                out = tf.one_hot(tf.argmax(out, -1), depth=output_digits)
                (output, state) = decoder(out, state)
            
            decoder_outputs.append(output)
    
    if is_training is True:
        output = tf.reshape(tf.concat(decoder_outputs, axis=1), [-1, output_digits, n_hidden])
        linear = tf.einsum('ijk,kl->ijl', output, V) + c #output dim: (batch, output_digits, n_hidden)
        
        return tf.nn.softmax(linear)
    else:
        linear = tf.matmul(decoder_outputs[-1], V) + c
        out = tf.nn.softmax(linear)
        outputs.append(out)
        output = tf.reshape(tf.concat(outputs, axis=1), [-1, output_digits, n_out])
        
        return output

In [27]:
def loss(y, t):
    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(t * tf.log(tf.clip_by_value(y, 1e-10, 1.0)), reduction_indices=[1]))
    
    return cross_entropy

In [28]:
def training(loss):
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999)
    train_step = optimizer.minimize(loss)
    
    return train_step

In [29]:
def accuracy(y, t):
    correct_prediction = tf.equal(tf.argmax(y, -1), tf.argmax(t, -1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    return accuracy

## Training

In [30]:
n_in = len(chars)
n_hidden = 128
n_out = len(chars)

x = tf.placeholder(tf.float32, shape=[None, input_digits, n_in])
t = tf.placeholder(tf.float32, shape=[None, output_digits, n_out])
n_batch = tf.placeholder(tf.int32, shape=[]) #Need explicit shape
is_training = tf.placeholder(tf.bool, shape=[])

y = inference(x, t, n_batch, is_training
              , input_digits=input_digits, output_digits=output_digits, n_hidden=n_hidden, n_out=n_out)

loss = loss(y, t)
train_step = training(loss)

acc = accuracy(y, t)

In [31]:
epochs = 200

batch_size = 16
n_batches = len(X_train) // batch_size

N_validation = len(X_validation)

In [32]:
sess = tf.Session()

init = tf.global_variables_initializer()
sess.run(init)

In [33]:
from sklearn.utils import shuffle

In [34]:
for epoch in range(epochs):
    X_, Y_ = shuffle(X_train, Y_train)
    
    for i in range(n_batches):
        start = i * batch_size
        end = start + batch_size
        
        sess.run(train_step, feed_dict={x:X_[start:end], t:Y_[start:end], n_batch:batch_size, is_training:True})
    
    val_loss = loss.eval(session=sess
                         , feed_dict={x:X_validation, t:Y_validation, n_batch:N_validation, is_training:False})
    val_acc = acc.eval(session=sess
                       , feed_dict={x:X_validation, t:Y_validation, n_batch:N_validation, is_training:False})
    
    print("epoch: {}, val_loss: {:.4f}, val_acc {:.4f}".format(epoch, val_loss, val_acc))
    
    
    #Print prediction results for each 10 epoch
    if epoch % 10 == 0:
        for i in range(10):
            index = np.random.randint(0, N_validation)
            
            question = X_validation[np.array([index])]
            answer = Y_validation[np.array([index])]
            prediction = y.eval(session=sess,
                               feed_dict={x:question, n_batch:1, is_training:False})
            
            question = question.argmax(axis=-1)
            answer = answer.argmax(axis=-1)
            prediction = prediction.argmax(axis=-1)
            
            q = ''.join(indices_char[i] for i in question[0])
            a = ''.join(indices_char[i] for i in answer[0])
            p = ''.join(indices_char[i] for i in prediction[0])
            
            print('-' * 40)
            print('Question:', q, ' Prediction:', p, ' T/F:', end=' ')
            if a == p:
                print('T')
            else:
                print('F')

epoch: 0, val_loss: 0.5688, val_acc 0.3853
----------------------------------------
Question: 94+88    Prediction: 909   T/F: F
----------------------------------------
Question: 8+966    Prediction: 109   T/F: F
----------------------------------------
Question: 442+931  Prediction: 444   T/F: F
----------------------------------------
Question: 591+77   Prediction: 109   T/F: F
----------------------------------------
Question: 16+27    Prediction: 72    T/F: F
----------------------------------------
Question: 656+781  Prediction: 166   T/F: F
----------------------------------------
Question: 8+76     Prediction: 77    T/F: F
----------------------------------------
Question: 3+354    Prediction: 44    T/F: F
----------------------------------------
Question: 673+21   Prediction: 732   T/F: F
----------------------------------------
Question: 982+249  Prediction: 100   T/F: F
epoch: 1, val_loss: 0.5428, val_acc 0.4021
epoch: 2, val_loss: 0.5146, val_acc 0.4286
epoch: 3, val_loss: 0

epoch: 61, val_loss: 0.2267, val_acc 0.8281
epoch: 62, val_loss: 0.2446, val_acc 0.8209
epoch: 63, val_loss: 0.2403, val_acc 0.8134
epoch: 64, val_loss: 0.2343, val_acc 0.8230
epoch: 65, val_loss: 0.2336, val_acc 0.8251
epoch: 66, val_loss: 0.2314, val_acc 0.8287
epoch: 67, val_loss: 0.2359, val_acc 0.8281
epoch: 68, val_loss: 0.2385, val_acc 0.8274
epoch: 69, val_loss: 0.3695, val_acc 0.7507
epoch: 70, val_loss: 0.2450, val_acc 0.8101
----------------------------------------
Question: 1+153    Prediction: 154   T/F: T
----------------------------------------
Question: 37+6     Prediction: 44    T/F: F
----------------------------------------
Question: 79+89    Prediction: 168   T/F: T
----------------------------------------
Question: 83+40    Prediction: 123   T/F: T
----------------------------------------
Question: 720+646  Prediction: 1066  T/F: F
----------------------------------------
Question: 4+33     Prediction: 38    T/F: F
----------------------------------------
Question:

epoch: 131, val_loss: 0.2915, val_acc 0.8200
epoch: 132, val_loss: 0.2999, val_acc 0.8151
epoch: 133, val_loss: 0.2988, val_acc 0.8163
epoch: 134, val_loss: 0.2907, val_acc 0.8213
epoch: 135, val_loss: 0.3093, val_acc 0.8104
epoch: 136, val_loss: 0.2908, val_acc 0.8241
epoch: 137, val_loss: 0.2988, val_acc 0.8217
epoch: 138, val_loss: 0.2829, val_acc 0.8300
epoch: 139, val_loss: 0.2831, val_acc 0.8291
epoch: 140, val_loss: 0.2844, val_acc 0.8293
----------------------------------------
Question: 369+35   Prediction: 394   T/F: F
----------------------------------------
Question: 65+29    Prediction: 93    T/F: F
----------------------------------------
Question: 503+351  Prediction: 865   T/F: F
----------------------------------------
Question: 10+78    Prediction: 88    T/F: T
----------------------------------------
Question: 656+17   Prediction: 683   T/F: F
----------------------------------------
Question: 10+5     Prediction: 15    T/F: T
----------------------------------------