# An implementation of sequence to sequence learning for performind addition

Build a RNN sequence to sequence (encoder-decoder) model to learn addition.

The code are pretty much copied from
https://github.com/keras-team/keras/blob/master/examples/addition_rnn.py

In [1]:
import tensorflow as tf
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))


  from ._conv import register_converters as _register_converters


In [2]:
from keras.models import Sequential
from keras import layers
import numpy as np
from six.moves import range

Using TensorFlow backend.


### define classes for one-hot encoding

In [3]:
class CharacterTable(object):
    ''' Given a set of characters:
        + Encode them into a one hot interger representation
        + Decode the one hot integer representation to their character output
        + Decode a vector of probablities to their character output
    '''
    def __init__(self, chars):
        ''' 
        # Arguments:
            chars: Characters that can apeear in the input.
        '''
        self.chars = sorted(set(chars))
        self.char_indices = dict((c,i) for i,c in enumerate(self.chars))
        self.indices_char = dict((i,c) for i,c in enumerate(self.chars))
        
    def encode(self, C, num_rows):
        '''One hot encode given string C
        #Arguments:
            num_rows: number of rows int he returned on hot encoding.
        '''
        x = np.zeros((num_rows, len(self.chars)))
        for i,c in enumerate(C):
            x[i, self.char_indices[c]] = 1
        return x
    
    def decode(self, x, calc_argmax = True):
        if calc_argmax:
            x = x.argmax(axis=-1)
        return ''.join(self.indices_char[ic] for ic in x)

In [4]:
class colors:
    ok = '\033[92m'
    fail = '\033[91m'
    close = '\033[0m'

## Generate training/testing data

In [5]:
# Parameters for the model and dataset.
TRAINING_SIZE = 100000
DIGITS = 3
REVERSE = True

In [6]:
# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
# int is DIGITS.
MAXLEN = DIGITS + 1 + DIGITS

In [7]:
chars = '0123456789+ '
ctable = CharacterTable(chars)

In [8]:
print('Generating data...')
questions = []
expected = []
seen = set()
while len(questions)<TRAINING_SIZE:
    if (len(questions)%10000==0):
        print('%d/%d data has been generated' % (len(questions),TRAINING_SIZE))
    f = lambda: int(''.join(np.random.choice(list('0123456789'))
                           for i in range(np.random.randint(1,DIGITS+1))))
    a, b = f(), f()
    
    #skip addition questions we've already seen
    key = tuple(sorted((a,b)))
    if key in seen:
        continue
    seen.add(key)
    
    #pad the data with spaces such that it is always MAXLEN
    q = '{}+{}'.format(a,b)
    query = q+' '*(MAXLEN-len(q))
    ans = str(a+b)
    ans += ''*(DIGITS+1-len(ans))
    if REVERSE:
        query = query[::-1]
    questions.append(query)
    expected.append(ans)
print('Total addition question:', len(questions))

Generating data...
0/100000 data has been generated
10000/100000 data has been generated
10000/100000 data has been generated
20000/100000 data has been generated
30000/100000 data has been generated
30000/100000 data has been generated
30000/100000 data has been generated
40000/100000 data has been generated
50000/100000 data has been generated
50000/100000 data has been generated
50000/100000 data has been generated
60000/100000 data has been generated
60000/100000 data has been generated
70000/100000 data has been generated
70000/100000 data has been generated
70000/100000 data has been generated
70000/100000 data has been generated
70000/100000 data has been generated
70000/100000 data has been generated
70000/100000 data has been generated
70000/100000 data has been generated
80000/100000 data has been generated
80000/100000 data has been generated
80000/100000 data has been generated
80000/100000 data has been generated
80000/100000 data has been generated
80000/100000 data has b

In [9]:
print('vectorization..')
x = np.zeros((len(questions), MAXLEN, len(chars)), dtype = np.bool)
y = np.zeros((len(questions), DIGITS+1, len(chars)), dtype = np.bool)
for i, sentence in enumerate(questions):
    x[i] = ctable.encode(sentence, MAXLEN)
for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, DIGITS+1)

vectorization..


In [10]:
#shuffle (x,y)
indices = np.arange(len(y))
np.random.shuffle(indices)
x = x[indices]
y = y[indices]
# Explicitly set apart 10% for validation data that we never train over.
split_at = len(x) - len(x) // 8
(x_train, x_val) = x[:split_at], x[split_at:]
(y_train, y_val) = y[:split_at], y[split_at:]


print('Training Data:')
print(x_train.shape)
print(y_train.shape)

print('Validation Data:')
print(x_val.shape)
print(y_val.shape)


Training Data:
(87500, 7, 12)
(87500, 4, 12)
Validation Data:
(12500, 7, 12)
(12500, 4, 12)


### Build models

In [11]:
# Try replacing GRU, or SimpleRNN.
RNN = layers.CuDNNLSTM
HIDDEN_SIZE = 128
BATCH_SIZE = 128
LAYERS = 2

In [12]:
print('Build model...')
model = Sequential()

#encode the input using a RNN, producing an output of HIDDEN_SIZE
# Note: In a situation where your input sequences have a variable length,
# use input_shape=(None, num_feature).
model.add(RNN(HIDDEN_SIZE, input_shape = (MAXLEN, len(chars))))

#As the decoder RNN's input, repeatedly provide with the last hidden state of
# RNN for each time step. Repeat 'DIGITS + 1' times as that's the maximum
# length of output, e.g., when DIGITS=3, max output is 999+999=1998.
model.add(layers.RepeatVector(DIGITS + 1))

# The decoder RNN could be multiple layers staked or a single layer
for _ in range(LAYERS):
    model.add(RNN(HIDDEN_SIZE, return_sequences = True))

model.add(layers.TimeDistributed(layers.Dense(len(chars))))
model.add(layers.Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_1 (CuDNNLSTM)     (None, 128)               72704     
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 4, 128)            0         
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 4, 128)            132096    
_________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)     (None, 4, 128)            132096    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 4, 12)             1548      
_________________________________________________________________
activation_1 (Activation)    (None, 4, 12)             0         
Total params: 338,444
Trainable params: 338,444
Non-trainable params: 0
_______________________________________________________

In [None]:
for iteration in range(1,50):
    print('-'*20+'Interation:%d' % iteration + '-'*20)
    model.fit(x_train,y_train,batch_size = BATCH_SIZE,
                epochs = 10,
                validation_data = (x_val,y_val))
    
    #select 5 samples fromthe validation set at random to visualize erros
    #if iteration % 20!=0:
    #    continue
    for i in range(5):
        ind = np.random.randint(0,len(x_val))
        rowx,rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        preds = model.predict_classes(rowx, verbose = 0)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax = False)
        print('Q', q[::-1] if REVERSE else q, end=' ')
        print('T', correct, end=' ')
        if correct == guess:
            print(colors.ok+ '☑' + colors.close, end=' ')
        else:
            print(colors.fail + '☒' + colors.close, end=' ')
        print(guess)

--------------------Interation:1--------------------
Train on 87500 samples, validate on 12500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

In [18]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']