# An implementation of sequence to sequence learning for performind addition

Build a RNN sequence to sequence (encoder-decoder) model to learn addition.

The code are pretty much copied from
https://github.com/keras-team/keras/blob/master/examples/addition_rnn.py

In [1]:
import tensorflow as tf
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))


In [2]:
from keras.models import Sequential
from keras import layers
import numpy as np
from six.moves import range

Using TensorFlow backend.


In [3]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

AttributeError: module 'keras.backend.tensorflow_backend' has no attribute '_get_available_gpus'

### define classes for one-hot encoding

In [4]:
class CharacterTable(object):
    ''' Given a set of characters:
        + Encode them into a one hot interger representation
        + Decode the one hot integer representation to their character output
        + Decode a vector of probablities to their character output
    '''
    def __init__(self, chars):
        ''' 
        # Arguments:
            chars: Characters that can apeear in the input.
        '''
        self.chars = sorted(set(chars))
        self.char_indices = dict((c,i) for i,c in enumerate(self.chars))
        self.indices_char = dict((i,c) for i,c in enumerate(self.chars))
        
    def encode(self, C, num_rows):
        '''One hot encode given string C
        #Arguments:
            num_rows: number of rows int he returned on hot encoding.
        '''
        x = np.zeros((num_rows, len(self.chars)))
        for i,c in enumerate(C):
            x[i, self.char_indices[c]] = 1
        return x
    
    def decode(self, x, calc_argmax = True):
        if calc_argmax:
            x = x.argmax(axis=-1)
        return ''.join(self.indices_char[ic] for ic in x)

In [5]:
class colors:
    ok = '\033[92m'
    fail = '\033[91m'
    close = '\033[0m'

## Generate training/testing data

In [15]:
# Parameters for the model and dataset.
TRAINING_SIZE = 50000
DIGITS = 3
REVERSE = True

In [19]:
# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
# int is DIGITS.
MAXLEN = DIGITS + 1 + DIGITS

In [20]:
chars = '0123456789+ '
ctable = CharacterTable(chars)

In [21]:
print('Generating data...')
questions = []
expected = []
seen = set()
while len(questions)<TRAINING_SIZE:
    if (len(questions)%10000==0):
        print('%d/%d data has been generated' % (len(questions),TRAINING_SIZE))
    f = lambda: int(''.join(np.random.choice(list('0123456789'))
                           for i in range(np.random.randint(1,DIGITS+1))))
    a, b = f(), f()
    
    #skip addition questions we've already seen
    key = tuple(sorted((a,b)))
    if key in seen:
        continue
    seen.add(key)
    
    #pad the data with spaces such that it is always MAXLEN
    q = '{}+{}'.format(a,b)
    query = q+' '*(MAXLEN-len(q))
    ans = str(a+b)
    ans += ''*(DIGITS+1-len(ans))
    if REVERSE:
        query = query[::-1]
    questions.append(query)
    expected.append(ans)
print('Total addition question:', len(questions))

Generating data...
0/50000 data has been generated
10000/50000 data has been generated
10000/50000 data has been generated
20000/50000 data has been generated
20000/50000 data has been generated
20000/50000 data has been generated
30000/50000 data has been generated
40000/50000 data has been generated
Total addition question: 50000


In [31]:
print('vectorization..')
x = np.zeros((len(questions), MAXLEN, len(chars)), dtype = np.bool)
y = np.zeros((len(questions), DIGITS+1, len(chars)), dtype = np.bool)
for i, sentence in enumerate(questions):
    x[i] = ctable.encode(sentence, MAXLEN)
for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, DIGITS+1)

vectorization..


In [32]:
#shuffle (x,y)
indices = np.arange(len(y))
np.random.shuffle(indices)
x = x[indices]
y = y[indices]
# Explicitly set apart 10% for validation data that we never train over.
split_at = len(x) - len(x) // 10
(x_train, x_val) = x[:split_at], x[split_at:]
(y_train, y_val) = y[:split_at], y[split_at:]


print('Training Data:')
print(x_train.shape)
print(y_train.shape)

print('Validation Data:')
print(x_val.shape)
print(y_val.shape)


Training Data:
(45000, 7, 12)
(45000, 4, 12)
Validation Data:
(5000, 7, 12)
(5000, 4, 12)


### Build models

In [33]:
# Try replacing GRU, or SimpleRNN.
RNN = layers.SimpleRNN
HIDDEN_SIZE = 256
BATCH_SIZE = 128
LAYERS = 1

In [34]:
print('Build model...')
model = Sequential()

#encode the input using a RNN, producing an output of HIDDEN_SIZE
# Note: In a situation where your input sequences have a variable length,
# use input_shape=(None, num_feature).
model.add(RNN(HIDDEN_SIZE, input_shape = (MAXLEN, len(chars))))

#As the decoder RNN's input, repeatedly provide with the last hidden state of
# RNN for each time step. Repeat 'DIGITS + 1' times as that's the maximum
# length of output, e.g., when DIGITS=3, max output is 999+999=1998.
model.add(layers.RepeatVector(DIGITS + 1))

# The decoder RNN could be multiple layers staked or a single layer
for _ in range(LAYERS):
    model.add(RNN(HIDDEN_SIZE, return_sequences = True))

model.add(layers.TimeDistributed(layers.Dense(len(chars))))
model.add(layers.Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_3 (SimpleRNN)     (None, 256)               68864     
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 4, 256)            0         
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 4, 256)            131328    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 4, 12)             3084      
_________________________________________________________________
activation_2 (Activation)    (None, 4, 12)             0         
Total params: 203,276
Trainable params: 203,276
Non-trainable params: 0
_________________________________________________________________


In [35]:
for iteration in range(1,50):
    print('-'*20+'Interation:%d' % iteration + '-'*20)
    model.fit(x_train,y_train,batch_size = BATCH_SIZE,
                epochs = 10,
                validation_data = (x_val,y_val))
    
    #select 5 samples fromthe validation set at random to visualize erros
    #if iteration % 20!=0:
    #    continue
    for i in range(5):
        ind = np.random.randint(0,len(x_val))
        rowx,rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        preds = model.predict_classes(rowx, verbose = 0)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax = False)
        print('Q', q[::-1] if REVERSE else q, end=' ')
        print('T', correct, end=' ')
        if correct == guess:
            print(colors.ok+ '☑' + colors.close, end=' ')
        else:
            print(colors.fail + '☒' + colors.close, end=' ')
        print(guess)

--------------------Interation:1--------------------
Train on 45000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Q 66+83   T  149 [91m☒[0m 9410
Q 19+179  T  198 [91m☒[0m 8911
Q 775+78  T  853 [91m☒[0m 3581
Q 46+764  T  810 [91m☒[0m 0181
Q 775+18  T  793 [91m☒[0m 3971
--------------------Interation:2--------------------
Train on 45000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Q 23+818  T  841 [91m☒[0m 1489
Q 11+119  T  130 [91m☒[0m 0310
Q 138+909 T 1047 [92m☑[0m 7401
Q 47+652  T  699 [91m☒[0m 9968
Q 0+705   T  705 [91m☒[0m 5074
--------------------Interation:3--------------------
Train on 45000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Q 73+32   T  105 [91m☒[0m 591

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Q 4+232   T  236 [91m☒[0m 6323
Q 71+996  T 1067 [92m☑[0m 7601
Q 489+93  T  582 [91m☒[0m 2852
Q 901+618 T 1519 [91m☒[0m 8151
Q 94+599  T  693 [91m☒[0m 3963
--------------------Interation:7--------------------
Train on 45000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Q 64+916  T  980 [91m☒[0m 0891
Q 29+490  T  519 [91m☒[0m 9151
Q 106+8   T  114 [91m☒[0m 4115
Q 656+79  T  735 [91m☒[0m 5374
Q 349+9   T  358 [91m☒[0m 8539
--------------------Interation:8--------------------
Train on 45000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Q 541+4   T  545 [91m☒[0m 5454
Q 80+91   T  171 [91m☒[0m 1711
Q 997+886 T 1883 [92m☑[0m 3881
Q 125+98  T  223 [91m☒[0m 3221
Q 66+900  T  966 [91m☒[0m 6691
-

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Q 709+717 T 1426 [92m☑[0m 6241
Q 357+94  T  451 [91m☒[0m 1541
Q 41+920  T  961 [91m☒[0m 1691
Q 724+65  T  789 [91m☒[0m 9871
Q 68+63   T  131 [91m☒[0m 1311
--------------------Interation:12--------------------
Train on 45000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Q 86+278  T  364 [91m☒[0m 4631
Q 93+48   T  141 [91m☒[0m 1411
Q 9+194   T  203 [91m☒[0m 3021
Q 5+458   T  463 [91m☒[0m 3644
Q 97+810  T  907 [91m☒[0m 7091
--------------------Interation:13--------------------
Train on 45000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Q 978+48  T 1026 [92m☑[0m 6201
Q 399+9   T  408 [91m☒[0m 8041
Q 35+20   T   55 [91m☒[0m 5615
Q 714+71  T  785 [91m☒[0m 5871
Q 15+507  T  522 [91m☒[0m 2253
--------------------Interation:1

Epoch 10/10
Q 98+685  T  783 [91m☒[0m 3873
Q 513+8   T  521 [91m☒[0m 1251
Q 674+75  T  749 [91m☒[0m 9471
Q 8+863   T  871 [91m☒[0m 1788
Q 557+838 T 1395 [92m☑[0m 5931
--------------------Interation:17--------------------
Train on 45000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
 7936/45000 [====>.........................] - ETA: 12s - loss: 8.1981e-05 - acc: 0.7828

KeyboardInterrupt: 