In [21]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import metrics
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM, GRU
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from __future__ import print_function
from keras.layers.core import Activation, TimeDistributedDense, RepeatVector
from keras.layers import recurrent
import numpy as np

In [3]:
class CharacterTable(object):
    '''
    Given a set of characters:
    + Encode them to a one hot integer representation
    + Decode the one hot integer representation to their character output
    + Decode a vector of probabilties to their character output
    '''
    def __init__(self, chars, maxlen):
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
        self.maxlen = maxlen

    def encode(self, C, maxlen=None):
        maxlen = maxlen if maxlen else self.maxlen
        X = np.zeros((maxlen, len(self.chars)))
        for i, c in enumerate(C):
            X[i, self.char_indices[c]] = 1
        return X

    def decode(self, X, calc_argmax=True):
        if calc_argmax:
            X = X.argmax(axis=-1)
        return ''.join(self.indices_char[x] for x in X)

In [7]:
def generate(digits):
    return ''.join([str(np.random.randint(0, 9)) for _ in range(digits)])

In [79]:
TRAINING_SIZE = 50000
TEST_SIZE = 10000
DIGITS = 50
MAXLEN = DIGITS
chars = '0123456789'
ctable = CharacterTable(chars, MAXLEN)

In [80]:
inputs = []
outputs = []
inputs_t = []
outputs_t = []
print('Generating data...')
while len(inputs) < TRAINING_SIZE:
    s = generate(DIGITS)
    inputs.append(s)
    outputs.append(s[::-1])
    
while len(inputs_t) < TEST_SIZE:
    s = generate(DIGITS)
    inputs_t.append(s)
    outputs_t.append(s[::-1])
print(inputs[12])
print(outputs[12])
print(inputs_t[12])
print(outputs_t[12])

Generating data...
43475457570151080538413805578265851041648730143840
04834103784614015856287550831483508015107575457434
12548177470071435851317678881702480772000564513724
42731546500027708420718887671315853417007477184521


In [81]:
print('Vectorization...')
X = np.zeros((len(inputs), MAXLEN, len(chars)), dtype=np.bool)
y = np.zeros((len(outputs), MAXLEN, len(chars)), dtype=np.bool)
for i, sentence in enumerate(inputs):
    X[i] = ctable.encode(sentence, maxlen=MAXLEN)
for i, sentence in enumerate(outputs):
    y[i] = ctable.encode(sentence, maxlen=MAXLEN)
    
X_test = np.zeros((len(inputs_t), MAXLEN, len(chars)), dtype=np.bool)
y_test = np.zeros((len(outputs_t), MAXLEN, len(chars)), dtype=np.bool)
for i, sentence in enumerate(inputs_t):
    X_test[i] = ctable.encode(sentence, maxlen=MAXLEN)
for i, sentence in enumerate(outputs_t):
    y_test[i] = ctable.encode(sentence, maxlen=MAXLEN)
    
print(X[12])

Vectorization...
[[False False False False  True False False False False False]
 [False False False  True False False False False False False]
 [False False False False  True False False False False False]
 [False False False False False False False  True False False]
 [False False False False False  True False False False False]
 [False False False False  True False False False False False]
 [False False False False False  True False False False False]
 [False False False False False False False  True False False]
 [False False False False False  True False False False False]
 [False False False False False False False  True False False]
 [ True False False False False False False False False False]
 [False  True False False False False False False False False]
 [False False False False False  True False False False False]
 [False  True False False False False False False False False]
 [ True False False False False False False False False False]
 [False False False False False False 

In [82]:
indices = np.arange(len(y))
np.random.shuffle(indices)
X = X[indices]
y = y[indices]
print(X.shape)
print(y.shape)
print(X_test.shape)
print(y_test.shape)

(50000, 50, 10)
(50000, 50, 10)
(10000, 50, 10)
(10000, 50, 10)


In [83]:
HIDDEN_SIZE = 128
BATCH_SIZE = 100
LAYERS = 2

In [84]:
print('Build model...')
model = Sequential()
# dropout_W=0.1, dropout_U=0.1,
model.add(LSTM(HIDDEN_SIZE, input_shape=(MAXLEN, len(chars)), return_sequences=True))
for _ in range(LAYERS - 2):
    model.add(LSTM(HIDDEN_SIZE, return_sequences=True))
model.add(LSTM(HIDDEN_SIZE))
model.add(RepeatVector(MAXLEN))
for _ in range(LAYERS):
    model.add(LSTM(HIDDEN_SIZE, return_sequences=True))

model.add(TimeDistributedDense(len(chars)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam')

Build model...


In [None]:
hist = model.fit(X, y, batch_size=BATCH_SIZE, nb_epoch=5,
          show_accuracy=True,validation_split = 0.1, shuffle=True)

Train on 45000 samples, validate on 5000 samples
Epoch 1/5

In [78]:
score, acc = model.evaluate(X_test, y_test,
                            batch_size=BATCH_SIZE,
                            show_accuracy=True)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.000568637109245
Test accuracy: 1.0


In [70]:
test = '12345678901234567898'
x_test = ctable.encode(test, maxlen=MAXLEN)
X_test = np.zeros((1, MAXLEN, len(chars)), dtype=np.bool)
X_test[0] = x_test
res = model.predict_classes(X_test)
res[0]



array([8, 8, 7, 7, 6, 5, 4, 3, 2, 1, 0, 0, 8, 7, 7, 5, 4, 3, 2, 1])

In [28]:
import matplotlib.pyplot as plt
axes = plt.gca()
x_min = hist.epoch[0]
x_max = hist.epoch[-1]+1
axes.set_xlim([x_min,x_max])

plt.scatter(hist.epoch, hist.history['loss'], color='g')
plt.plot(hist.history['loss'], color='g', label='Training Loss')
plt.scatter(hist.epoch, hist.history['val_loss'], color='b')
plt.plot(hist.history['val_loss'], color='b', label='Validation Loss')
plt.xlabel('epochs')
plt.ylabel('Loss')
plt.title('Training Loss & Validation Loss vs Epochs')
plt.legend()

plt.figure(2)

axes = plt.gca()
x_min = hist.epoch[0]
x_max = hist.epoch[-1]+1
axes.set_xlim([x_min,x_max])

plt.scatter(hist.epoch, hist.history['acc'], color='r')
plt.plot(hist.history['acc'], color='r', label='Training Accuracy')
plt.scatter(hist.epoch, hist.history['val_acc'], color='c')
plt.plot(hist.history['val_acc'], color='c', label='Validation Accuracy')
plt.xlabel('epochs')
plt.ylabel('Accuracy')
plt.title('Trainging Accuracy & Validation Accuracy vs Epochs')
plt.legend()

plt.show()