In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import metrics
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM, GRU
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from __future__ import print_function
from keras.layers.core import Activation, TimeDistributedDense, RepeatVector
from keras.layers import recurrent
import numpy as np

Using Theano backend.
Using gpu device 0: GeForce GTX 980 Ti (CNMeM is disabled, CuDNN 4007)


In [1]:
class CharacterTable(object):
    def __init__(self, vocab, maxlen):
        self.vocab = vocab
        self.maxlen = maxlen
    
    def encode(self, C, maxlen=None):
        maxlen = maxlen if maxlen else self.maxlen
        X = np.zeros((maxlen, len(self.vocab)))
        for i, c in enumerate(C):
            X[i, c] = 1
        return X
    
    def decode(self, X, calc_argmax=True):
        if calc_argmax:
            X = X.argmax(axis=-1)
        return ','.join(x for x in X)

In [3]:
def generateRandSeq(min, max, len):
    return [np.random.randint(min, max) for _ in range(len)]

In [4]:
TRAINING_SIZE = 50000
TEST_SIZE = 10000
DIGITS = 25
MAXLEN = DIGITS
voc = list(xrange(100))
ctable = CharacterTable(voc, MAXLEN)

In [6]:
inputs = []
outputs = []
inputs_t = []
outputs_t = []
print('Generating data...')
while len(inputs) < TRAINING_SIZE:
    s = generateRandSeq(0, len(voc), DIGITS)
    inputs.append(s)
    # outputs.append(s[::-1])
    outputs.append(sorted(s))

while len(inputs_t) < TEST_SIZE:
    s = generateRandSeq(0, len(voc), DIGITS)
    inputs_t.append(s)
    # outputs_t.append(s[::-1])
    outputs_t.append(sorted(s))
print(inputs[12])
print(outputs[12])

Generating data...
[42, 15, 82, 74, 60, 63, 16, 98, 10, 72, 62, 41, 14, 88, 91, 39, 28, 70, 98, 37, 14, 61, 60, 41, 57]
[10, 14, 14, 15, 16, 28, 37, 39, 41, 41, 42, 57, 60, 60, 61, 62, 63, 70, 72, 74, 82, 88, 91, 98, 98]


In [7]:
print('Vectorization...')
X = np.zeros((len(inputs), MAXLEN, len(voc)), dtype=np.bool)
y = np.zeros((len(outputs), MAXLEN, len(voc)), dtype=np.bool)
for i, sentence in enumerate(inputs):
    X[i] = ctable.encode(sentence, maxlen=MAXLEN)

for i, sentence in enumerate(outputs):
    y[i] = ctable.encode(sentence, maxlen=MAXLEN)

X_test = np.zeros((len(inputs_t), MAXLEN, len(voc)), dtype=np.bool)
y_test = np.zeros((len(outputs_t), MAXLEN, len(voc)), dtype=np.bool)
for i, sentence in enumerate(inputs_t):
    X_test[i] = ctable.encode(sentence, maxlen=MAXLEN)

for i, sentence in enumerate(outputs_t):
    y_test[i] = ctable.encode(sentence, maxlen=MAXLEN)

Vectorization...


In [8]:
print(X.shape)
print(y.shape)
print(X_test.shape)
print(y_test.shape)

(50000, 25, 100)
(50000, 25, 100)
(10000, 25, 100)
(10000, 25, 100)


In [83]:
HIDDEN_SIZE = 256
BATCH_SIZE = 200
LAYERS = 2

In [84]:
print('Build model...')
model = Sequential()
model.add(LSTM(HIDDEN_SIZE, input_shape=(MAXLEN, len(voc)), return_sequences=True))
for _ in range(LAYERS - 2):
    model.add(LSTM(HIDDEN_SIZE, return_sequences=True))

model.add(LSTM(HIDDEN_SIZE))
model.add(RepeatVector(MAXLEN))
for _ in range(LAYERS):
    model.add(LSTM(HIDDEN_SIZE, return_sequences=True))

model.add(TimeDistributedDense(len(voc)))
model.add(Activation('softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Build model...


In [None]:
hist = model.fit(X, y, batch_size=BATCH_SIZE, nb_epoch=5,
          show_accuracy=True,validation_split = 0.1, shuffle=True)

Train on 45000 samples, validate on 5000 samples
Epoch 1/5

In [78]:
hist = model.fit(X, y, batch_size=BATCH_SIZE, nb_epoch=17, callbacks=[early_stopping],
          validation_split = 0.1, shuffle=True)

score, acc = model.evaluate(X_test, y_test,
                            batch_size=BATCH_SIZE,
                            show_accuracy=True)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.000568637109245
Test accuracy: 1.0


In [70]:
test = '12345678901234567898'
x_test = ctable.encode(test, maxlen=MAXLEN)
X_test = np.zeros((1, MAXLEN, len(chars)), dtype=np.bool)
X_test[0] = x_test
res = model.predict_classes(X_test)
res[0]



array([8, 8, 7, 7, 6, 5, 4, 3, 2, 1, 0, 0, 8, 7, 7, 5, 4, 3, 2, 1])

In [28]:
import matplotlib.pyplot as plt
axes = plt.gca()
x_min = hist.epoch[0]
x_max = hist.epoch[-1]+1
axes.set_xlim([x_min,x_max])

plt.scatter(hist.epoch, hist.history['loss'], color='g')
plt.plot(hist.history['loss'], color='g', label='Training Loss')
plt.scatter(hist.epoch, hist.history['val_loss'], color='b')
plt.plot(hist.history['val_loss'], color='b', label='Validation Loss')
plt.xlabel('epochs')
plt.ylabel('Loss')
plt.title('Training Loss & Validation Loss vs Epochs')
plt.legend()

plt.figure(2)

axes = plt.gca()
x_min = hist.epoch[0]
x_max = hist.epoch[-1]+1
axes.set_xlim([x_min,x_max])

plt.scatter(hist.epoch, hist.history['acc'], color='r')
plt.plot(hist.history['acc'], color='r', label='Training Accuracy')
plt.scatter(hist.epoch, hist.history['val_acc'], color='c')
plt.plot(hist.history['val_acc'], color='c', label='Validation Accuracy')
plt.xlabel('epochs')
plt.ylabel('Accuracy')
plt.title('Trainging Accuracy & Validation Accuracy vs Epochs')
plt.legend()

plt.show()