In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import metrics
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM, GRU
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from __future__ import print_function
from keras.layers.core import Activation, TimeDistributedDense, RepeatVector
from keras.layers import recurrent
import numpy as np

Using Theano backend.
Using gpu device 0: GeForce GTX 980 Ti (CNMeM is disabled, CuDNN 4007)


In [3]:
class CharacterTable(object):
    """One-hot encoder/decoder for sequences of integer token indices.

    Attributes:
        vocab: Collection of tokens; only its length is used, as the width
            of the one-hot dimension.
        maxlen: Default number of rows (time steps) in an encoded matrix.
    """

    def __init__(self, vocab, maxlen):
        self.vocab = vocab
        self.maxlen = maxlen

    def encode(self, C, maxlen=None):
        """One-hot encode a sequence of token indices.

        Args:
            C: Iterable of integer indices, each in ``[0, len(self.vocab))``.
            maxlen: Number of rows in the result; falls back to
                ``self.maxlen`` when omitted (or falsy).

        Returns:
            A ``(maxlen, len(self.vocab))`` float array in which row ``i``
            has a single 1 at column ``C[i]``. Rows past ``len(C)`` stay zero.

        Raises:
            IndexError: If ``C`` is longer than ``maxlen`` or an index falls
                outside the vocabulary.
        """
        maxlen = maxlen if maxlen else self.maxlen
        X = np.zeros((maxlen, len(self.vocab)))
        for i, c in enumerate(C):
            X[i, c] = 1
        return X

    def decode(self, X, calc_argmax=True):
        """Turn an encoded matrix back into a comma-separated index string.

        Args:
            X: One-hot (or probability) matrix when ``calc_argmax`` is true,
                otherwise an iterable of already-decoded indices.
            calc_argmax: Whether to reduce ``X`` with ``argmax`` over the
                last axis before formatting.

        Returns:
            The indices joined with ``','``, e.g. ``'1,4,0'``.
        """
        if calc_argmax:
            X = X.argmax(axis=-1)
        # str.join() only accepts strings; the indices are integers, so they
        # must be converted explicitly or join() raises TypeError.
        return ','.join(str(x) for x in X)
    
def generateRandSeq(min, max, len):
    """Build a list of ``len`` random integers.

    Each element comes from its own ``np.random.randint(min, max)`` call,
    so the global NumPy random state advances once per element.
    """
    sequence = []
    for _ in range(len):
        sequence.append(np.random.randint(min, max))
    return sequence

In [4]:
# Experiment configuration.
TRAINING_SIZE = 150000  # number of training sequences to generate
TEST_SIZE = 10000       # number of held-out test sequences to generate
DIGITS = 25             # tokens per generated sequence
MAXLEN = DIGITS         # time steps in each one-hot encoded sequence
# range() rather than xrange(): it yields the same list on Python 2 and does
# not raise NameError on Python 3.
voc = list(range(1000))
ctable = CharacterTable(voc, MAXLEN)

In [6]:
print('Generating data...')

# Training pairs: the target is the input with every token reduced modulo 200.
inputs = []
outputs = []
for sample_idx in range(TRAINING_SIZE):
    seq = generateRandSeq(0, len(voc), DIGITS)
    inputs.append(seq)
    outputs.append([token % 200 for token in seq])

# Test pairs, drawn the same way after the training pairs.
inputs_t = []
outputs_t = []
for sample_idx in range(TEST_SIZE):
    seq = generateRandSeq(0, len(voc), DIGITS)
    inputs_t.append(seq)
    outputs_t.append([token % 200 for token in seq])

# Show one input/target pair as a sanity check.
print(inputs[12])
print(outputs[12])

Generating data...
[829, 33, 548, 366, 280, 869, 26, 593, 809, 925, 622, 194, 14, 537, 493, 831, 691, 213, 378, 348, 980, 209, 907, 537, 132]
[29, 33, 148, 166, 80, 69, 26, 193, 9, 125, 22, 194, 14, 137, 93, 31, 91, 13, 178, 148, 180, 9, 107, 137, 132]


In [7]:
print('Vectorization...')


def _one_hot_batch(sequences):
    """One-hot encode every sequence with ``ctable`` into one boolean tensor.

    Returns an array of shape ``(len(sequences), MAXLEN, len(voc))``.
    """
    # Builtin bool instead of np.bool: np.bool was only an alias for it and
    # has been removed from recent NumPy releases.
    batch = np.zeros((len(sequences), MAXLEN, len(voc)), dtype=bool)
    for row, sentence in enumerate(sequences):
        batch[row] = ctable.encode(sentence, maxlen=MAXLEN)
    return batch


X = _one_hot_batch(inputs)
y = _one_hot_batch(outputs)
X_test = _one_hot_batch(inputs_t)
y_test = _one_hot_batch(outputs_t)

print(X.shape)
print(y.shape)
print(X_test.shape)
print(y_test.shape)

Vectorization...
(150000, 25, 1000)
(150000, 25, 1000)
(10000, 25, 1000)
(10000, 25, 1000)


In [8]:
# Model hyper-parameters.
# HIDDEN_SIZE: size passed to every LSTM layer below.
# BATCH_SIZE:  mini-batch size used by the fit/evaluate cells.
# LAYERS:      number of stacked LSTMs in the decoder; the encoder gets
#              1 + max(LAYERS - 2, 0) + 1 LSTMs, i.e. never fewer than two.
HIDDEN_SIZE = 256
BATCH_SIZE = 200
LAYERS = 2

print('Build model...')
model = Sequential()
# Encoder: reads the one-hot input of shape (MAXLEN, len(voc)). The first
# layer(s) pass full sequences on so further LSTMs can be stacked on top.
model.add(LSTM(HIDDEN_SIZE, input_shape=(MAXLEN, len(voc)), return_sequences=True))
for _ in range(LAYERS - 2):
    model.add(LSTM(HIDDEN_SIZE, return_sequences=True))

# Last encoder layer: return_sequences is left at its default here, so
# (presumably) only the final output vector is emitted -- confirm against the
# installed Keras version.
model.add(LSTM(HIDDEN_SIZE))
# Feed that single vector to the decoder once per output time step.
model.add(RepeatVector(MAXLEN))
# Decoder: LAYERS stacked LSTMs, each emitting one output per time step.
for _ in range(LAYERS):
    model.add(LSTM(HIDDEN_SIZE, return_sequences=True))

# Per-time-step projection onto the vocabulary followed by softmax.
# NOTE(review): TimeDistributedDense is deprecated in later Keras releases in
# favour of TimeDistributed(Dense(...)) -- confirm before upgrading Keras.
model.add(TimeDistributedDense(len(voc)))
model.add(Activation('softmax'))

model.compile(optimizer='RMSprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Build model...




In [9]:
# Train with early stopping on validation loss (patience of 6 epochs); the
# validation set is a 10% split taken from the training tensors.
early_stopping = EarlyStopping(patience=6, monitor='val_loss')
hist = model.fit(
    X, y,
    nb_epoch=150,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    shuffle=True,
    callbacks=[early_stopping]
)

# Report loss and accuracy on the held-out test tensors.
score, acc = model.evaluate(
    X_test, y_test,
    show_accuracy=True,
    batch_size=BATCH_SIZE
)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 135000 samples, validate on 15000 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
 13200/135000 [=>............................] - ETA: 407s - loss: 5.2697 - acc: 0.0060

KeyboardInterrupt: 

In [64]:
# Persist the architecture (JSON) and the weights (HDF5) as separate files.
json_string = model.to_json()
# Context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of an anonymous file object.
with open('lstm_100_128_10k_model.json', 'w') as model_file:
    model_file.write(json_string)
model.save_weights('lstm_100_128_10k_weights.h5')

In [67]:
from keras.models import model_from_json

# Rebuild the network from the saved JSON architecture, then restore weights.
# The file is read inside a context manager so the handle is always closed.
with open('lstm_100_128_10k_model.json') as model_file:
    model2 = model_from_json(model_file.read())
model2.load_weights('lstm_100_128_10k_weights.h5')

In [70]:
# The reloaded model must be compiled again before it can be trained.
model2.compile(
    loss='categorical_crossentropy',
    optimizer='RMSprop',
    metrics=['accuracy']
)

# Continue training for two more epochs with early stopping on validation
# loss (patience of 5 epochs) and a 10% validation split.
early_stopping = EarlyStopping(patience=5, monitor='val_loss')
hist = model2.fit(
    X, y,
    nb_epoch=2,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    shuffle=True,
    callbacks=[early_stopping]
)

# Report loss and accuracy of the reloaded model on the test tensors.
score, acc = model2.evaluate(
    X_test, y_test,
    show_accuracy=True,
    batch_size=BATCH_SIZE
)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 9000 samples, validate on 1000 samples
Epoch 1/2
Epoch 2/2
Test score: 1.28102155685
Test accuracy: 0.564908002615
