In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import metrics
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding
from keras.layers import LSTM, GRU
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from __future__ import print_function
from keras.layers.core import Activation, TimeDistributedDense, RepeatVector
from keras.layers import recurrent
import numpy as np

Using Theano backend.
Using gpu device 0: GeForce GTX 980 Ti (CNMeM is disabled, CuDNN 4007)


In [2]:
class CharacterTable(object):
    """One-hot encoder/decoder for sequences of integer vocabulary indices.

    vocab  -- the vocabulary; only its length is used (width of a one-hot row).
    maxlen -- default number of rows produced by `encode`.
    """

    def __init__(self, vocab, maxlen):
        self.vocab = vocab
        self.maxlen = maxlen

    def encode(self, C, maxlen=None):
        """Return a (maxlen, len(vocab)) float matrix with one row per item of C.

        C is an iterable of integer column indices; row i gets a 1 in column
        C[i].  Rows beyond len(C) stay all-zero.  A falsy `maxlen` (None or 0)
        falls back to the table's own maxlen.  Raises IndexError when C is
        longer than maxlen or contains an index outside the vocabulary width.
        """
        maxlen = maxlen if maxlen else self.maxlen
        X = np.zeros((maxlen, len(self.vocab)))
        for i, c in enumerate(C):
            X[i, c] = 1
        return X

    def decode(self, X, calc_argmax=True):
        """Return the indices in X as a comma-separated string.

        With calc_argmax=True, X is a one-hot / probability array and the
        argmax over its last axis is taken first; otherwise X is already an
        iterable of indices.
        """
        if calc_argmax:
            X = X.argmax(axis=-1)
        # str.join only accepts strings, so each integer index is converted.
        return ','.join(str(x) for x in X)
    
def generateRandSeq(min, max, len):
    """Build a list of `len` random integers drawn from [min, max).

    Every element comes from its own np.random.randint(min, max) call, so the
    result depends on (and advances) NumPy's global random state.
    Note that the parameter names shadow the builtins inside this function.
    """
    seq = []
    for _ in range(len):
        seq.append(np.random.randint(min, max))
    return seq

In [6]:
# Number of generated training / test sequences.
TRAINING_SIZE = 150000
TEST_SIZE = 10000
# Every sequence has exactly DIGITS symbols, so DIGITS is also the sequence
# length the model is built for.
DIGITS = 25
MAXLEN = DIGITS
# Vocabulary: the integers 0..9.  range() is used instead of the
# Python-2-only xrange so the cell runs on Python 2 and 3 alike.
voc = list(range(10))
ctable = CharacterTable(voc, MAXLEN)

In [7]:
print('Generating data...')

# Training pairs: a random sequence and the same sequence sorted ascending.
# (Using s[::-1] as the target instead would turn this into a reversal task.)
inputs = []
outputs = []
for _ in range(TRAINING_SIZE):
    s = generateRandSeq(0, len(voc), DIGITS)
    inputs.append(s)
    outputs.append(sorted(s))

# Held-out test pairs, generated the same way after the training pairs.
inputs_t = []
outputs_t = []
for _ in range(TEST_SIZE):
    s = generateRandSeq(0, len(voc), DIGITS)
    inputs_t.append(s)
    outputs_t.append(sorted(s))

# Show one example pair as a sanity check.
print(inputs[12])
print(outputs[12])

Generating data...
[3, 2, 9, 7, 8, 1, 9, 5, 2, 6, 6, 9, 6, 7, 4, 6, 2, 9, 9, 9, 3, 9, 1, 1, 1]
[1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 6, 6, 6, 6, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9]


In [8]:
def _vectorize(seqs_in, seqs_out):
    """Turn paired lists of index sequences into model-ready arrays.

    Returns (X_arr, y_arr):
    X_arr -- int32 array of shape (len(seqs_in), MAXLEN) holding the raw
             indices (the Embedding layer consumes indices directly).
    y_arr -- boolean array of shape (len(seqs_out), MAXLEN, len(voc)) holding
             the one-hot encoding of every target sequence, as produced by
             ctable.encode.
    """
    X_arr = np.zeros((len(seqs_in), MAXLEN), dtype=np.int32)
    for i, seq in enumerate(seqs_in):
        X_arr[i] = seq

    # Builtin bool instead of np.bool: the np.bool alias was removed from
    # NumPy (1.24) and was only ever an alias for the builtin.
    y_arr = np.zeros((len(seqs_out), MAXLEN, len(voc)), dtype=bool)
    for i, seq in enumerate(seqs_out):
        y_arr[i] = ctable.encode(seq, maxlen=MAXLEN)
    return X_arr, y_arr


print('Vectorization...')
X, y = _vectorize(inputs, outputs)
X_test, y_test = _vectorize(inputs_t, outputs_t)

print(X.shape)
print(y.shape)
print(X_test.shape)
print(y_test.shape)

Vectorization...
(150000, 25)
(150000, 25, 10)
(10000, 25)
(10000, 25, 10)


In [9]:
HIDDEN_SIZE = 256
BATCH_SIZE = 200
LAYERS = 2
# Width of the embedding vectors and of the tanh projection that precedes the
# softmax (both were a bare 300 before).
EMBEDDING_SIZE = 300
PROJECTION_SIZE = 300
'''
Hey guys, I also met this problem and I found this thread. Basically, 
the error info can happen when the dimension of the input data (X_train or Y_train) doesn't match with the 
model's input shape.

In my case (and @LeavesBreathe 's case I guess), the problem is that 
the model is expecting the Y_train to be a 3d tensor. Because of the embedding layer, 
the 2d tensor X_train of size (n_batch, sequence_length) will be eventually converted to a 3d tensor of size 
(n_batch, sequence_length, embedding_size) and will be processed by the succeeding LSTM layer. However, 
the 2d tensor Y_train of size (n_sample, sequence_length) is not converted to 3d, 
which is needed by the decoder LSTM.

To fix this problem, what I did is to convert Y_train into a 3d binary tensor (binary one-hot coding) and it worked.
'''
print('Build model...')
model = Sequential()
model.add(Embedding(len(voc), EMBEDDING_SIZE, input_length = MAXLEN))

# Encoder: exactly LAYERS stacked LSTMs.  All but the last hand the full
# sequence to the next layer; the last one emits a single summary vector.
# (The previous "one fixed layer + range(LAYERS - 2) + one fixed layer" form
# built two encoder layers when LAYERS == 1.)
for _ in range(LAYERS - 1):
    model.add(LSTM(HIDDEN_SIZE, return_sequences=True))
model.add(LSTM(HIDDEN_SIZE))

# Decoder: feed the summary vector once per output position, then LAYERS
# stacked LSTMs that each return the full sequence.
model.add(RepeatVector(MAXLEN))
for _ in range(LAYERS):
    model.add(LSTM(HIDDEN_SIZE, return_sequences=True))

# Per-position classifier: tanh projection followed by a softmax over the vocabulary.
model.add(TimeDistributedDense(input_dim=HIDDEN_SIZE, output_dim=PROJECTION_SIZE))
model.add(Activation('tanh'))
model.add(TimeDistributedDense(input_dim=PROJECTION_SIZE, output_dim=len(voc)))
model.add(Activation('softmax'))

model.compile(optimizer='RMSprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Build model...




In [None]:
# Stop training once the validation loss has not improved for 5 epochs in a row.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
# 10% of the training data is held out for validation (validation_split=0.1).
hist = model.fit(X, y, batch_size=BATCH_SIZE, nb_epoch=150,
                 callbacks=[early_stopping],
                 validation_split=0.1, shuffle=True)

# The model is compiled with metrics=['accuracy'], so evaluate() already
# returns [loss, accuracy]; the legacy show_accuracy flag is deprecated in
# Keras 1.x and rejected by later versions, so it is not passed.
score, acc = model.evaluate(X_test, y_test,
                            batch_size=BATCH_SIZE)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 135000 samples, validate on 15000 samples
Epoch 1/150

In [32]:
from keras import backend as K
def get_activations(model, layer, X_batch):
    """Run X_batch through `model` and return the output of layer number `layer`.

    Builds a backend function from the first layer's input (plus the
    learning-phase flag) to the requested layer's output, and calls it with a
    learning-phase value of 0.  The result is whatever that backend function
    returns: a list containing the single requested output.
    """
    graph_inputs = [model.layers[0].input, K.learning_phase()]
    graph_outputs = [model.layers[layer].output]
    run_graph = K.function(graph_inputs, graph_outputs)
    return run_graph([X_batch, 0])

In [36]:
# A single batch row containing every vocabulary index once, zero-padded up
# to the fixed input length MAXLEN the model was built with.
X_voc = np.zeros((1, MAXLEN), dtype=np.int32)
# list(range(...)) keeps the concatenation valid on Python 3 as well, and the
# sizes follow voc / MAXLEN instead of the hard-coded 10 and 15.
X_voc[0] = list(range(len(voc))) + [0] * (MAXLEN - len(voc))
X_voc

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
        67, 68, 69, 70, 71, 72, 73, 74],
       [75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
        92, 93, 94, 95, 96, 97, 98, 99]], dtype=int32)

In [40]:
# Inspect the first layer of the model and its symbolic output.
embedding_layer = model.layers[0]
print(embedding_layer)
print(embedding_layer.output)

# get_activations returns a list of outputs; only one output was requested,
# so keep its first (and only) element.
activations = get_activations(model, 0, X_voc)
embeddings = activations[0]

<keras.layers.embeddings.Embedding object at 0x7f0cdb3d2fd0>
Reshape{3}.0


In [1]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition
from sklearn import datasets

# embeddings[0] is the single batch row: one embedding vector per input
# position.  Only the first len(voc) positions hold distinct vocabulary
# indices; the remaining positions are zero padding, i.e. repeated copies of
# index 0's vector.  Restrict the PCA to the real vocabulary entries so the
# duplicates neither skew the projection nor show up as extra points.
vocab_embeddings = embeddings[0][:len(voc)]

# Project the embedding vectors onto their first two principal components.
pca = decomposition.PCA(n_components=2)
pca.fit(vocab_embeddings)
V = pca.transform(vocab_embeddings)



NameError: name 'embeddings' is not defined

In [None]:

# Scatter the 2-D projection and label every point with its row index in V.
fig, ax = plt.subplots()
ax.plot(V[:, 0], V[:, 1], "o")
# Fixed viewport: x and y both limited to [-1.2, 1.2].
ax.axis([-1.2, 1.2, -1.2, 1.2])
for idx, point in enumerate(V):
    ax.annotate(idx, xy=point, textcoords='data')

plt.show()