In [79]:
'''
Trains a Bidirectional LSTM on the IMDB sentiment classification task,
then a Conv1D + MaxPool1D + LSTM model on the same data.

Test accuracy after 4 epochs on CPU: ~0.8146
Time per epoch on CPU (Core i7): ~150s.
'''

import numpy as np

from keras.preprocessing import sequence
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Conv1D, MaxPool1D, Flatten
from keras.datasets import imdb

In [36]:
# Hyper-parameters shared by the cells below.
max_features = 20000  # vocabulary cap: only the most common words are kept
maxlen = 100          # tokens kept per review once the sequences are padded
batch_size = 32       # mini-batch size used for both fit() and evaluate()

print('Loading data...')
# Each split unpacks into (sequences, labels); `num_words` applies the
# vocabulary cap at load time.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Loading data...
25000 train sequences
25000 test sequences


In [17]:
# Word index shipped with the IMDB dataset; a later cell inverts it into `ddv`.
dd = imdb.get_word_index()

In [21]:
# Reverse lookup table built from `dd`: every value becomes a key and every
# key becomes its value.
# `.items()` replaces the Python-2-only `.iteritems()`, so this cell runs
# unchanged on both Python 2.7 and Python 3.
# NOTE(review): imdb.load_data() presumably offsets token ids (index_from=3 by
# default), so decoding x_train entries likely needs `ddv.get(i - 3)` — confirm
# against the Keras dataset docs before relying on it.
ddv = {index: word for word, index in dd.items()}

In [32]:
# Debug check: prints the module object so its repr shows which installed
# keras.preprocessing.sequence file is being used.
print(sequence)

<module 'keras.preprocessing.sequence' from '/usr/local/lib/python2.7/site-packages/keras/preprocessing/sequence.pyc'>


In [37]:
print('Pad sequences (samples x time)')
# Bring every training review to exactly `maxlen` tokens. The padding and
# truncating keywords spell out the library defaults ('pre'): short reviews
# are left-padded and long reviews keep their LAST `maxlen` tokens.
x_train = sequence.pad_sequences(x_train, maxlen=maxlen,
                                 padding='pre', truncating='pre')

Pad sequences (samples x time)
Before [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
After [ 14

In [38]:
# Bring every test review to exactly `maxlen` tokens. The padding and
# truncating keywords spell out the library defaults ('pre'): short reviews
# are left-padded and long reviews keep their LAST `maxlen` tokens.
x_test = sequence.pad_sequences(x_test, maxlen=maxlen,
                                padding='pre', truncating='pre')

In [39]:
# Sanity check after padding: report the shape of each padded split.
for _split_name, _split in (('x_train', x_train), ('x_test', x_test)):
    print(_split_name + ' shape:', _split.shape)

x_train shape: (25000, 100)
x_test shape: (25000, 100)


In [41]:
# Make sure both label collections are NumPy arrays before they reach Keras.
y_train, y_test = np.array(y_train), np.array(y_test)

In [42]:
# Bare expression used for inspection: the cell output shows the fully
# qualified class behind the imported `Embedding` name.
Embedding

keras.layers.embeddings.Embedding

### Bidirectional LSTM

In [46]:
# Architecture: Embedding -> Bidirectional LSTM -> Dropout -> single sigmoid unit.
model = Sequential()
# Embedding arguments:
#   max_features -- vocabulary size
#   128          -- length of the vector space each word is embedded in
#   input_length -- length of each document (maxlen tokens)
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
# One sigmoid output, matching the binary cross-entropy loss below.
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print('Train...')
# validation_data is passed as a tuple (x_val, y_val), the form documented by
# Keras; a list is not reliably unpacked as (x, y) by newer releases.
# NOTE: the test split doubles as validation data here, so the per-epoch
# validation metrics are computed on the same data that is evaluated later.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=(x_test, y_test))

Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x135de4ed0>

In [48]:
# Hard 0/1 predictions for the test reviews.
# `Sequential.predict_classes` is deprecated and absent from newer
# Keras/TensorFlow releases. For a single sigmoid output it is equivalent to
# thresholding the predicted probability at 0.5, which works on old and new
# versions alike. verbose=0 keeps the call silent, as predict_classes was.
y_pred = (model.predict(x_test, verbose=0) > 0.5).astype('int32')

In [60]:
# `model` was compiled with a binary cross-entropy loss and
# metrics=['accuracy'], so evaluate() yields two values: the loss ("score")
# followed by the accuracy.
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.6197509709310531
Test accuracy: 0.8338


In [84]:
# Second architecture: Embedding -> Conv1D -> MaxPool1D -> LSTM -> sigmoid unit.
# The print() calls trace how the tensor shape changes as layers are added.
model2 = Sequential()
model2.add(Embedding(max_features, 128, input_length=maxlen))
print(model2.input_shape)
print(model2.output_shape)
# 20 filters of width 5 with 'valid' padding and stride 1: the time axis
# shrinks by kernel_size - 1 steps.
model2.add(Conv1D(filters=20,
                  kernel_size=5,
                  padding='valid',
                  activation='relu',
                  strides=1))
print(model2.output_shape)
# Pooling over windows of 4 shortens the sequence the LSTM has to process.
model2.add(MaxPool1D(pool_size=4))
print(model2.output_shape)
model2.add(LSTM(20))
model2.add(Dense(1, activation='sigmoid'))
print(model2.output_shape)

# try using different optimizers and different optimizer configs
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print('Train...')
# validation_data is passed as a tuple (x_val, y_val), the form documented by
# Keras; a list is not reliably unpacked as (x, y) by newer releases.
# NOTE: the test split doubles as validation data here, so the per-epoch
# validation metrics are computed on the same data that is evaluated later.
model2.fit(x_train, y_train,
           batch_size=batch_size,
           epochs=4,
           validation_data=(x_test, y_test))

(None, 100)
(None, 100, 128)
(None, 96, 20)
(None, 24, 20)
(None, 1)
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x12dd1bfd0>

In [85]:
# `model2` was compiled with a binary cross-entropy loss and
# metrics=['accuracy'], so evaluate() yields two values: the loss ("score")
# followed by the accuracy. Note that this rebinds `score` and `acc`.
score, acc = model2.evaluate(x=x_test, y=y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.6511905290961265
Test accuracy: 0.83524
