# LSTM IMDB Movie Review Tutorial
Josiah Olson

In [134]:
from __future__ import print_function
import time
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.models import Model, Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge, BatchNormalization,GRU
from keras.datasets import imdb

import os
from keras.preprocessing.text import Tokenizer

In [135]:
max_features =10000
max_len = 50  # cut texts after this number of words (among top max_features most common words)

In [136]:
# get dataset and unzip: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

X_train = []
y_train = []

path = 'C:/1_Research/Create_data/aclImdb/train_sukien55/pos/'
X_train.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
y_train.extend([1 for _ in range(1499)])

path = 'C:/1_Research/Create_data/aclImdb/train_sukien55/neg/'
X_train.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
y_train.extend([0 for _ in range(587)])

print('x:')
print(X_train[:1])
print(X_train[-1:])
print(len(X_train))
print('y:')
print(y_train[:1])
print(y_train[-1:])
print(len(y_train))

x:
['\xef\xbb\xbfABI: Ng\xc3\xa0y GDKHQ T\xe1\xbb\x95 ch\xe1\xbb\xa9c \xc4\x90\xe1\xba\xa1i h\xe1\xbb\x99i \xc4\x91\xe1\xbb\x93ng c\xe1\xbb\x95 \xc4\x91\xc3\xb4ng th\xc6\xb0\xe1\xbb\x9dng ni\xc3\xaan n\xc4\x83m 2014 v\xc3\xa0 T\xe1\xba\xa1m \xe1\xbb\xa9ng c\xe1\xbb\x95 t\xe1\xbb\xa9c n\xc4\x83m 2013 b\xe1\xba\xb1ng ti\xe1\xbb\x81n (10%)\n']
['\xef\xbb\xbfNPS: Ng\xc3\xa0y GDKHQ \xc4\x90\xe1\xba\xa1i h\xe1\xbb\x99i c\xe1\xbb\x95 \xc4\x91\xc3\xb4ng th\xc6\xb0\xe1\xbb\x9dng ni\xc3\xaan n\xc4\x83m 2014\n']
2086
y:
[1]
[0]
2086


In [137]:
# read in the test data

X_test = []
y_test = []

path = 'C:/1_Research/Create_data/aclImdb/test_sukien55/pos/'
X_test.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
y_test.extend([1 for _ in range(1498)])

path = 'C:/1_Research/Create_data/aclImdb/test_sukien55/neg/'
X_test.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
y_test.extend([0 for _ in range(586)])

print('x:')
print(X_test[:1])
print(X_test[-1:])
print(len(X_test))
print('y:')
print(y_test[:1])
print(y_test[-1:])
print(len(y_test))

x:
['\xef\xbb\xbfLCC: Ng\xc3\xa0y  GDKHQ T\xe1\xbb\x95 ch\xe1\xbb\xa9c \xc4\x90\xe1\xba\xa1i h\xe1\xbb\x99i \xc4\x91\xe1\xbb\x93ng c\xe1\xbb\x95 \xc4\x91\xc3\xb4ng n\xc4\x83m 2014\n']
['\xef\xbb\xbfYBC: Ng\xc3\xa0y GDKHQ tham d\xe1\xbb\xb1 \xc4\x90\xe1\xba\xa1i h\xe1\xbb\x99i \xc4\x91\xe1\xbb\x93ng c\xe1\xbb\x95 \xc4\x91\xc3\xb4ng th\xc6\xb0\xe1\xbb\x9dng ni\xc3\xaan 2016\n']
2084
y:
[1]
[0]
2084


In [138]:
#tokenize works to list of integers where each integer is a key to a word
imdbTokenizer = Tokenizer(nb_words=max_features)

imdbTokenizer.fit_on_texts(X_train)

In [139]:
#print top 20 words 
#note zero is reserved for non frequent words
for word, value in imdbTokenizer.word_index.items():
    if value < 20:
        print(value, word)

5 đông
3 gdkhq
8 Đại
4 năm
19 dự
12 tức
15 2015
13 2014
1 cổ
6 thường
7 hội
2 ngày
10 đồng
16 chức
18 trả
17 tổ
14 tiền
11 bằng
9 niên


In [140]:
#create int to word dictionary
intToWord = {}
for word, value in imdbTokenizer.word_index.items():
    intToWord[value] = word

#add a symbol for null placeholder
intToWord[0] = "!!!NA!!!"
    
print(intToWord[1])
print(intToWord[2])
print(intToWord[32])

cổ
ngày
10


In [141]:
#convert word strings to integer sequence lists
print(X_train[0])
print(imdbTokenizer.texts_to_sequences(X_train[:1]))
for value in imdbTokenizer.texts_to_sequences(X_train[:1])[0]:
    print(intToWord[value])
    
X_train = imdbTokenizer.texts_to_sequences(X_train)
X_test = imdbTokenizer.texts_to_sequences(X_test)

﻿ABI: Ngày GDKHQ Tổ chức Đại hội đồng cổ đông thường niên năm 2014 và Tạm ứng cổ tức năm 2013 bằng tiền (10%)

[[578, 2, 3, 17, 16, 8, 7, 10, 1, 5, 6, 9, 4, 13, 27, 24, 23, 1, 12, 4, 26, 11, 14, 32]]
﻿abi
ngày
gdkhq
tổ
chức
Đại
hội
đồng
cổ
đông
thường
niên
năm
2014
và
tạm
ứng
cổ
tức
năm
2013
bằng
tiền
10


In [142]:
# Censor the data by having a max review length (in number of words)

#use this function to load data from keras pickle instead of munging as shown above
#(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features,
#                                                      test_split=0.2)

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print("Pad sequences (samples x time)")
X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)

2086 train sequences
2084 test sequences
Pad sequences (samples x time)
X_train shape: (2086L, 50L)
X_test shape: (2084L, 50L)


In [143]:
#example of a sentence sequence, note that lower integers are words that occur more commonly
print("x:", X_train[0]) #per observation vector of 20000 words
print("y:", y_train[0]) #positive or negative review encoding

x: [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0 578   2   3  17  16   8   7  10   1   5
   6   9   4  13  27  24  23   1  12   4  26  11  14  32]
y: 1


In [144]:
# double check that word sequences behave/final dimensions are as expected
print("y distribution:", np.unique(y_train, return_counts=True))
print("max x word:", np.max(X_train), "; min x word", np.min(X_train))
print("y distribution test:", np.unique(y_test, return_counts=True))
print("max x word test:", np.max(X_test), "; min x word", np.min(X_test))

y distribution: (array([0, 1]), array([ 587, 1499], dtype=int64))
max x word: 895 ; min x word 0
y distribution test: (array([0, 1]), array([ 586, 1498], dtype=int64))
max x word test: 866 ; min x word 0


In [145]:
print("most and least popular words: ")
print(np.unique(X_train, return_counts=True))
# as expected zero is the highly used word for words not in index

most and least popular words: 
(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 

In [146]:
#set model hyper parameters
epochs = 50
embedding_neurons = 500
lstm_neurons = 100
batch_size =100

In [147]:
# Forward Pass LSTM Network

# this is the placeholder tensor for the input sequences
sequence = Input(shape=(max_len,), dtype='int32')
# this embedding layer will transform the sequences of integers
# into vectors of size embedding
# embedding layer converts dense int input to one-hot in real time to save memory
embedded = Embedding(max_features, embedding_neurons, input_length=max_len)(sequence)
# normalize embeddings by input/word in sentence
bnorm = BatchNormalization()(embedded)

# apply forwards LSTM layer size lstm_neurons
forwards = LSTM(lstm_neurons, dropout_W=0.2, dropout_U=0.2)(bnorm)

# dropout 
after_dp = Dropout(0.5)(forwards)
output = Dense(1, activation='sigmoid')(after_dp)

model_fdir_atom = Model(input=sequence, output=output)
# review model structure
print(model_fdir_atom.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_13 (InputLayer)            (None, 50)            0                                            
____________________________________________________________________________________________________
embedding_13 (Embedding)         (None, 50, 500)       5000000     input_13[0][0]                   
____________________________________________________________________________________________________
batchnormalization_13 (BatchNorma(None, 50, 500)       1000        embedding_13[0][0]               
____________________________________________________________________________________________________
lstm_10 (LSTM)                   (None, 100)           240400      batchnormalization_13[0][0]      
___________________________________________________________________________________________

In [148]:
# Forward pass LSTM network

# try using different optimizers and different optimizer configs
model_fdir_atom.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
start_time = time.time()

history_fdir_atom = model_fdir_atom.fit(X_train, y_train,
                    batch_size=batch_size,
                    nb_epoch=epochs,
                    validation_data=[X_test, y_test], 
                    verbose=2)

end_time = time.time()
average_time_per_epoch = (end_time - start_time) / epochs
print("avg sec per epoch:", average_time_per_epoch)

Train...
Train on 2086 samples, validate on 2084 samples
Epoch 1/50
23s - loss: 0.6537 - acc: 0.6903 - val_loss: 0.5980 - val_acc: 0.7188
Epoch 2/50
24s - loss: 0.5957 - acc: 0.7215 - val_loss: 0.6082 - val_acc: 0.7145
Epoch 3/50
23s - loss: 0.5884 - acc: 0.7210 - val_loss: 0.6085 - val_acc: 0.7145
Epoch 4/50
24s - loss: 0.5883 - acc: 0.7196 - val_loss: 0.6038 - val_acc: 0.7135
Epoch 5/50
24s - loss: 0.5666 - acc: 0.7315 - val_loss: 0.6113 - val_acc: 0.7087
Epoch 6/50
24s - loss: 0.5298 - acc: 0.7464 - val_loss: 0.6316 - val_acc: 0.7063
Epoch 7/50
26s - loss: 0.4881 - acc: 0.7733 - val_loss: 0.6343 - val_acc: 0.7063
Epoch 8/50
28s - loss: 0.4416 - acc: 0.8054 - val_loss: 0.6700 - val_acc: 0.6166
Epoch 9/50
24s - loss: 0.3766 - acc: 0.8418 - val_loss: 0.7345 - val_acc: 0.4328
Epoch 10/50
25s - loss: 0.3181 - acc: 0.8663 - val_loss: 0.7543 - val_acc: 0.4434
Epoch 11/50
25s - loss: 0.2889 - acc: 0.8792 - val_loss: 0.7389 - val_acc: 0.5547
Epoch 12/50
26s - loss: 0.2782 - acc: 0.8840 - val

In [149]:
# Bi-directional Atom

# based on keras tutorial: https://github.com/fchollet/keras/blob/master/examples/imdb_bidirectional_lstm.py

# this is the placeholder tensor for the input sequences
sequence = Input(shape=(max_len,), dtype='int32')
# this embedding layer will transform the sequences of integers
# into vectors of size embedding
# embedding layer converts dense int input to one-hot in real time to save memory
embedded = Embedding(max_features, embedding_neurons, input_length=max_len)(sequence)
# normalize embeddings by input/word in sentence
bnorm = BatchNormalization()(embedded)

# apply forwards LSTM layer size lstm_neurons
forwards = GRU(lstm_neurons, dropout_W=0.4, dropout_U=0.4)(bnorm)
# apply backwards LSTM
backwards = GRU(lstm_neurons, dropout_W=0.4, dropout_U=0.4, go_backwards=True)(bnorm)

# concatenate the outputs of the 2 LSTMs
merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
after_dp = Dropout(0.5)(merged)
output = Dense(1, activation='sigmoid')(after_dp)

model_bidir_atom = Model(input=sequence, output=output)
# review model structure
print(model_bidir_atom.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_14 (InputLayer)            (None, 50)            0                                            
____________________________________________________________________________________________________
embedding_14 (Embedding)         (None, 50, 500)       5000000     input_14[0][0]                   
____________________________________________________________________________________________________
batchnormalization_14 (BatchNorma(None, 50, 500)       1000        embedding_14[0][0]               
____________________________________________________________________________________________________
gru_7 (GRU)                      (None, 100)           180300      batchnormalization_14[0][0]      
___________________________________________________________________________________________

In [150]:
# Bi-directional Atom

# try using different optimizers and different optimizer configs
model_bidir_atom.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
start_time = time.time()

history_bidir_atom = model_bidir_atom.fit(X_train, y_train,
                    batch_size=batch_size,
                    nb_epoch=epochs,
                    validation_data=[X_test, y_test], 
                    verbose=2)

end_time = time.time()
average_time_per_epoch = (end_time - start_time) / epochs
print("avg sec per epoch:", average_time_per_epoch)

Train...
Train on 2086 samples, validate on 2084 samples
Epoch 1/50
24s - loss: 0.7475 - acc: 0.6496 - val_loss: 0.6160 - val_acc: 0.7179
Epoch 2/50
24s - loss: 0.6852 - acc: 0.6716 - val_loss: 0.6185 - val_acc: 0.7116
Epoch 3/50
24s - loss: 0.6529 - acc: 0.6779 - val_loss: 0.6116 - val_acc: 0.7073
Epoch 4/50
24s - loss: 0.5700 - acc: 0.7258 - val_loss: 0.7040 - val_acc: 0.7164
Epoch 5/50
25s - loss: 0.4870 - acc: 0.7723 - val_loss: 0.6169 - val_acc: 0.7039
Epoch 6/50
24s - loss: 0.4164 - acc: 0.8154 - val_loss: 0.6212 - val_acc: 0.7102
Epoch 7/50
25s - loss: 0.3553 - acc: 0.8452 - val_loss: 0.6796 - val_acc: 0.7150
Epoch 8/50
25s - loss: 0.3265 - acc: 0.8615 - val_loss: 0.7790 - val_acc: 0.7169
Epoch 9/50
25s - loss: 0.3144 - acc: 0.8663 - val_loss: 0.7623 - val_acc: 0.7155
Epoch 10/50
25s - loss: 0.2865 - acc: 0.8754 - val_loss: 0.7952 - val_acc: 0.7145
Epoch 11/50
24s - loss: 0.2938 - acc: 0.8682 - val_loss: 0.7378 - val_acc: 0.7121
Epoch 12/50
24s - loss: 0.2746 - acc: 0.8763 - val