In [1]:
import os

# TF_CPP_MIN_LOG_LEVEL has to be in the environment *before* TensorFlow/Keras
# is imported; set afterwards, the native start-up log messages have already
# been emitted. '2' hides INFO and WARNING messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.utils import set_random_seed
from keras.utils.data_utils import pad_sequences


# fix random seed for reproducibility
# np.random.seed alone leaves TensorFlow's weight initialisation unseeded;
# set_random_seed seeds Python's `random`, NumPy and TensorFlow together.
SEED = 7
set_random_seed(SEED)

In [2]:
print('only use the 1000 most frequent words')

TOP_WORDS = 1000  # vocabulary size: keep only the 1000 most frequent words
INDEX_FROM = 3    # offset added to every raw word index (ids 0-2 are reserved below)

# Each split comes back as an (inputs, labels) pair.
train, test = imdb.load_data(num_words=TOP_WORDS, index_from=INDEX_FROM)
(X_train, y_train), (X_test, y_test) = train, test

# Reserved ids for the special tokens, added after the shifted word indices.
SPECIAL_TOKENS = {"<PAD>": 0, "<START>": 1, "<UNK>": 2}

# word -> id, shifted by INDEX_FROM so it lines up with the ids used in the data.
word_to_id = {
    word: raw_index + INDEX_FROM
    for word, raw_index in imdb.get_word_index().items()
}
word_to_id.update(SPECIAL_TOKENS)

# Inverse lookup (id -> word), used to turn encoded reviews back into text.
id_to_word = {token_id: word for word, token_id in word_to_id.items()}


   

only use the 1000 most frequent words


In [3]:
# Show one lookup in each direction as a sanity check of the two mappings.
print('examples on mapping word and id')
print('id-----word: 4 -----', id_to_word[4], sep='')
print('word-----id: casting -----', word_to_id['casting'], sep='')

examples on mapping word and id
id-----word: 4 -----the
word-----id: casting -----973


In [4]:
# Bring every review to exactly this many tokens so the splits can be fed to
# the network as fixed-width inputs. pad_sequences is used with its default
# padding/truncating behaviour.
max_review_length = 128
X_train, X_test = (
    pad_sequences(X_train, maxlen=max_review_length),
    pad_sequences(X_test, maxlen=max_review_length),
)

In [5]:
# create the model: Embedding -> LSTM -> single sigmoid unit (binary sentiment)
embedding_vecor_length = 32  # embedding dimension per token (original spelling kept: it is a notebook-level name)
num_LSTM = 128               # number of LSTM units

model = Sequential([
    # One embedding row per vocabulary id; inputs are max_review_length tokens long.
    Embedding(TOP_WORDS, embedding_vecor_length, input_length=max_review_length),
    LSTM(num_LSTM),
    # Sigmoid output is read as the probability of the positive class.
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

# Keep the History object so the per-epoch metrics stay available afterwards;
# the last 20% of the training data is held out for validation.
history = model.fit(X_train, y_train, epochs=2, batch_size=256,
                    validation_split=0.2)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 32)           32000     
                                                                 
 lstm (LSTM)                 (None, 128)               82432     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 114,561
Trainable params: 114,561
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x28c74fdc0>

In [6]:
# Evaluate on the held-out test split; with a single compiled metric the
# result is [loss, accuracy].
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score
print('Summary: Loss over the test dataset: %.2f, Accuracy: %.2f' % (test_loss, test_accuracy))

Summary: Loss over the test dataset: 0.38, Accuracy: 0.83


In [7]:
def printing_lstm_pred_prob(X):
    """Print the model's predicted probability, in percent, that X is positive.

    X is reshaped to a single row of shape (1, -1) so it is scored as one
    sample by the notebook-level `model`.
    """
    single_sample = np.reshape(X, (1, -1))
    prediction = model.predict(single_sample, batch_size=1)
    percent_positive = np.round(prediction[0, 0] * 100)
    print('-------- LSTM prediction: probability of being positive:  ' + str(percent_positive) + '%')
def printing_id2word(X):
    """Print the ids in X as space-separated words, looked up in `id_to_word`."""
    words = [id_to_word[token_id] for token_id in X]
    print(' '.join(words))
     

In [8]:
print('sentiment analysis on IMDB movie reviews')
# Walk through the first ten test reviews: decoded text, model prediction,
# then the true label.
for review, label in zip(X_test[:10], y_test[:10]):
    print('*')
    printing_id2word(review)
    printing_lstm_pred_prob(review)
    print('-------- ground truth:' + str(label))
    
    

sentiment analysis on IMDB movie reviews
*
<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <START> please give this one a miss br br <UNK> <UNK> and the rest of the cast <UNK> terrible performances the show is <UNK> <UNK> <UNK> br br i don't know how michael <UNK> could have <UNK> this one on his <UNK> he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you <UNK> fans give this a miss
-------- LSTM prediction: probability of being positive:  41.0%
-------- ground truth:0
*
young man <UNK> <UNK> <UNK> michael <UNK> has a small part the <UNK> <UNK> set <UNK> the <UNK> of the story very well in short this movie is a powerful <UNK> of <UN