# Test model

In [27]:
# Pipe-separated CSV holding the held-out reviews used for evaluation.
TEST_DATASET = "data/test_imdb.csv"

# Path prefix of the saved model; ".json" (architecture) and ".hdf5" (weights)
# are appended to it when the model is restored.
MODEL_NAME = "models/lstm_attention"
# Pickled tokenizer that is loaded alongside the model.
TOKENIZER_NAME = "models/lstm_attention_tokenizer"

---

In [28]:
import nltk
import numpy
import pickle
numpy.random.seed(42)

from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from itertools import chain
from collections import Counter


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import model_from_json

Switch on full-text mode (do not truncate long column values when displaying DataFrames).

In [29]:
# Show complete cell contents instead of truncating long strings.
# pandas >= 1.0 spells "no limit" as None and deprecates the legacy -1 sentinel;
# older releases only accept an integer, so fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

## Load data

In [30]:
# Read the evaluation set; fields are separated by "|" rather than commas.
# Later cells read its 'text' and 'label' columns.
test_data = pd.read_csv(TEST_DATASET, delimiter="|")

## Preprocessing

In [31]:
# Every review is padded/truncated to this many token indices before prediction.
MAX_SEQUENCE_LENGTH = 100
# Not referenced by the cells below; presumably the vocabulary cap used when the
# tokenizer/model were trained -- TODO confirm against the training notebook.
MAX_NB_WORDS = 20000

In [32]:
from nltk.tokenize import TweetTokenizer
# Single shared tokenizer instance; preprocess() uses it to split text into tokens.
tweet_tokenizer = TweetTokenizer()

# Maps each negative contraction to a variant whose apostrophe is replaced by
# "_`_" (e.g. "don't" -> "don_`_t"). preprocess() applies these substitutions
# after tokenizing; presumably this keeps each contraction recognisable as one
# unit downstream -- TODO confirm against the training pipeline.
negatives = {
    contraction: contraction.replace("'", "_`_")
    for contraction in (
        "didn't",
        "couldn't",
        "can't",
        "don't",
        "wouldn't",
        "doesn't",
        "wasn't",
        "weren't",
        "shouldn't",
        "isn't",
        "aren't",
    )
}

def preprocess(text):
    """Normalise one raw review string.

    Steps, in order: lowercase, replace HTML ``<br />`` tags with a space,
    tokenize with ``tweet_tokenizer`` and re-join the tokens with single
    spaces, then substitute every contraction listed in ``negatives``.

    Returns the normalised string.
    """
    cleaned = text.lower().replace('<br />', ' ')
    cleaned = ' '.join(tweet_tokenizer.tokenize(cleaned))
    for contraction in negatives:
        cleaned = cleaned.replace(contraction, negatives[contraction])
    return cleaned

In [33]:
# Add a normalised copy of every review next to the raw text.
test_data['prep_text'] = test_data['text'].apply(preprocess)

#### 1. String -> Int vector

In [34]:
 #load tokenizer
# NOTE(security): pickle.load can execute arbitrary code from the file, so only
# load tokenizer files that come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open(TOKENIZER_NAME, 'rb') as pkl_file:
    tokenizer = pickle.load(pkl_file)

#### 2. Padding

In [35]:
# Convert every preprocessed review into a sequence of vocabulary indices
# using the tokenizer loaded above.
sequences_test = tokenizer.texts_to_sequences(test_data['prep_text'])

# Bring all sequences to MAX_SEQUENCE_LENGTH entries; the result is what the
# model is evaluated on below.
padded_sequences_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

## Model

In [36]:
from keras import backend as K, initializations
from keras.layers import Layer
class AttentionLayer(Layer):
    '''
    Custom layer that produces one attention weight per position along axis 1.

    Each position is scored by a dot product with a single learned vector
    ``Uw``; the exponentiated scores are divided by their sum over axis 1, so
    the weights of one sample add up to (approximately) one. The output keeps
    the input's leading dimensions and has a last dimension of size 1.

    NOTE(review): the ``initializations`` import and ``get_output_shape_for``
    hook suggest this targets the Keras 1.x layer API -- confirm before
    running on a newer Keras.
    '''
    def __init__(self, init='glorot_uniform', **kwargs):
        '''Store the weight initialiser (looked up by name) and enable masking.'''
        super(AttentionLayer, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializations.get(init)
        
    def build(self, input_shape):
        '''Create the scoring vector ``Uw`` with one entry per input feature.'''
        # Size of the last input axis, i.e. the length of each vector being scored.
        input_dim = input_shape[-1]
        self.Uw = self.init((input_dim, ))
        # Uw is the only weight this layer exposes for training.
        self.trainable_weights = [self.Uw]
        super(AttentionLayer, self).build(input_shape)  
    
    def compute_mask(self, input, mask):
        '''Pass the incoming mask through unchanged.'''
        return mask
    
    def call(self, x, mask=None):
        '''Return normalised attention weights for ``x`` with a trailing axis of size 1.

        Assumes ``x`` is (batch, timesteps, features) -- TODO confirm against
        the layer that feeds this one.
        '''
        # Unnormalised, strictly positive score for every position.
        multData =  K.exp(K.dot(x, self.Uw))
        if mask is not None:
            # Multiplying by the mask zeroes the scores wherever the mask is 0.
            multData = mask*multData
        # Normalise over axis 1; epsilon guards against division by zero when
        # every score of a sample is zero.
        output = multData/(K.sum(multData, axis=1)+K.epsilon())[:,None]
        # Append a trailing axis of size 1.
        # NOTE(review): indexing output.shape relies on the backend exposing
        # symbolic shapes this way -- confirm for the backend in use.
        return K.reshape(output, (output.shape[0],output.shape[1],1))

    def get_output_shape_for(self, input_shape):
        '''Return ``input_shape`` with its last dimension replaced by 1.'''
        newShape = list(input_shape)
        newShape[-1] = 1
        return tuple(newShape)

In [37]:
# load json and create model
# Read the architecture inside a context manager so the file handle is closed
# even if reading fails.
with open(MODEL_NAME + '.json', 'r') as json_file:
    loaded_model_json = json_file.read()
# The second argument maps the custom layer's name in the JSON to its class so
# the architecture can be rebuilt.
loaded_model = model_from_json(loaded_model_json, {'AttentionLayer': AttentionLayer})

# load weights into new model
loaded_model.load_weights(MODEL_NAME + ".hdf5")
# Metrics reported by evaluate(); their order fixes the positions in the
# returned scores list (after the loss).
metrics=['accuracy', 'fmeasure', 'precision', 'recall']
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
print("Loaded '%s' model from disk" % MODEL_NAME)

Loaded 'models/lstm_attention' model from disk


In [38]:
# Print the layer-by-layer architecture of the restored model.
loaded_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
words_input (InputLayer)         (None, 100)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 100, 300)      6000000     words_input[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 100, 300)      0           embedding_1[0][0]                
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 100, 128)      219648      dropout_1[0][0]                  
___________________________________________________________________________________________

In [39]:
# Evaluate the restored model on the padded test sequences.
scores = loaded_model.evaluate(padded_sequences_test, test_data['label'], verbose=1)
# Report the first two compiled metrics as percentages; position 0 (the loss)
# is skipped.
for position in (1, 2):
    print(' %s: %.2f%%' % (loaded_model.metrics_names[position], scores[position]*100))

 acc: 91.64%
 fmeasure: 91.36%


In [40]:
def padding(text):
    """Preprocess raw ``text`` and return it as a padded batch of one sequence.

    The text is normalised with ``preprocess``, converted to vocabulary
    indices with ``tokenizer`` and padded to ``MAX_SEQUENCE_LENGTH``.
    """
    sequences = tokenizer.texts_to_sequences([preprocess(text)])
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)


def lstm_predict(text):
    """Return the model's prediction for one raw review string.

    The value is element ``[0][0]`` of ``loaded_model.predict`` for the single
    padded sequence built from ``text``.
    """
    # padding() already runs preprocess(); preprocessing here as well would
    # normalise the text twice, so the model would see different input than
    # get_attention() and the evaluation data (both preprocessed once).
    return loaded_model.predict(padding(text))[0][0]

In [41]:
# Build a backend function that returns the output of every layer in one call.
inp = loaded_model.input  # the model's input placeholder
outputs = [each_layer.output for each_layer in loaded_model.layers]  # one tensor per layer
# Inputs are the data batch plus the backend learning-phase flag.
functor = K.function([inp, K.learning_phase()], outputs)

In [42]:
# Reverse lookup (vocabulary index -> word) built from the tokenizer's word -> index map.
index_to_word = dict((idx, token) for token, idx in tokenizer.word_index.items())

In [43]:
def get_attention(text):
    """Return ``(word, weight)`` pairs for the non-padding tokens of ``text``.

    The text is converted with ``padding``, pushed through ``functor`` with the
    learning-phase flag set to ``0.``, and the weights are read from the
    fourth-from-last layer output (presumably the attention layer -- TODO
    confirm against the model summary). Positions whose index is 0 are skipped.
    """
    token_ids = padding(text)
    activations = functor([token_ids, 0.])
    # First (only) sample of the selected layer; each row holds a single weight.
    weights = [row[0] for row in activations[-4][0]]
    return [
        (index_to_word[token_id], weight)
        for token_id, weight in zip(token_ids[0], weights)
        if token_id != 0
    ]

In [65]:
# Example 1: a review to inspect with the attention weights.
text = """

Not that I want to be mean but this movie really surprised me a lot. 
During the whole film, I was like...erm...what is this movie all about?
I don't get the animations at all.
Probably this movie will only be suitable for those who belongs to the 1980s. 
During the film, there is a group of people walked out. 
After the movie, many people said, "That's it?" 
Frankly speaking, I cannot believe that this movie was awarded the best children film award. 
If you are thinking of watching this film, I strongly recommend you not to. 
You will regret it. 
I'm not joking. 
You will find that you are just wasting both your time and money of you go and watch it.

"""
# Per-word attention weights for the review.
attention = get_attention(text)
# NOTE(review): 'Predicttion' is a typo for 'Prediction'; it is a runtime string
# and is left unchanged so it matches the recorded output.
print('Predicttion:', lstm_predict(text))
# Display the words ordered from highest to lowest attention weight.
sorted(attention, key=lambda x: x[1], reverse=True)

Predicttion: 0.10455


[('wasting', 0.11330451),
 ('money', 0.083290383),
 ('recommend', 0.074431881),
 ('time', 0.068038717),
 ('watch', 0.056578588),
 ('your', 0.055693366),
 ('it', 0.046308678),
 ('both', 0.039932225),
 ('go', 0.035543937),
 ('and', 0.029794395),
 ('of', 0.027411275),
 ('regret', 0.026645208),
 ('it', 0.024600193),
 ('and', 0.020526424),
 ('film', 0.019095136),
 ('watching', 0.018877855),
 ("i'm", 0.017901435),
 ('to', 0.017521612),
 ('you', 0.017066525),
 ('strongly', 0.015149771),
 ('award', 0.015087945),
 ('not', 0.013833558),
 ('you', 0.012877483),
 ('will', 0.011877784),
 ('you', 0.010480288),
 ('film', 0.010148853),
 ('this', 0.0098462021),
 ('not', 0.0094195763),
 ('thinking', 0.0082931491),
 ('of', 0.0082021374),
 ('i', 0.0068767862),
 ('find', 0.0064208643),
 ('if', 0.0058425488),
 ('just', 0.0057809521),
 ('joking', 0.0056338524),
 ('you', 0.005629878),
 ('you', 0.0053002806),
 ('will', 0.0046687615),
 ('that', 0.0038025293),
 ('you', 0.0037621895),
 ('best', 0.0032145926),
 ('c

In [82]:
# Example 2: a second review to inspect with the attention weights.
text = """

This charming film (editing tricks aside), is rich in humor and period detail, and amazingly suspenseful considering we already know the outcome.

"""
# Per-word attention weights for the review.
attention = get_attention(text)
# NOTE(review): 'Predicttion' is a typo for 'Prediction'; it is a runtime string
# and is left unchanged so it matches the recorded output.
print('Predicttion:', lstm_predict(text))
# Display the words ordered from highest to lowest attention weight.
sorted(attention, key=lambda x: x[1], reverse=True)

Predicttion: 0.999654


[('suspenseful', 0.38926497),
 ('we', 0.13195567),
 ('considering', 0.10877533),
 ('already', 0.087265417),
 ('know', 0.074578457),
 ('outcome', 0.041084044),
 ('the', 0.03928446),
 ('amazingly', 0.031950638),
 ('humor', 0.01694645),
 ('rich', 0.01563511),
 ('and', 0.011422137),
 ('and', 0.010519971),
 ('period', 0.010394074),
 ('in', 0.009793533),
 ('detail', 0.0065662442),
 ('is', 0.0044371397),
 ('aside', 0.0032916903),
 ('tricks', 0.0031648406),
 ('editing', 0.0017285636),
 ('film', 0.0010313196),
 ('charming', 0.00077899371),
 ('this', 5.6934809e-06)]