# Test model

In [1]:
MODEL_NAME = "models/lstm_word2vec"
TEST_DATASET = "data/test_imdb.csv"

TOKENIZER_NAME = "models/lstm_word2vec_tokenizer"

---

In [2]:
import nltk
import numpy
import pickle
numpy.random.seed(42)

from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from itertools import chain
from collections import Counter


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import model_from_json

Using Theano backend.


Swith on full text mode)

In [3]:
pd.options.display.max_colwidth = -1

## Load data

In [4]:
test_data = pd.read_csv(TEST_DATASET, sep="|")

## Preprocessing

In [5]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 70

In [6]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()

negatives = {
    "didn't": "didn_`_t",
    "couldn't": "couldn_`_t",
    "can't": "can_`_t",
    "don't": "don_`_t",
    "wouldn't": "wouldn_`_t",
    "doesn't": "doesn_`_t",
    "wasn't": "wasn_`_t",
    "weren't": "weren_`_t",
    "shouldn't":"shouldn_`_t",
    "isn't": "isn_`_t",
    "aren't": "aren_`_t",
}

def preprocess(text):
    text = text.lower()
    text = text.replace('<br />', ' ')
    text = ' '.join(tweet_tokenizer.tokenize(text))
    for k, v in negatives.items():
        text = text.replace(k, v)
    return text

In [7]:
test_data['prep_text'] = test_data['text'].map(preprocess)

#### 1. String -> Int vector

In [8]:
 #load tokenizer
pkl_file = open(TOKENIZER_NAME, 'rb')
tokenizer = pickle.load(pkl_file)
pkl_file.close()

#### 2. Padding

In [9]:
sequences_test = tokenizer.texts_to_sequences(test_data['prep_text'])

padded_sequences_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

## Model

In [14]:
from keras import backend as K, initializations
from keras.layers import Layer
class AttentionLayer(Layer):
    '''
    Attention layer. 
    '''
    def __init__(self, init='glorot_uniform', **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializations.get(init)
        
    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.Uw = self.init((input_dim, ))
        self.trainable_weights = [self.Uw]
        super(AttentionLayer, self).build(input_shape)  
    
    def compute_mask(self, input, mask):
        return mask
    
    def call(self, x, mask=None):
        multData =  K.exp(K.dot(x, self.Uw))
        if mask is not None:
            multData = mask*multData
        output = multData/(K.sum(multData, axis=1)+K.epsilon())[:,None]
        return K.reshape(output, (output.shape[0],output.shape[1],1))

    def get_output_shape_for(self, input_shape):
        newShape = list(input_shape)
        newShape[-1] = 1
        return tuple(newShape)

In [16]:
# load json and create model
json_file = open(MODEL_NAME + '.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json, {'AttentionLayer': AttentionLayer})

# load weights into new model
loaded_model.load_weights(MODEL_NAME + ".hdf5")
metrics=['accuracy', 'fmeasure', 'precision', 'recall']
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
print("Loaded '%s' model from disk" % MODEL_NAME)

Loaded 'models/lstm_word2vec' model from disk


In [17]:
loaded_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
words_input (InputLayer)         (None, 70)            0                                            
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 70, 300)       6000000     words_input[0][0]                
____________________________________________________________________________________________________
dropout_5 (Dropout)              (None, 70, 300)       0           embedding_4[0][0]                
____________________________________________________________________________________________________
lstm_3 (LSTM)                    (None, 70, 128)       219648      dropout_5[0][0]                  
___________________________________________________________________________________________

In [18]:
# evaluate the model
scores = loaded_model.evaluate(padded_sequences_test, test_data['label'], verbose=1)
print(' %s: %.2f%%' % (loaded_model.metrics_names[1], scores[1]*100))
print(' %s: %.2f%%' % (loaded_model.metrics_names[2], scores[2]*100))

 acc: 62.37%
 fmeasure: 56.52%


In [21]:
def padding(text):
    return pad_sequences(tokenizer.texts_to_sequences([preprocess(text)]), maxlen=MAX_SEQUENCE_LENGTH)
def lstm_predict(text):
    return loaded_model.predict(padding(preprocess(text)))[0][0]

In [22]:
inp = loaded_model.input                                           # input placeholder
outputs = [layer.output for layer in loaded_model.layers]          # all layer outputs
functor = K.function([inp]+ [K.learning_phase()], outputs ) # evaluation function

In [23]:
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

In [24]:
def get_attention(text):
    rez = []
    word_indexes = padding(text)
    layer_outs = functor([word_indexes, 0.])
    attention_weights = [w[0] for w in layer_outs[-4][0]]
    for word_index, attention in zip(word_indexes[0], attention_weights):
        if word_index != 0:
            rez.append((index_to_word[word_index], attention))
    return rez

In [26]:
text = """

i don't think the movie good

"""
attention = get_attention(text)
print('Predicttion:', lstm_predict(text))
sorted(attention, key=lambda x: x[1], reverse=True)

Predicttion: 0.731442


[('don_`_t', 0.27763912),
 ('good', 0.21825622),
 ('think', 0.19776627),
 ('i', 0.18019289),
 ('the', 0.063442692),
 ('movie', 0.062702805)]