# Test model

In [12]:
MODEL_NAME = "models/lstm_word2vec"
TEST_DATASET = "data/test_imdb.csv"

TOKENIZER_NAME = "models/lstm_word2vec_tokenizer"

---

In [2]:
import nltk
import numpy
import pickle
numpy.random.seed(42)

from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from itertools import chain
from collections import Counter


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import model_from_json

Using TensorFlow backend.


Swith on full text mode)

In [3]:
pd.options.display.max_colwidth = -1

## Load data

In [4]:
test_data = pd.read_csv(TEST_DATASET, sep="|")

## Preprocessing

In [5]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 70

In [6]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()

negatives = {
    "didn't": "didn_`_t",
    "couldn't": "couldn_`_t",
    "don't": "don_`_t",
    "wouldn't": "wouldn_`_t",
    "doesn't": "doesn_`_t",
    "wasn't": "wasn_`_t",
    "weren't": "weren_`_t",
    "shouldn't":"shouldn_`_t",
    "isn't": "isn_`_t",
    "aren't": "aren_`_t",
}

def preprocess(text):
    text = text.lower()
    text = text.replace('<br />', ' ')
    text = ' '.join(tweet_tokenizer.tokenize(text))
    for k, v in negatives.items():
        text = text.replace(k, v)
    return text

In [7]:
test_data['prep_text'] = test_data['text'].map(preprocess)

#### 1. String -> Int vector

In [8]:
 #load tokenizer
pkl_file = open(TOKENIZER_NAME, 'rb')
tokenizer = pickle.load(pkl_file)
pkl_file.close()

#### 2. Padding

In [9]:
sequences_test = tokenizer.texts_to_sequences(test_data['prep_text'])

padded_sequences_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

In [59]:
padded_sequences_test

array([[   11,     6,    87, ...,    55,    82,    17],
       [10361,   182,   175, ...,     4,     1,   320],
       [ 1375,     4,   492, ...,   149,   336,   556],
       ..., 
       [   44,    21,   201, ...,    13,   417,  5907],
       [  788, 19456,  4878, ...,     1,  6620,   181],
       [   15,    64,  3913, ...,   957,     4,  2144]], dtype=int32)

In [60]:
padded_sequences_test.shape

(10000, 70)

In [71]:
padded_sequences_test[:100]

array([[   11,     6,    87, ...,    55,    82,    17],
       [10361,   182,   175, ...,     4,     1,   320],
       [ 1375,     4,   492, ...,   149,   336,   556],
       ..., 
       [   11,     6,  1561, ...,     3,  1785,  4416],
       [   38,     2,  4315, ...,    11,   338,   177],
       [ 1825,    19,    96, ...,     3,     5,  1507]], dtype=int32)

In [88]:
padded_sequences_test.shape

(10000, 70)

In [61]:
padded_sequences_test[0]

array([   11,     6,    87,     8,     6,    19,     1,  2116,  5349,
        8931,    15,    44,    30,   953,    72,  9375,    97,    22,
          36,  8644,     3,    23, 12342,  1428,   812,    37,   225,
          26,    81,   129,  2892,    87,     5,   506,   102,    10,
          62,    25,  5862,    14,   159,   706,  1711,    98,    67,
          25,    52,    15,     1,   100,    26,     5,    25,  4387,
           3,    56,    24,   979,    11, 13373,     7,     9,    17,
           5,   285,     8,     2,    55,    82,    17], dtype=int32)

In [91]:
np.array([np.hstack(([0], padded_sequences_test[0][:3], padded_sequences_test[0][4:]))]).shape

(1, 70)

In [99]:
list(generate(padded_sequences_test[:1]))[0].shape

(70, 70)

## Model

In [13]:
# load json and create model
json_file = open(MODEL_NAME + '.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights(MODEL_NAME + ".hdf5")
metrics=['accuracy', 'fmeasure', 'precision', 'recall']
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
print("Loaded '%s' model from disk" % MODEL_NAME)

Loaded 'models/lstm_word2vec' model from disk


In [14]:
loaded_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 70, 300)       6000000     embedding_input_2[0][0]          
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 256)           439296      embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 256)           0           bidirectional_1[0][0]            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 1)             257         dropout_1[0][0]                  
Total params: 6,439,553
Trainable params: 439,553
Non-trainable params: 6,000,000
_________

In [204]:
def padding(text):
    return pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=MAX_SEQUENCE_LENGTH)

In [205]:
def lstm_predict(text):
    return loaded_model.predict(padding(preprocess(text)))[0][0]

In [228]:
lstm_predict("not bad")

0.65912354

In [217]:
lstm_predict("good")

0.91656721

In [146]:
def generate(sequence):
    rez = []
    for i in range(len(sequence)):
        rez.append(np.hstack(([0], padded_sequences_test[0][:i], padded_sequences_test[0][i+1:])))
    return np.array(rez)

In [154]:
def predict(hacked):
    return [p[0] for p in loaded_model.predict(hacked)]

In [140]:
def compare(sequences, seq_preds):
    word_scores = {}
    for si, seq in enumerate(sequences):
        
        hacked = generate(seq) # 69x70
        preds = predict(hacked)
        
        #70x1
        for i, word in enumerate(seq):
            
            seq_score = seq_preds[si]
            
            if word not in word_scores:
                word_scores[word] = []
            
            word_scores[word].append(seq_score - preds[i])
    return word_scores

In [161]:
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

In [201]:
word_scores = compare(padded_sequences_test, predict(padded_sequences_test))

In [202]:
word_values = []
for word_index, scores in word_scores.items():
    s = sorted(scores)
    word_values.append({
        'word': index_to_word.get(word_index, 'NAN'),
        'min': s[0],
        'max': s[-1],
        'mean': np.mean(s),
        'count': len(s),
        'pos_count': len([s0 for s0 in s if s0 > 0]),
        'neg_count': len([s0 for s0 in s if s0 < 0]),
    })

In [206]:
rez = pd.DataFrame(word_values)

In [207]:
rez.shape

(17629, 7)

In [208]:
rez[rez['word'] == 'bad']

Unnamed: 0,count,max,mean,min,neg_count,pos_count,word
82,1389,0.99489,0.165815,-0.839681,586,803,bad


In [215]:
rez[rez['mean'] > 0.9][rez['pos_count'] > 3]

  if __name__ == '__main__':


Unnamed: 0,count,max,mean,min,neg_count,pos_count,word
1565,40,0.995934,0.942692,-0.005341,1,39,wonderfully
1697,38,0.995151,0.929684,0.159606,0,38,delightful
2266,25,0.993663,0.907994,0.033720,0,25,refreshing
2315,20,0.994003,0.953459,0.651560,0,20,ensemble
2864,15,0.996041,0.906866,0.407500,0,15,courage
2937,16,0.993790,0.910703,0.398391,0,16,mickey
3005,19,0.994245,0.925410,0.199421,0,19,musicals
3023,8,0.994207,0.985540,0.940957,0,8,riveting
3059,14,0.994041,0.960329,0.699155,0,14,superbly
3069,25,0.993456,0.918839,0.155415,0,25,daily
