# Test model

In [1]:
# Path prefix of the saved model; ".json" (architecture) and ".hdf5" (weights)
# are appended to it when the model is loaded.
MODEL_NAME = "models/lstm_word2vec"
# CSV file with the test set; it is read with "|" as the column separator.
TEST_DATASET = "data/test_imdb.csv"

# Pickled tokenizer file, loaded before texts are converted to sequences.
TOKENIZER_NAME = "models/lstm_word2vec_tokenizer"

---

In [3]:
import nltk
import numpy
import pickle
numpy.random.seed(42)

nltk.data.path.insert(0, "/dvl/w3/wn/data/.nltk_data")

from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from itertools import chain
from collections import Counter


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import model_from_json

Using TensorFlow backend.


Switch on full text mode

In [4]:
# Intended to show full cell text instead of truncating long strings in DataFrame output.
# NOTE(review): -1 is presumably the "no limit" sentinel of the pandas version this
# notebook was run with; newer pandas releases may expect None instead — confirm.
pd.options.display.max_colwidth = -1

## Load data

In [5]:
test_data = pd.read_csv(TEST_DATASET, sep="|")

## Preprocessing

In [6]:
# NOTE(review): MAX_NB_WORDS is not referenced in the visible cells — presumably the
# vocabulary cap used when the tokenizer was fitted; confirm against the training notebook.
MAX_NB_WORDS = 20000
# Length passed as `maxlen` to pad_sequences for every input sequence.
MAX_SEQUENCE_LENGTH = 70

In [7]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()

# Negative contractions mapped to a single apostrophe-free token: the apostrophe
# is replaced by the marker "_`_" (e.g. "didn't" -> "didn_`_t").
negatives = {
    contraction: contraction.replace("'", "_`_")
    for contraction in (
        "didn't",
        "couldn't",
        "don't",
        "wouldn't",
        "doesn't",
        "wasn't",
        "weren't",
        "shouldn't",
        "isn't",
        "aren't",
    )
}

def preprocess(text):
    """Normalise one raw review string before it is handed to the tokenizer.

    Steps, in order: lowercase the text, replace every '<br />' with a space,
    tokenize with ``tweet_tokenizer`` and re-join the tokens with single
    spaces, then substitute each key of ``negatives`` with its value.

    Returns the normalised string.
    """
    cleaned = text.lower().replace('<br />', ' ')
    cleaned = ' '.join(tweet_tokenizer.tokenize(cleaned))
    # Plain substring replacement, applied in the dict's iteration order.
    for contraction, marker in negatives.items():
        cleaned = cleaned.replace(contraction, marker)
    return cleaned

In [8]:
test_data['prep_text'] = test_data['text'].map(preprocess)

#### 1. String -> Int vector

In [9]:
 #load tokenizer
# Load the pickled tokenizer. The context manager closes the file even if
# unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code — only load tokenizer
# files that come from a trusted source.
with open(TOKENIZER_NAME, 'rb') as pkl_file:
    tokenizer = pickle.load(pkl_file)

#### 2. Padding

In [10]:
# Convert every preprocessed review into a sequence of integers using the loaded tokenizer.
sequences_test = tokenizer.texts_to_sequences(test_data['prep_text'])

# Bring every sequence to MAX_SEQUENCE_LENGTH entries (pad_sequences defaults otherwise).
padded_sequences_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

In [59]:
padded_sequences_test

array([[   11,     6,    87, ...,    55,    82,    17],
       [10361,   182,   175, ...,     4,     1,   320],
       [ 1375,     4,   492, ...,   149,   336,   556],
       ..., 
       [   44,    21,   201, ...,    13,   417,  5907],
       [  788, 19456,  4878, ...,     1,  6620,   181],
       [   15,    64,  3913, ...,   957,     4,  2144]], dtype=int32)

In [60]:
padded_sequences_test.shape

(10000, 70)

In [71]:
padded_sequences_test[:100]

array([[   11,     6,    87, ...,    55,    82,    17],
       [10361,   182,   175, ...,     4,     1,   320],
       [ 1375,     4,   492, ...,   149,   336,   556],
       ..., 
       [   11,     6,  1561, ...,     3,  1785,  4416],
       [   38,     2,  4315, ...,    11,   338,   177],
       [ 1825,    19,    96, ...,     3,     5,  1507]], dtype=int32)

In [88]:
padded_sequences_test.shape

(10000, 70)

In [61]:
padded_sequences_test[0]

array([   11,     6,    87,     8,     6,    19,     1,  2116,  5349,
        8931,    15,    44,    30,   953,    72,  9375,    97,    22,
          36,  8644,     3,    23, 12342,  1428,   812,    37,   225,
          26,    81,   129,  2892,    87,     5,   506,   102,    10,
          62,    25,  5862,    14,   159,   706,  1711,    98,    67,
          25,    52,    15,     1,   100,    26,     5,    25,  4387,
           3,    56,    24,   979,    11, 13373,     7,     9,    17,
           5,   285,     8,     2,    55,    82,    17], dtype=int32)

In [16]:
np.array([np.hstack(([0], padded_sequences_test[0][:3], padded_sequences_test[0][4:]))]).shape

(1, 70)

In [17]:
list(generate(padded_sequences_test[:1]))[0].shape

(70,)

In [27]:
# Invert the tokenizer's word -> index mapping so integer sequences can be turned back into words.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

## Model

In [18]:
# Load the architecture from JSON and build the model. The context manager
# closes the file even if reading fails.
with open(MODEL_NAME + '.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Load the trained weights into the freshly built model.
loaded_model.load_weights(MODEL_NAME + ".hdf5")
# NOTE(review): the string metrics 'fmeasure', 'precision' and 'recall' presumably
# require the Keras version this notebook was written for — confirm before upgrading.
metrics = ['accuracy', 'fmeasure', 'precision', 'recall']
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
print("Loaded '%s' model from disk" % MODEL_NAME)

Loaded 'models/lstm_word2vec' model from disk


In [19]:
loaded_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 70, 300)       6000000     embedding_input_1[0][0]          
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 256)           439296      embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 256)           0           bidirectional_1[0][0]            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 1)             257         dropout_1[0][0]                  
Total params: 6,439,553
Trainable params: 439,553
Non-trainable params: 6,000,000
_________

In [20]:
def padding(text):
    """Turn one (already preprocessed) string into a padded one-row batch.

    The text is wrapped in a single-element list, converted with
    ``tokenizer.texts_to_sequences`` and passed through ``pad_sequences``
    with ``maxlen=MAX_SEQUENCE_LENGTH``; the result of that call is returned.
    """
    as_indices = tokenizer.texts_to_sequences([text])
    return pad_sequences(as_indices, maxlen=MAX_SEQUENCE_LENGTH)

In [21]:
def lstm_predict(text):
    """Score a raw text with the loaded model.

    The text is preprocessed and padded, fed to ``loaded_model.predict`` and
    the element ``[0][0]`` of the model output is returned.
    """
    model_input = padding(preprocess(text))
    scores = loaded_model.predict(model_input)
    return scores[0][0]

In [22]:
lstm_predict("not bad")

0.65912336

In [130]:
lstm_predict("good")

0.91656721

In [116]:
def generate(sequence):
    """Build the leave-one-out variants of ``sequence``.

    Row ``i`` of the result is ``sequence`` with element ``i`` removed and a
    single 0 prepended, so every row is as long as the input.
    NOTE(review): the leading 0 presumably mimics the front zero-padding
    produced by pad_sequences — confirm.

    Returns ``np.array`` of the rows (one row per input position).
    """
    variants = [
        np.hstack(([0], sequence[:pos], sequence[pos + 1:]))
        for pos in range(len(sequence))
    ]
    return np.array(variants)

In [215]:
def predict(sequences):
    """Run ``loaded_model.predict`` on a batch and return a flat list.

    Each element of the returned list is item 0 of the corresponding
    model output row.
    """
    outputs = loaded_model.predict(sequences)
    flat = []
    for row in outputs:
        flat.append(row[0])
    return flat

In [93]:
def compare(sequences, seq_preds):
    """Collect, per token, how much removing it changes the prediction.

    For every sequence the leave-one-out variants are built with ``generate``
    and scored with ``predict``. For the token at position ``i`` the value
    ``seq_preds[si] - preds[i]`` is recorded, i.e. the full-sequence
    prediction minus the prediction with that token removed.

    Parameters
    ----------
    sequences : iterable of token sequences.
    seq_preds : predictions for ``sequences``, indexed in the same order.

    Returns
    -------
    dict mapping each token to a list of ``(difference, sequence_index)`` tuples.
    """
    word_scores = {}
    for si, seq in enumerate(sequences):
        # One prediction per position, each with that position's token removed.
        preds = predict(generate(seq))
        for i, word in enumerate(seq):
            delta = seq_preds[si] - preds[i]
            word_scores.setdefault(word, []).append((delta, si))
    return word_scores

In [128]:
def seq_to_words(sequence):
    """Render a sequence of token indices as a space-separated string.

    Indices missing from ``index_to_word`` are rendered as "NA".
    """
    return " ".join(index_to_word.get(word_idx, "NA") for word_idx in sequence)

In [156]:
def inspect_sample(sample_id):
    """Explain the prediction for one row of ``padded_sequences_test``.

    Returns a 3-tuple:
      * the sample rendered as words,
      * the model prediction for the unmodified sample,
      * a list of ``[word, pred - pred_without_that_position]`` pairs, one
        per position ("NA" for indices missing from ``index_to_word``).
    """
    sample = padded_sequences_test[sample_id]
    pred = predict(np.array([sample]))[0]
    # Predictions for every leave-one-out variant of the sample.
    hacked_preds = predict(generate(sample))
    result = [
        [index_to_word.get(word_idx, "NA"), pred - hacked_preds[i]]
        for i, word_idx in enumerate(sample)
    ]
    return seq_to_words(sample), pred, result

In [229]:
def inspect_text(text):
    """Explain the prediction for an arbitrary raw text.

    The text is preprocessed and padded first; the return value has the same
    layout as for a test-set sample: ``(words, prediction, per_position)``,
    where ``per_position`` holds ``[word, pred - pred_without_that_position]``
    pairs ("NA" for indices missing from ``index_to_word``).
    """
    sample = padding(preprocess(text))[0]
    pred = predict(np.array([sample]))[0]
    # Predictions for every leave-one-out variant of the sample.
    hacked_preds = predict(generate(sample))
    result = [
        [index_to_word.get(word_idx, "NA"), pred - hacked_preds[i]]
        for i, word_idx in enumerate(sample)
    ]
    return seq_to_words(sample), pred, result

In [132]:
# Per-token prediction differences over the first 1000 test sequences; compare() re-scores
# every leave-one-out variant, so the model is evaluated once per token position.
word_scores = compare(padded_sequences_test[:1000], predict(padded_sequences_test[:1000]))

In [136]:
# Summarise, per token, the prediction differences collected in `word_scores`.
word_values = []
for word_index, scores in word_scores.items():
    # Tuples sort by difference first, then by sample id.
    ordered = sorted(scores)
    lowest, highest = ordered[0], ordered[-1]
    deltas = [entry[0] for entry in ordered]
    word_values.append({
        'word': index_to_word.get(word_index, 'NAN'),
        'min': lowest[0],
        'max': highest[0],
        'mean': np.mean(ordered, axis=0)[0],
        # Mean absolute difference.
        'value': sum(abs(delta) for delta in deltas) / len(ordered),
        'count': len(ordered),
        'pos_count': sum(1 for delta in deltas if delta > 0),
        'neg_count': sum(1 for delta in deltas if delta < 0),
        'min_sample_id': lowest[1],
        'max_sample_id': highest[1],
    })

In [137]:
rez = pd.DataFrame(word_values)

In [138]:
rez.shape

(7651, 10)

In [144]:
rez[rez['word'] == 'bad']

Unnamed: 0,count,max,max_sample_id,mean,min,min_sample_id,neg_count,pos_count,value,word
82,129,0.526629,213,-0.055462,-0.8364,0,102,27,0.067578,bad


In [211]:
# Tokens with the largest positive differences. Both filters are combined into one
# boolean mask: chaining `rez[mask_a][mask_b]` indexes the already-filtered frame with
# a mask built from the full frame, which makes pandas reindex the mask and emit a
# "Boolean Series key will be reindexed" UserWarning.
rez[(rez['value'] > 0.01) & (rez['pos_count'] > 10)].sort_values(by=['max'], ascending=False)[:60]

  if __name__ == '__main__':


Unnamed: 0,count,max,max_sample_id,mean,min,min_sample_id,neg_count,pos_count,value,word
483,17,0.893484,424,0.086074,-0.005926,73,3,14,0.086981,brilliant
808,22,0.884046,815,0.131202,-0.016953,692,2,20,0.132895,7
43,229,0.751631,424,-0.001764,-0.337282,463,141,88,0.039866,just
79,158,0.688073,422,0.05554,-0.185119,795,30,128,0.064963,great
333,22,0.687929,683,0.051819,-0.067635,803,10,12,0.066961,entertaining
881,15,0.685247,850,0.104556,-0.03917,331,2,13,0.109779,9
818,19,0.642763,542,0.067622,-0.02987,832,3,16,0.070965,8
499,33,0.614869,526,0.036665,-0.281842,795,14,19,0.07139,5
261,62,0.609025,619,0.013216,-0.474269,718,33,29,0.068604,worth
21,601,0.5848,213,0.002506,-0.333722,445,273,327,0.033289,you


In [210]:
# Tokens with the largest negative differences. Both filters are combined into one
# boolean mask: chaining `rez[mask_a][mask_b]` indexes the already-filtered frame with
# a mask built from the full frame, which makes pandas reindex the mask and emit a
# "Boolean Series key will be reindexed" UserWarning.
rez[(rez['value'] > 0.05) & (rez['neg_count'] > 10)].sort_values(by=['min'], ascending=True)[:60]

  if __name__ == '__main__':


Unnamed: 0,count,max,max_sample_id,mean,min,min_sample_id,neg_count,pos_count,value,word
202,32,0.3703342,852,-0.04773,-0.892527,750,22,10,0.091175,isn_`_t
82,129,0.5266286,213,-0.055462,-0.8364,0,102,27,0.067578,bad
550,24,0.008945107,662,-0.055339,-0.829646,230,19,5,0.056177,horrible
20,449,0.4586562,206,-0.018416,-0.820739,670,256,193,0.06144,not
1098,16,0.0008026576,365,-0.101393,-0.782653,138,15,1,0.101493,wasted
489,34,6.771693e-05,342,-0.088999,-0.754009,611,32,2,0.089003,waste
253,25,0.008414049,92,-0.101492,-0.750518,490,24,1,0.102165,worst
510,38,0.3218423,526,-0.080531,-0.724063,879,27,11,0.137892,4
388,32,0.03797973,803,-0.051805,-0.722428,997,20,12,0.055736,stupid
203,55,0.1884968,95,-0.018657,-0.698737,177,30,25,0.058428,without


In [212]:
inspect_sample(213)

("respective marketing campaign the only one minor drawback about the cure for me is its short length only 97 minutes i don_`_t want to write more about the movie because it's simply impossible to put its beauty and sincerity into words so if you have any opportunity for watching the cure rent it or buy it and you wouldn_`_t be disappointed 10 out of 10 sorry for my bad english",
 0.72480643,
 [['respective', 0.28351146],
  ['marketing', 0.40891713],
  ['campaign', 0.53881943],
  ['the', 0.4809525],
  ['only', 0.54645371],
  ['one', 0.55709779],
  ['minor', 0.61442029],
  ['drawback', 0.61928368],
  ['about', 0.55093867],
  ['the', 0.53439057],
  ['cure', 0.43145785],
  ['for', 0.52827203],
  ['me', 0.49803722],
  ['is', 0.53499007],
  ['its', 0.53831023],
  ['short', 0.51344311],
  ['length', 0.52338743],
  ['only', 0.53354895],
  ['97', 0.51407772],
  ['minutes', 0.46582511],
  ['i', 0.49801889],
  ['don_`_t', 0.53662819],
  ['want', 0.5320974],
  ['to', 0.52469492],
  ['write', 0.50

In [249]:
inspect_text("respective marketing campaign the only one minor drawback about the cure for me is its short length only 97 minutes i don't want to write more about the movie because it's simply impossible to put its beauty and sincerity into words so if you have any opportunity for watching the cure rent it or buy it and you wouldn't be disappointed 10 out of 10 sorry for my bad english")

("respective marketing campaign the only one minor drawback about the cure for me is its short length only 97 minutes i don_`_t want to write more about the movie because it's simply impossible to put its beauty and sincerity into words so if you have any opportunity for watching the cure rent it or buy it and you wouldn_`_t be disappointed 10 out of 10 sorry for my bad english",
 0.72480643,
 [['respective', 0.28351146],
  ['marketing', 0.40891713],
  ['campaign', 0.53881943],
  ['the', 0.4809525],
  ['only', 0.54645371],
  ['one', 0.55709779],
  ['minor', 0.61442029],
  ['drawback', 0.61928368],
  ['about', 0.55093867],
  ['the', 0.53439057],
  ['cure', 0.43145785],
  ['for', 0.52827203],
  ['me', 0.49803722],
  ['is', 0.53499007],
  ['its', 0.53831023],
  ['short', 0.51344311],
  ['length', 0.52338743],
  ['only', 0.53354895],
  ['97', 0.51407772],
  ['minutes', 0.46582511],
  ['i', 0.49801889],
  ['don_`_t', 0.53662819],
  ['want', 0.5320974],
  ['to', 0.52469492],
  ['write', 0.50