# Test model

In [1]:
# Path prefix of the saved model; ".json" (architecture) and ".hdf5" (weights)
# are appended to it when the model is loaded.
MODEL_NAME = "models/lstm_word2vec"
# CSV file with the test texts (read with "|" as the field separator).
TEST_DATASET = "data/test_imdb.csv"

# Pickled tokenizer used to turn texts into sequences of word ids.
TOKENIZER_NAME = "models/lstm_word2vec_tokenizer"

---

In [3]:
import nltk
import numpy
import pickle
numpy.random.seed(42)

nltk.data.path.insert(0, "/dvl/w3/wn/data/.nltk_data")

from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from itertools import chain
from collections import Counter


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import model_from_json

Using TensorFlow backend.


Switch on full text mode (do not truncate long text columns when displaying DataFrames):

In [4]:
# -1 turns off truncation of long cell contents when DataFrames are displayed.
# NOTE(review): newer pandas releases deprecate -1 in favour of None — confirm
# against the installed pandas version before changing it.
pd.options.display.max_colwidth = -1

## Load data

In [5]:
# The test set uses "|" as the field separator.
test_data = pd.read_csv(TEST_DATASET, sep="|")

## Preprocessing

In [6]:
# NOTE(review): not referenced in the visible cells; presumably the vocabulary
# cap used when the tokenizer was fitted — confirm against the training notebook.
MAX_NB_WORDS = 20000
# Length every encoded text is brought to (passed as `maxlen` to pad_sequences).
MAX_SEQUENCE_LENGTH = 70

In [7]:
from nltk.tokenize import TweetTokenizer
# Tokenizer applied to every text inside preprocess().
tweet_tokenizer = TweetTokenizer()

# Negative contractions and the placeholder each one is rewritten to by
# preprocess().
# NOTE(review): presumably this keeps the contraction as a single token in the
# later word-id encoding — confirm against the training pipeline.
negatives = {
    "didn't": "didn_`_t",
    "couldn't": "couldn_`_t",
    "don't": "don_`_t",
    "wouldn't": "wouldn_`_t",
    "doesn't": "doesn_`_t",
    "wasn't": "wasn_`_t",
    "weren't": "weren_`_t",
    "shouldn't":"shouldn_`_t",
    "isn't": "isn_`_t",
    "aren't": "aren_`_t",
}

def preprocess(text):
    """Normalise a raw text into the form that is fed to the tokenizer.

    The text is lower-cased, HTML line breaks are replaced with spaces, the
    result is split with ``tweet_tokenizer`` and re-joined on single spaces,
    and finally every contraction listed in ``negatives`` is replaced by its
    placeholder.
    """
    cleaned = text.lower().replace('<br />', ' ')
    cleaned = ' '.join(tweet_tokenizer.tokenize(cleaned))
    for contraction, placeholder in negatives.items():
        cleaned = cleaned.replace(contraction, placeholder)
    return cleaned

In [8]:
# Keep the raw 'text' column and store the preprocessed version next to it.
test_data['prep_text'] = test_data['text'].map(preprocess)

#### 1. String -> Int vector

In [9]:
 #load tokenizer
# The context manager closes the file even if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code — only load tokenizer
# files produced by a trusted training run.
with open(TOKENIZER_NAME, 'rb') as pkl_file:
    tokenizer = pickle.load(pkl_file)

#### 2. Padding

In [10]:
# Encode each preprocessed text as a list of word ids.
sequences_test = tokenizer.texts_to_sequences(test_data['prep_text'])

# Bring every encoded text to exactly MAX_SEQUENCE_LENGTH ids.
padded_sequences_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

In [59]:
padded_sequences_test

array([[   11,     6,    87, ...,    55,    82,    17],
       [10361,   182,   175, ...,     4,     1,   320],
       [ 1375,     4,   492, ...,   149,   336,   556],
       ..., 
       [   44,    21,   201, ...,    13,   417,  5907],
       [  788, 19456,  4878, ...,     1,  6620,   181],
       [   15,    64,  3913, ...,   957,     4,  2144]], dtype=int32)

In [60]:
padded_sequences_test.shape

(10000, 70)

In [71]:
padded_sequences_test[:100]

array([[   11,     6,    87, ...,    55,    82,    17],
       [10361,   182,   175, ...,     4,     1,   320],
       [ 1375,     4,   492, ...,   149,   336,   556],
       ..., 
       [   11,     6,  1561, ...,     3,  1785,  4416],
       [   38,     2,  4315, ...,    11,   338,   177],
       [ 1825,    19,    96, ...,     3,     5,  1507]], dtype=int32)

In [88]:
padded_sequences_test.shape

(10000, 70)

In [61]:
padded_sequences_test[0]

array([   11,     6,    87,     8,     6,    19,     1,  2116,  5349,
        8931,    15,    44,    30,   953,    72,  9375,    97,    22,
          36,  8644,     3,    23, 12342,  1428,   812,    37,   225,
          26,    81,   129,  2892,    87,     5,   506,   102,    10,
          62,    25,  5862,    14,   159,   706,  1711,    98,    67,
          25,    52,    15,     1,   100,    26,     5,    25,  4387,
           3,    56,    24,   979,    11, 13373,     7,     9,    17,
           5,   285,     8,     2,    55,    82,    17], dtype=int32)

In [16]:
np.array([np.hstack(([0], padded_sequences_test[0][:3], padded_sequences_test[0][4:]))]).shape

(1, 70)

In [17]:
list(generate(padded_sequences_test[:1]))[0].shape

(70,)

In [27]:
# Reverse lookup table of the tokenizer vocabulary: word id -> word.
index_to_word = dict(
    (word_id, word) for word, word_id in tokenizer.word_index.items()
)

## Model

In [18]:
# Rebuild the architecture from its JSON description; the context manager
# closes the file even if reading fails.
with open(MODEL_NAME + '.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Load the trained weights into the freshly built model.
loaded_model.load_weights(MODEL_NAME + ".hdf5")
# NOTE(review): 'fmeasure', 'precision' and 'recall' as string metrics are
# presumably only available in older Keras releases — confirm the installed
# version before upgrading.
metrics=['accuracy', 'fmeasure', 'precision', 'recall']
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
print("Loaded '%s' model from disk" % MODEL_NAME)

Loaded 'models/lstm_word2vec' model from disk


In [19]:
loaded_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 70, 300)       6000000     embedding_input_1[0][0]          
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 256)           439296      embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 256)           0           bidirectional_1[0][0]            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 1)             257         dropout_1[0][0]                  
Total params: 6,439,553
Trainable params: 439,553
Non-trainable params: 6,000,000
_________

In [20]:
def padding(text):
    """Encode one preprocessed text as word ids and pad it to MAX_SEQUENCE_LENGTH.

    The text is wrapped in a one-element list, so the result holds a single
    padded sequence.
    """
    encoded = tokenizer.texts_to_sequences([text])
    return pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH)

In [21]:
def lstm_predict(text):
    """Return the model's score for a single raw text.

    The text goes through ``preprocess`` and ``padding`` before prediction; the
    value returned is element ``[0][0]`` of the model output.
    """
    model_input = padding(preprocess(text))
    scores = loaded_model.predict(model_input)
    return scores[0][0]

In [22]:
lstm_predict("not bad")

0.65912336

In [130]:
lstm_predict("good")

0.91656721

In [116]:
def generate(sequence):
    """Build every leave-one-out variant of a 1-D sequence.

    Variant ``i`` is the input with position ``i`` removed and a ``0``
    prepended, so each variant keeps the original length.  The variants are
    stacked into one array with one row per removed position.
    """
    variants = [
        np.hstack(([0], sequence[:skipped], sequence[skipped + 1:]))
        for skipped in range(len(sequence))
    ]
    return np.array(variants)

In [25]:
def predict(hacked):
    """Run the loaded model on a batch and return the first output of each row as a list."""
    batch_output = loaded_model.predict(hacked)
    return [row[0] for row in batch_output]

In [93]:
def compare(sequences, seq_preds):
    """Collect, for every word id, how much removing it shifts the prediction.

    For each sequence a batch of leave-one-out variants is built with
    ``generate`` and scored with ``predict``.  The entry recorded for the word
    at position ``i`` is ``(seq_preds[si] - preds[i], si)``: the prediction for
    the full sequence minus the prediction without that position, together
    with the index of the sequence.

    Returns a dict mapping word id -> list of ``(delta, sequence index)``.
    """
    scores_by_word = {}
    for seq_idx, seq in enumerate(sequences):
        # One prediction per removed position.
        ablated_preds = predict(generate(seq))
        baseline = seq_preds[seq_idx]
        for pos, word in enumerate(seq):
            delta = baseline - ablated_preds[pos]
            scores_by_word.setdefault(word, []).append((delta, seq_idx))
    return scores_by_word

In [128]:
def seq_to_words(sequence):
    """Turn a sequence of word ids back into a space-separated string.

    Ids that are missing from ``index_to_word`` are rendered as "NA".
    """
    return " ".join(index_to_word.get(word_idx, "NA") for word_idx in sequence)

In [154]:
def inspect_sample(sample_id):
    """Show the per-position effect of removing words from one test sample.

    Returns a tuple ``(text, effects)``: ``text`` is the sample decoded with
    ``seq_to_words`` and ``effects`` holds one ``[word, delta]`` entry per
    position, where ``delta`` is the prediction for the full sample minus the
    prediction for the variant without that position.
    """
    sample = padded_sequences_test[sample_id]
    baseline = predict(np.array([sample]))[0]
    ablated_preds = predict(generate(sample))
    effects = [
        [index_to_word.get(word_idx, "NA"), baseline - ablated_preds[pos]]
        for pos, word_idx in enumerate(sample)
    ]
    return seq_to_words(sample), effects

In [132]:
# Score words over the first 1000 test samples only; compare() runs one extra
# model prediction batch per sample.
word_scores = compare(padded_sequences_test[:1000], predict(padded_sequences_test[:1000]))

In [136]:
# Collapse the per-occurrence scores of every word id into one summary record.
word_values = []
for word_index, scores in word_scores.items():
    # Each entry is (score delta, sample id); sorting orders by delta first,
    # so the first/last entries carry the extreme deltas and their sample ids.
    s = sorted(scores)
    Min = s[0]
    Max = s[-1]
    word_values.append({
        # Ids missing from the vocabulary lookup are labelled 'NAN'.
        # NOTE(review): presumably this is where the padding id 0 ends up — confirm.
        'word': index_to_word.get(word_index, 'NAN'),
        'min': Min[0],
        'max': Max[0],
        'mean': np.mean(s, axis=0)[0],  # column 0 of the column-wise mean = mean delta
        'value': sum([abs(si[0]) for si in s])/len(s),  # mean absolute delta
        'count': len(s),
        'pos_count': len([s0 for s0 in s if s0[0] > 0]),  # occurrences with delta > 0
        'neg_count': len([s0 for s0 in s if s0[0] < 0]),  # occurrences with delta < 0
        'min_sample_id': Min[1],
        'max_sample_id': Max[1],
    })

In [137]:
# One row per word id, one column per summary statistic.
rez = pd.DataFrame(word_values)

In [138]:
rez.shape

(7651, 10)

In [144]:
rez[rez['word'] == 'bad']

Unnamed: 0,count,max,max_sample_id,mean,min,min_sample_id,neg_count,pos_count,value,word
82,129,0.526629,213,-0.055462,-0.8364,0,102,27,0.067578,bad


In [148]:
# Combine both conditions into a single mask. Chaining two boolean selections
# applies the second mask (built on the full frame) to the already-filtered
# frame, which makes pandas emit a "Boolean Series key will be reindexed"
# UserWarning; the selected rows are the same either way.
rez[(rez['value'] > 0.05) & (rez['pos_count'] > 3)].sort_values(by=['value'], ascending=False)

  if __name__ == '__main__':


Unnamed: 0,count,max,max_sample_id,mean,min,min_sample_id,neg_count,pos_count,value,word
3391,5,0.456302,213,0.177153,-0.003280,262,1,4,0.178465,cure
1448,5,0.502542,213,0.115155,-0.112754,383,1,4,0.160257,opportunity
510,38,0.321842,526,-0.080531,-0.724063,879,27,11,0.137892,4
808,22,0.884046,815,0.131202,-0.016953,692,2,20,0.132895,7
981,6,0.225996,789,-0.018612,-0.443536,576,2,4,0.129286,meet
744,15,0.588446,213,-0.039198,-0.490372,718,9,6,0.127286,disappointed
1562,6,0.511646,940,0.088551,-0.086811,623,2,4,0.117523,wonderfully
1801,4,0.356187,469,0.115665,0.000590,681,0,4,0.115665,reminds
2252,4,0.425031,15,0.113830,0.000140,368,0,4,0.113830,sensitive
2930,8,0.286658,965,0.112643,-0.002414,187,1,7,0.113247,daring


In [155]:
inspect_sample(776)

("so so episodes of star trek the next generation that thinks it has something important to say you'll see every plot twist a mile off in this by the numbers romp however it's worth seeing for its portrayal of drag king prostitutes a brothel where young women pay old men to have sex with them how's that for role reversal and lesbian soap operas the ghost of valerie lives !",
 [['so', -0.12658682],
  ['so', -0.12658682],
  ['episodes', 0.060997218],
  ['of', -0.15322432],
  ['star', -0.24555662],
  ['trek', 0.081950992],
  ['the', -0.14997968],
  ['next', -0.15827957],
  ['generation', -0.02629602],
  ['that', -0.084673554],
  ['thinks', -0.19380382],
  ['it', -0.055898994],
  ['has', -0.05431062],
  ['something', -0.083150059],
  ['important', -0.010306358],
  ['to', -0.028164923],
  ['say', -0.20885143],
  ["you'll", -0.026362062],
  ['see', -0.044323981],
  ['every', -0.084624797],
  ['plot', -0.12561074],
  ['twist', 0.015751749],
  ['a', -0.047543168],
  ['mile', -0.17559072],
  ['