# Test model

In [1]:
# Path prefix of the saved model; ".json" and ".hdf5" are appended to it when the model is loaded.
MODEL_NAME = "models/lstm_word2vec"
# Test set, read below as a "|"-separated CSV.
TEST_DATASET = "data/test_imdb.csv"

# Pickled tokenizer object that converts text to integer sequences.
TOKENIZER_NAME = "models/lstm_word2vec_tokenizer"

---

In [2]:
import nltk
import numpy
import pickle
numpy.random.seed(42)

nltk.data.path.insert(0, "/dvl/w3/wn/data/.nltk_data")

from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from itertools import chain
from collections import Counter


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import model_from_json

Using TensorFlow backend.


Switch on full text mode

In [3]:
# -1 turns off column-width truncation so whole review texts are shown.
# NOTE(review): pandas >= 1.0 deprecates -1 in favour of None — confirm the pinned pandas version before changing.
pd.options.display.max_colwidth = -1

## Load data

In [4]:
# Pipe-delimited CSV; later cells read its 'text' and 'label' columns.
test_data = pd.read_csv(TEST_DATASET, sep="|")

## Preprocessing

In [5]:
# Not referenced in the visible cells — presumably the vocabulary cap used at training time (TODO confirm).
MAX_NB_WORDS = 20000
# Length passed to pad_sequences as maxlen for every review.
MAX_SEQUENCE_LENGTH = 70

In [6]:
from nltk.tokenize import TweetTokenizer
# One shared tokenizer instance, reused by preprocess() on every call.
tweet_tokenizer = TweetTokenizer()

# Negative contractions mapped to apostrophe-free placeholders.
# preprocess() applies these as plain substring replacements on the tokenized text.
# NOTE(review): presumably this keeps each negation as a single distinctive token
# for the Keras tokenizer — confirm against the training notebook.
negatives = {
    "didn't": "didn_`_t",
    "couldn't": "couldn_`_t",
    "don't": "don_`_t",
    "wouldn't": "wouldn_`_t",
    "doesn't": "doesn_`_t",
    "wasn't": "wasn_`_t",
    "weren't": "weren_`_t",
    "shouldn't":"shouldn_`_t",
    "isn't": "isn_`_t",
    "aren't": "aren_`_t",
}

def preprocess(text):
    """Normalise a raw review string.

    Steps, in order: lower-case the text, replace '<br />' with a space,
    tokenize with ``tweet_tokenizer`` and re-join the tokens with single
    spaces, then substitute every key of ``negatives`` with its placeholder.

    Returns the normalised string.
    """
    lowered = text.lower().replace('<br />', ' ')
    tokens = tweet_tokenizer.tokenize(lowered)
    cleaned = ' '.join(tokens)
    for contraction in negatives:
        cleaned = cleaned.replace(contraction, negatives[contraction])
    return cleaned

In [7]:
# Adds a 'prep_text' column holding the preprocessed version of every review.
test_data['prep_text'] = test_data['text'].map(preprocess)

#### 1. String -> Int vector

In [8]:
 #load tokenizer
# The context manager closes the file even when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code from the file —
# only load tokenizer files that come from a trusted source.
with open(TOKENIZER_NAME, 'rb') as pkl_file:
    tokenizer = pickle.load(pkl_file)

#### 2. Padding

In [9]:
# Convert each preprocessed review into a list of integer word ids.
sequences_test = tokenizer.texts_to_sequences(test_data['prep_text'])

# Bring every sequence to MAX_SEQUENCE_LENGTH ids (pad_sequences defaults are used for padding/truncation side).
padded_sequences_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

In [10]:
# Quick look at the padded id matrix.
padded_sequences_test

array([[   11,     6,    87, ...,    55,    82,    17],
       [10361,   182,   175, ...,     4,     1,   320],
       [ 1375,     4,   492, ...,   149,   336,   556],
       ..., 
       [   44,    21,   201, ...,    13,   417,  5907],
       [  788, 19456,  4878, ...,     1,  6620,   181],
       [   15,    64,  3913, ...,   957,     4,  2144]], dtype=int32)

In [11]:
# (number of reviews, MAX_SEQUENCE_LENGTH)
padded_sequences_test.shape

(10000, 70)

In [12]:
# First 100 padded reviews.
padded_sequences_test[:100]

array([[   11,     6,    87, ...,    55,    82,    17],
       [10361,   182,   175, ...,     4,     1,   320],
       [ 1375,     4,   492, ...,   149,   336,   556],
       ..., 
       [   11,     6,  1561, ...,     3,  1785,  4416],
       [   38,     2,  4315, ...,    11,   338,   177],
       [ 1825,    19,    96, ...,     3,     5,  1507]], dtype=int32)

In [13]:
# Slicing above does not modify the array; the shape is unchanged.
padded_sequences_test.shape

(10000, 70)

In [14]:
# Full row of word ids for the first review.
padded_sequences_test[0]

array([   11,     6,    87,     8,     6,    19,     1,  2116,  5349,
        8931,    15,    44,    30,   953,    72,  9375,    97,    22,
          36,  8644,     3,    23, 12342,  1428,   812,    37,   225,
          26,    81,   129,  2892,    87,     5,   506,   102,    10,
          62,    25,  5862,    14,   159,   706,  1711,    98,    67,
          25,    52,    15,     1,   100,    26,     5,    25,  4387,
           3,    56,    24,   979,    11, 13373,     7,     9,    17,
           5,   285,     8,     2,    55,    82,    17], dtype=int32)

In [15]:
# Sanity check of the trick used by generate(): dropping one id (position 3)
# and prepending a 0 leaves the row length unchanged.
np.array([np.hstack(([0], padded_sequences_test[0][:3], padded_sequences_test[0][4:]))]).shape

(1, 70)

In [17]:
# Reverse lookup (integer id -> word) built from the tokenizer's word_index; used to decode sequences.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

## Model

In [18]:
# load json and create model
# The context manager closes the architecture file even when reading raises.
with open(MODEL_NAME + '.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights(MODEL_NAME + ".hdf5")
# NOTE(review): 'fmeasure', 'precision' and 'recall' are Keras 1.x metric names;
# they are not available in Keras 2 — confirm the pinned Keras version.
metrics = ['accuracy', 'fmeasure', 'precision', 'recall']
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
print("Loaded '%s' model from disk" % MODEL_NAME)

Loaded 'models/lstm_word2vec' model from disk


In [19]:
# Print the layer-by-layer architecture of the restored model.
loaded_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 70, 300)       6000000     embedding_input_1[0][0]          
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 256)           439296      embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 256)           0           bidirectional_1[0][0]            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 1)             257         dropout_1[0][0]                  
Total params: 6,439,553
Trainable params: 439,553
Non-trainable params: 6,000,000
_________

In [20]:
def padding(text):
    """Encode one already-preprocessed string as a padded batch of one sequence.

    The text is wrapped in a single-element list, converted to word ids with
    ``tokenizer`` and padded to ``MAX_SEQUENCE_LENGTH``.
    """
    encoded = tokenizer.texts_to_sequences([text])
    return pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH)

In [21]:
def lstm_predict(text):
    """Score a raw text with the loaded model.

    The text goes through ``preprocess`` and ``padding``; the first value of
    the first prediction row is returned.
    """
    batch = padding(preprocess(text))
    scores = loaded_model.predict(batch)
    return scores[0][0]

In [22]:
# Spot check on a short negated phrase.
lstm_predict("not bad")

0.65912336

In [23]:
# Spot check on a single positive word.
lstm_predict("good")

0.91656721

In [26]:
def generate(sequence):
    """Build every leave-one-out variant of ``sequence``.

    Variant ``i`` is the sequence with the element at position ``i`` removed
    and a ``0`` prepended, so each variant keeps the original length.

    Returns:
        ``np.array`` with one variant per position of ``sequence``.
    """
    variants = [
        np.hstack(([0], sequence[:pos], sequence[pos + 1:]))
        for pos in range(len(sequence))
    ]
    return np.array(variants)

In [28]:
def predict(sequences):
    """Run the loaded model on a batch and return the first value of each prediction row as a list."""
    raw_output = loaded_model.predict(sequences)
    return [row[0] for row in raw_output]

In [94]:
def compare(sequences, seq_preds, filter_spread=3):
    """Collect per-word score shifts obtained by removing words one at a time.

    For every sequence, each position is removed in turn (see ``generate``)
    and the variants are scored with ``predict``. A word is recorded only
    when its variant's prediction differs from the mean of that sequence's
    variant predictions by more than ``filter_spread`` standard deviations.

    Args:
        sequences: iterable of integer word-id sequences.
        seq_preds: predictions for the unmodified sequences, indexed like
            ``sequences``.
        filter_spread: number of standard deviations used as the threshold.

    Returns:
        dict mapping word id to a list of
        ``(seq_pred - variant_pred, sequence_index, seq_pred)`` tuples.
    """
    word_scores = {}
    for seq_idx, seq in enumerate(sequences):
        # One prediction per removed position.
        variant_preds = predict(generate(seq))
        base_score = seq_preds[seq_idx]

        centre = np.mean(variant_preds)
        threshold = np.std(variant_preds) * filter_spread

        for word, variant_pred in zip(seq, variant_preds):
            if abs(centre - variant_pred) > threshold:
                entry = (base_score - variant_pred, seq_idx, base_score)
                word_scores.setdefault(word, []).append(entry)

    return word_scores

In [53]:
def seq_to_words(sequence):
    """Decode a sequence of word ids into a space-separated string.

    Ids missing from ``index_to_word`` are rendered as "NA".
    """
    return " ".join(index_to_word.get(word_idx, "NA") for word_idx in sequence)

In [54]:
def inspect_sample(sample_id):
    """Break down the prediction for one row of ``padded_sequences_test``.

    Returns:
        ``(decoded_text, prediction, contributions)`` where ``contributions``
        holds one ``[word, prediction - prediction_without_that_position]``
        pair per position; ids missing from ``index_to_word`` appear as "NA".
    """
    sample = padded_sequences_test[sample_id]
    pred = predict(np.array([sample]))[0]
    # One prediction per removed position.
    variant_preds = predict(generate(sample))
    contributions = [
        [index_to_word.get(word_idx, "NA"), pred - variant_preds[pos]]
        for pos, word_idx in enumerate(sample)
    ]
    return seq_to_words(sample), pred, contributions

In [32]:
def inspect_text(text):
    """Break down the prediction for an arbitrary raw text.

    The text is run through ``preprocess`` and ``padding`` first.

    Returns:
        ``(decoded_text, prediction, contributions)`` where ``contributions``
        holds one ``[word, prediction - prediction_without_that_position]``
        pair per position; ids missing from ``index_to_word`` appear as "NA".
    """
    sample = padding(preprocess(text))[0]
    pred = predict(np.array([sample]))[0]
    # One prediction per removed position.
    variant_preds = predict(generate(sample))
    contributions = []
    for pos, word_idx in enumerate(sample):
        word = index_to_word.get(word_idx, "NA")
        contributions.append([word, pred - variant_preds[pos]])
    return seq_to_words(sample), pred, contributions

In [55]:
# Word-removal scores for the first 1000 test reviews; compare() makes one extra
# model call per review on top of the baseline predictions computed here.
word_scores = compare(padded_sequences_test[:1000], predict(padded_sequences_test[:1000]))

In [125]:
def evaluate_sequences(sequences, labels, alpha=0.1):
    """List misclassified sequences whose prediction lies within ``alpha`` of 0.5.

    A sequence is reported when its label is 1 and the prediction falls in
    ``(0.5 - alpha, 0.5)``, or its label is 0 and the prediction falls in
    ``(0.5, 0.5 + alpha)``. All bounds are exclusive.

    Args:
        sequences: batch passed to ``predict``.
        labels: true labels aligned with ``sequences`` by position (list,
            array or pandas Series).
        alpha: half-width of the band around 0.5.

    Returns:
        list of ``(position, prediction, label)`` tuples.
    """
    preds = predict(sequences)
    # A pandas Series resolves labels[i] by index label, not by position, so a
    # Series whose index does not start at 0 (e.g. a slice) would be misaligned
    # or raise KeyError. Use .iloc when it exists; plain sequences are unchanged.
    by_position = getattr(labels, "iloc", labels)
    result = []
    for i, p in enumerate(preds):
        if 0.5 - alpha < p < 0.5 and by_position[i] == 1:
            result.append((i, p, by_position[i]))
        elif 0.5 < p < 0.5 + alpha and by_position[i] == 0:
            result.append((i, p, by_position[i]))
    return result

In [128]:
def evaluate_sequences_hard(sequences, labels, alpha=0.1):
    """List misclassified sequences whose prediction is wrong by more than ``alpha``.

    A sequence is reported when its label is 1 and the prediction is below
    ``0.5 - alpha``, or its label is 0 and the prediction is above
    ``0.5 + alpha``.

    Args:
        sequences: batch passed to ``predict``.
        labels: true labels aligned with ``sequences`` by position (list,
            array or pandas Series).
        alpha: margin beyond 0.5 that a wrong prediction must exceed.

    Returns:
        list of ``(position, prediction, label)`` tuples.
    """
    preds = predict(sequences)
    # A pandas Series resolves labels[i] by index label, not by position, so a
    # Series whose index does not start at 0 (e.g. a slice) would be misaligned
    # or raise KeyError. Use .iloc when it exists; plain sequences are unchanged.
    by_position = getattr(labels, "iloc", labels)
    result = []
    for i, p in enumerate(preds):
        if p < 0.5 - alpha and by_position[i] == 1:
            result.append((i, p, by_position[i]))
        elif p > 0.5 + alpha and by_position[i] == 0:
            result.append((i, p, by_position[i]))
    return result

In [80]:
# Aggregate the per-word tuples from compare() into one summary record per word.
# Each tuple is (score shift, sample index, sample prediction); shift = prediction
# of the full sequence minus prediction with the word removed.
word_values = []
for word_index, scores in word_scores.items():
    # Tuples sort by their first element, i.e. by score shift.
    s = sorted(scores)
    lower = [m[0] for m in s if m[0] < 0]
    upper = [m[0] for m in s if m[0] > 0]
    # 'min' / 'max' are medians of the negative / positive shifts (0 when there are none).
    Min = np.median(lower) if len(lower)>0 else 0
    Max = np.median(upper) if len(upper)>0 else 0
    
    word_values.append({
        'word': index_to_word.get(word_index, 'NAN'),
        'min': Min,
        'max': Max,
        # Mean absolute shift over all recorded occurrences.
        'value': sum([abs(si[0]) for si in s])/len(s),
        'count': len(s),
        'pos_count': len([s0 for s0 in s if s0[0] > 0]),
        'neg_count': len([s0 for s0 in s if s0[0] < 0]),
        # Sample indices where the shift was negative / positive.
        'min_sample_id': [m[1] for m in s if m[0] < 0],
        'max_sample_id': [m[1] for m in s if m[0] > 0],
    })

In [81]:
# One row per word, one column per summary field.
rez = pd.DataFrame(word_values)

In [82]:
# (number of words that passed the filter in compare(), number of summary fields)
rez.shape

(609, 9)

In [83]:
# Summary row for a single word.
rez[rez['word'] == 'bad']

Unnamed: 0,count,max,max_sample_id,min,min_sample_id,neg_count,pos_count,value,word
72,36,0.0,[],-0.055068,"[0, 951, 112, 742, 510, 937, 929, 145, 13, 208, 693, 355, 30, 584, 923, 253, 475, 642, 591, 232, 507, 981, 316, 470, 321, 580, 351, 931, 690, 387, 967, 428, 947, 41, 233, 710]",36,0,0.137424,bad


In [85]:
# Words with more than two positive shifts, ranked by median positive shift (up to 60 rows).
# A positive shift means the prediction dropped when the word was removed.
rez[rez['pos_count']>2].sort_values(by=['max'], ascending=False)[:60]

Unnamed: 0,count,max,max_sample_id,min,min_sample_id,neg_count,pos_count,value,word
171,3,0.203154,"[222, 103, 806]",0.0,[],0,3,0.190295,bit
14,19,0.171466,"[420, 75, 647, 17, 794, 140, 604, 130, 469, 164, 607]",-0.292189,"[394, 737, 218, 106, 713, 604, 674, 740]",8,11,0.252036,but
244,9,0.164918,"[362, 429, 196, 382, 71, 635, 464, 201, 175]",0.0,[],0,9,0.161214,enjoy
92,5,0.148635,"[548, 2, 572]",-0.127269,"[227, 551]",2,3,0.123527,never
156,11,0.120921,"[446, 720, 915, 161, 757, 885, 70, 406, 383, 859]",-0.000186,[504],1,10,0.116365,fun
19,51,0.118277,"[557, 810, 459, 809, 373, 807, 140, 891, 475, 206]",-0.086237,"[670, 123, 425, 549, 718, 613, 468, 914, 40, 972, 765, 697, 788, 331, 604, 893, 347, 237, 235, 843, 345, 515, 725, 59, 212, 772, 185, 803, 337, 835, 589, 933, 551, 29, 29, 142, 215, 448, 950, 950, 710]",41,10,0.213606,not
349,5,0.115121,"[246, 587, 953]",-0.001138,"[365, 460]",2,3,0.09885,hilarious
354,5,0.096396,"[151, 196, 799, 520]",-0.002847,[159],1,4,0.099306,favorite
344,4,0.081552,"[416, 725, 940]",-0.000107,[504],1,3,0.154319,wonderfully
96,5,0.076714,"[450, 69, 760, 761, 96]",0.0,[],0,5,0.074538,love


In [91]:
# Words with more than two negative shifts, ranked by median negative shift, most negative first (up to 60 rows).
# A negative shift means the prediction rose when the word was removed.
rez[rez['neg_count']>2].sort_values(by=['min'], ascending=True)[:60]

Unnamed: 0,count,max,max_sample_id,min,min_sample_id,neg_count,pos_count,value,word
146,3,0.0,[],-0.378333,"[712, 879, 44]",3,0,0.371836,can't
129,6,0.0,[],-0.351159,"[627, 960, 175, 960, 234, 821]",6,0,0.281363,didn_`_t
542,3,0.0,[],-0.324771,"[485, 892, 820]",3,0,0.266586,none
258,4,0.0,[],-0.314914,"[3, 917, 447, 58]",4,0,0.316929,wasting
524,3,0.0,[],-0.306786,"[833, 544, 50]",3,0,0.231625,sucks
14,19,0.171466,"[420, 75, 647, 17, 794, 140, 604, 130, 469, 164, 607]",-0.292189,"[394, 737, 218, 106, 713, 604, 674, 740]",8,11,0.252036,but
431,3,0.0,[],-0.282267,"[463, 631, 569]",3,0,0.32809,annoying
249,3,0.0,[],-0.258586,"[561, 380, 472]",3,0,0.217326,would
245,4,0.002916,[567],-0.256481,"[937, 73, 110]",3,1,0.158667,poor
411,3,0.0,[],-0.235088,"[914, 672, 353]",3,0,0.272168,predictable


In [134]:
# Mistakes among the first 1000 test reviews that are on the wrong side of 0.5 by more than 0.15.
r = evaluate_sequences_hard(padded_sequences_test[:1000], test_data['label'][:1000], alpha=0.15)
len(r), r

(28,
 [(8, 0.71647459, 0),
  (222, 0.78776187, 0),
  (266, 0.89216167, 0),
  (303, 0.13460688, 1),
  (322, 0.77355868, 0),
  (347, 0.081467919, 1),
  (348, 0.93970382, 0),
  (377, 0.7288968, 0),
  (384, 0.81634504, 0),
  (466, 0.85424823, 0),
  (568, 0.66720605, 0),
  (623, 0.76346838, 0),
  (639, 0.75132632, 0),
  (665, 0.90969807, 0),
  (692, 0.91442961, 0),
  (699, 0.29114857, 1),
  (705, 0.90709269, 0),
  (711, 0.96479958, 0),
  (717, 0.71042073, 0),
  (740, 0.75055361, 0),
  (796, 0.72608805, 0),
  (842, 0.66433096, 0),
  (892, 0.26279339, 1),
  (907, 0.99109006, 0),
  (920, 0.84609711, 0),
  (934, 0.049543027, 1),
  (940, 0.76944184, 0),
  (973, 0.6587919, 0)])

In [120]:
# Per-word breakdown of sample 568 (label 0, predicted about 0.67 in the evaluate_sequences_hard output).
inspect_sample(568)

('very disappointed my judgment may seem harsh but i do think there is some hope for a strong national irish cinema in the near future and this simply does not back that argument as has become a recommended dublin film i was part of the audience at that screening most people would seem to be with me so that means you should probably make your own judgment of the film',
 0.66720611,
 [['very', -0.11497599],
  ['disappointed', -0.052729905],
  ['my', 0.013923943],
  ['judgment', -0.064111114],
  ['may', 0.14919835],
  ['seem', 0.12536585],
  ['harsh', 0.11860371],
  ['but', 0.058561504],
  ['i', 0.063515007],
  ['do', 0.021180034],
  ['think', 0.025861502],
  ['there', 0.02876848],
  ['is', 0.069408178],
  ['some', 0.031030178],
  ['hope', 0.07926172],
  ['for', 0.062202036],
  ['a', 0.072637081],
  ['strong', 0.048342168],
  ['national', 0.050710142],
  ['irish', 0.036400437],
  ['cinema', -0.0028896928],
  ['in', 0.046869338],
  ['the', 0.047996223],
  ['near', 0.029940724],
  ['future

In [123]:
# Text of sample 568 with its last three words ("of the film") left out, to see how the prediction changes.
inspect_text("very disappointed my judgment may seem harsh but i do think there is some hope for a strong national irish cinema in the near future and this simply does not back that argument as has become a recommended dublin film i was part of the audience at that screening most people would seem to be with me so that means you should probably make your own judgment")

('NA NA NA very disappointed my judgment may seem harsh but i do think there is some hope for a strong national irish cinema in the near future and this simply does not back that argument as has become a recommended dublin film i was part of the audience at that screening most people would seem to be with me so that means you should probably make your own judgment',
 0.39406794,
 [['NA', -1.4901161e-07],
  ['NA', -1.4901161e-07],
  ['NA', -1.4901161e-07],
  ['very', -0.1062963],
  ['disappointed', -0.40441883],
  ['my', -0.15496498],
  ['judgment', -0.18990052],
  ['may', -0.027907699],
  ['seem', -0.045362145],
  ['harsh', 0.02058807],
  ['but', 0.00843364],
  ['i', 0.012341768],
  ['do', -0.087359965],
  ['think', -0.075467855],
  ['there', -0.074970901],
  ['is', -0.044912547],
  ['some', -0.061132282],
  ['hope', -0.044748396],
  ['for', -0.037731797],
  ['a', -0.050005466],
  ['strong', -0.056401074],
  ['national', -0.073902398],
  ['irish', -0.064486951],
  ['cinema', -0.0886867

In [136]:
# With alpha=0.5 the band becomes (0, 0.5) for label 1 and (0.5, 1) for label 0,
# so this lists the misclassified reviews among the first 1000.
r = evaluate_sequences(padded_sequences_test[:1000], test_data['label'][:1000], alpha=0.5)
len(r), r

(58,
 [(8, 0.71647459, 0),
  (32, 0.47528979, 1),
  (45, 0.56734163, 0),
  (110, 0.3913663, 1),
  (124, 0.54713607, 0),
  (130, 0.63058877, 0),
  (161, 0.62640584, 0),
  (175, 0.53682005, 0),
  (222, 0.78776187, 0),
  (257, 0.42049652, 1),
  (260, 0.45505089, 1),
  (266, 0.89216167, 0),
  (282, 0.53731859, 0),
  (289, 0.38122183, 1),
  (303, 0.13460688, 1),
  (322, 0.77355868, 0),
  (325, 0.62232977, 0),
  (347, 0.081467919, 1),
  (348, 0.93970382, 0),
  (377, 0.7288968, 0),
  (384, 0.81634504, 0),
  (399, 0.52171767, 0),
  (412, 0.51908404, 0),
  (445, 0.56540525, 0),
  (466, 0.85424823, 0),
  (469, 0.602835, 0),
  (487, 0.59618062, 0),
  (543, 0.62423217, 0),
  (568, 0.66720605, 0),
  (581, 0.64300448, 0),
  (596, 0.58461225, 0),
  (623, 0.76346838, 0),
  (633, 0.38940978, 1),
  (639, 0.75132632, 0),
  (665, 0.90969807, 0),
  (692, 0.91442961, 0),
  (699, 0.29114857, 1),
  (705, 0.90709269, 0),
  (711, 0.96479958, 0),
  (717, 0.71042073, 0),
  (740, 0.75055361, 0),
  (764, 0.40772656

In [132]:
# Per-word breakdown of sample 32 (label 1, predicted about 0.475 in the evaluate_sequences output).
inspect_sample(32)

('2 what do all of these films have in common with titanic ? all of the of their 1 ratings are lower ! ! ! than titanic and none of these stinkers ever was nominated for a single award again titanic got 10 7 1 ratings ! compare that to the other 5 movies i just mentioned can there be any explanation other than the hatred of leo factor ?',
 0.47529,
 [['2', 0.10618609],
  ['what', -0.00014141202],
  ['do', 0.062810779],
  ['all', -0.007309258],
  ['of', -0.0037118495],
  ['these', 0.026364297],
  ['films', 0.097919703],
  ['have', 0.014360547],
  ['in', 0.055383056],
  ['common', 0.048955709],
  ['with', 0.071211815],
  ['titanic', 0.088108033],
  ['?', 0.058108628],
  ['all', 0.08745271],
  ['of', 0.06823808],
  ['the', 0.058000803],
  ['of', 0.064507991],
  ['their', 0.036754549],
  ['1', -0.088386834],
  ['ratings', 0.0068725944],
  ['are', 0.078748226],
  ['lower', -0.092772186],
  ['!', 0.069840401],
  ['!', 0.069840401],
  ['!', 0.069840401],
  ['than', 0.072713554],
  ['titanic',

In [137]:
# Same check as for the first 1000 reviews, now over the whole test set.
r = evaluate_sequences(padded_sequences_test, test_data['label'], alpha=0.5)
len(r), r

(598,
 [(8, 0.71647459, 0),
  (32, 0.47528979, 1),
  (45, 0.56734163, 0),
  (110, 0.3913663, 1),
  (124, 0.54713607, 0),
  (130, 0.63058877, 0),
  (161, 0.62640584, 0),
  (175, 0.53682005, 0),
  (222, 0.78776187, 0),
  (257, 0.42049652, 1),
  (260, 0.45505089, 1),
  (266, 0.89216167, 0),
  (282, 0.53731859, 0),
  (289, 0.38122183, 1),
  (303, 0.13460688, 1),
  (322, 0.77355868, 0),
  (325, 0.62232977, 0),
  (347, 0.081467919, 1),
  (348, 0.93970382, 0),
  (377, 0.7288968, 0),
  (384, 0.81634504, 0),
  (399, 0.52171767, 0),
  (412, 0.51908404, 0),
  (445, 0.56540525, 0),
  (466, 0.85424823, 0),
  (469, 0.602835, 0),
  (487, 0.59618062, 0),
  (543, 0.62423217, 0),
  (568, 0.66720605, 0),
  (581, 0.64300448, 0),
  (596, 0.58461225, 0),
  (623, 0.76346838, 0),
  (633, 0.38940978, 1),
  (639, 0.75132632, 0),
  (665, 0.90969807, 0),
  (692, 0.91442961, 0),
  (699, 0.29114857, 1),
  (705, 0.90709269, 0),
  (711, 0.96479958, 0),
  (717, 0.71042073, 0),
  (740, 0.75055361, 0),
  (764, 0.4077265