In [4]:
import os

In [24]:
DATA_DIR = os.environ.get("DATA_DIR", '.')

BATCH_SIZE=256
MAX_SEQUENCE_LENGTH=220


In [2]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model, Sequential, load_model
from keras.layers import Input, Flatten
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM, Bidirectional

from keras.utils.np_utils import to_categorical
from keras.callbacks import TensorBoard

import six.moves.cPickle

### load data

In [122]:
def load_data(path, base_dir=DATA_DIR, HEADER=True):
    X = []
    y = []
    INPUT_PATH = os.path.join(base_dir, path)
    with open(INPUT_PATH, "r") as f:
        if HEADER:
            header = next(f)

        for line in f:
            line = line.strip()

            if (line[0] not in ["0", "1"]) or (line[1] != "|"):
                continue

            y.append(int(line[0]))
            X.append(line[2:])
    return X,y

In [123]:
X,y = load_data("test.csv")
X[:3], y[:3]

(["it's so laddish and juvenile , only teenage boys could possibly find it funny .",
  'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable .',
  '[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation .'],
 [0, 0, 0])

### load tokenizer

In [27]:
tokenizer = six.moves.cPickle.load(open("tokenizer.pkl", "rb"))
index_word = {v:k for k,v in tokenizer.word_index.items()}

In [31]:
tokenizer.word_index["test"], index_word[2296]

(2296, 'test')

In [42]:
seqs = tokenizer.texts_to_sequences([
    "the movie is good", "movie is not so bad",
    "movie is bad", "movie is not quite good"])
seqs

[[1, 18, 6, 50], [18, 6, 21, 34, 80], [18, 6, 80], [18, 6, 21, 179, 50]]

In [85]:
X_seqs = tokenizer.texts_to_sequences(X)
X_seqs[:1]

[[38, 34, 53148, 3, 3662, 61, 1588, 897, 98, 918, 167, 9, 137]]

In [111]:
X_padseqs = pad_sequences(X_seqs, maxlen=MAX_SEQUENCE_LENGTH)
X_padseqs[0][-20:]

array([    0,     0,     0,     0,     0,     0,     0,    38,    34,
       53148,     3,  3662,    61,  1588,   897,    98,   918,   167,
           9,   137], dtype=int32)

### load model

In [29]:
model = load_model("model.h5")

In [142]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 220, 100)      12858600    embedding_input_1[0][0]          
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 220, 128)      84480       embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 220, 128)      0           bidirectional_1[0][0]            
____________________________________________________________________________________________________
bidirectional_2 (Bidirectional)  (None, 220, 64)       41216       dropout_1[0][0]                  
___________________________________________________________________________________________

In [43]:
test0 = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
test0

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [44]:
model.predict(test0)

array([[ 0.09581654,  0.90418345],
       [ 0.64533126,  0.35466877],
       [ 0.94942302,  0.05057696],
       [ 0.73922586,  0.26077408]], dtype=float32)

In [76]:
def model_predict_text(text):
    seqs = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    return model.predict(padded)[0][1]

In [117]:
(model_predict_text("sometimes awful, but overall fun"),
 model_predict_text("sometimes awful, but overall really fun"),
 model_predict_text("sometimes awful, but overall fun. And another meaningless remark."),
 model_predict_text("explicit"),
 model_predict_text("fun, but i won't watch it again"),
 model_predict_text("it's so laddish and juvenile only teenage boys could possibly find it funny"))

(0.9761017, 0.98513722, 0.57402283, 0.46442923, 0.51804101, 0.14138263)

### evaluate model

In [57]:
def seq_to_text(sequence):
    result = []
    for word_idx in sequence:
        word = index_word.get(word_idx, "NA")
        result.append(word)
    return " ".join(result)

In [78]:
def generate(sequence):
    rez = []
    for i in range(len(sequence)):
        rez.append(np.hstack(([0], sequence[:i], sequence[i+1:])))
    return np.array(rez)

In [118]:
def predict(sequences):
    return [p[1] for p in model.predict(sequences)]

In [115]:
def inspect_sample(sample_id, seqs=X_padseqs):
    sample = seqs[sample_id]
    pred = predict(np.array([sample]))[0]
    hacked = generate(sample)
    hacked_preds = predict(hacked)
    result = []
    for i, word_idx in enumerate(sample):
        result.append([index_word.get(word_idx, "NA"), pred - hacked_preds[i]])
    return seq_to_words(sample), pred , result

In [119]:
inspect_sample(0)

("NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA it's so laddish and juvenile only teenage boys could possibly find it funny",
 0.14138263,
 [['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
  ['NA', 0.0],
 

In [138]:
def evaluate_sequences(sequences, labels, alpha=0.1):
    preds = predict(sequences)
    true_pos = []
    true_neg = []
    false_pos = []
    false_neg = []
    for i,p in enumerate(preds):
        if 0.5 - alpha < p and p < 0.5 and labels[i] == 1:
            false_neg.append((i, p, labels[i]))
        elif 0.5 < p and p < 0.5 + alpha and labels[i] == 0:
            false_pos.append((i,p, labels[i]))
        elif labels[i] == 1:
            true_pos.append((i, p, labels[i]))
        else: #labels[i] == 0:
            true_neg.append((i, p, labels[i]))
    return true_pos, true_neg, false_pos, false_neg,

In [145]:
TP, TN, FP, FN = evaluate_sequences(X_padseqs, y, alpha=0.5)
[len(e) for e in (TP, TN, FP, FN)]

[4100, 4674, 656, 1230]

In [152]:
tp, tn, fp, fn = [len(e) for e in (TP, TN, FP, FN)]
acc = 1-(fp+fn)/(tp+tn)
p = tp/(tp+fp)
r = tp/(tp+fn)
((tp,tn,fp,fn),
 acc,
 p,
 r,
 2*p*r/(p+r))

((4674, 4100, 1230, 656),
 0.7850467289719626,
 0.7916666666666666,
 0.8769230769230769,
 0.832116788321168)

In [153]:
TN

[(0, 0.14138263, 0),
 (1, 0.0086632399, 0),
 (2, 0.10219195, 0),
 (3, 0.045799177, 0),
 (4, 0.053917941, 0),
 (5, 0.069409572, 0),
 (6, 0.096489474, 0),
 (7, 0.018235974, 0),
 (8, 0.22190815, 0),
 (9, 0.11412181, 0),
 (10, 0.20406656, 0),
 (11, 0.25564536, 0),
 (12, 0.29242945, 0),
 (13, 0.24344036, 0),
 (15, 0.26174015, 0),
 (17, 0.32645258, 0),
 (18, 0.089843199, 0),
 (20, 0.083085127, 0),
 (24, 0.48626477, 0),
 (25, 0.41788104, 0),
 (26, 0.49926519, 0),
 (27, 0.2134866, 0),
 (28, 0.0083200103, 0),
 (29, 0.36634442, 0),
 (30, 0.025998404, 0),
 (31, 0.29044718, 0),
 (32, 0.1152151, 0),
 (33, 0.008281433, 0),
 (35, 0.41970053, 0),
 (36, 0.14819711, 0),
 (37, 0.29928172, 0),
 (39, 0.21745844, 0),
 (40, 0.12888081, 0),
 (41, 0.25284359, 0),
 (43, 0.018773142, 0),
 (44, 0.02516433, 0),
 (45, 0.30432999, 0),
 (46, 0.45343542, 0),
 (47, 0.10790829, 0),
 (48, 0.29854131, 0),
 (49, 0.2500253, 0),
 (51, 0.24769928, 0),
 (52, 0.27798504, 0),
 (53, 0.31456733, 0),
 (54, 0.0082770837, 0),
 (55, 0