In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Flatten
from keras.layers import Dense, Embedding, LSTM, Dropout, Activation
from keras import backend as K
from deepexplain.tensorflow import DeepExplain
import pandas as pd
from keras.models import load_model
from keras.models import model_from_json

# Files used by save_lstm_model / load_lstm_model for the serialized model.
# NOTE(review): the names say "lstm", but the model summary recorded in this
# notebook lists Embedding -> Flatten -> Dense layers (no LSTM) — consider renaming.
# Architecture as JSON:
LSTM_MODEL_JSON = '../saved_model/model_lstm.json'
# Weights as HDF5:
LSTM_MODEL_WEIGHTS = '../saved_model/model_lstm.h5'
# Not referenced by any of the visible cells:
HISTORY_FILE = '../saved_model/history_lstm.json'

def save_lstm_model(model):
    """Persist a Keras model as two files: architecture and weights.

    The architecture (``model.to_json()``) is written as text to
    ``LSTM_MODEL_JSON``; the weights are written to ``LSTM_MODEL_WEIGHTS``
    via ``model.save_weights``.
    """
    # serialize the architecture to JSON
    architecture = model.to_json()
    with open(LSTM_MODEL_JSON, 'w') as out_file:
        out_file.write(architecture)
    # serialize weights to HDF5
    model.save_weights(LSTM_MODEL_WEIGHTS)

def load_lstm_model(model=None):
    """Rebuild the saved model from its JSON architecture and HDF5 weights.

    Args:
        model: Unused. Kept (now optional) so existing calls of the form
            ``load_lstm_model(model)`` keep working.

    Returns:
        The model reconstructed from ``LSTM_MODEL_JSON`` with the weights
        from ``LSTM_MODEL_WEIGHTS`` loaded. No ``compile`` call is made here,
        so compile the result before using ``evaluate`` or ``fit``.
    """
    # model_from_json expects the JSON *text*, not a file path, so the
    # architecture file has to be read first.
    with open(LSTM_MODEL_JSON, 'r') as jsonfile:
        model_json = jsonfile.read()
    loaded_model = model_from_json(model_json)
    # load weights into the new model
    loaded_model.load_weights(LSTM_MODEL_WEIGHTS)
    return loaded_model

Using TensorFlow backend.


In [2]:
# Load the dataset; later cells read its 'clean_sentiment' and 'sentiment' columns.
df = pd.read_csv('../data/final_data_less.csv')

In [3]:
# Fit the Keras tokenizer on the 'clean_sentiment' column to build its word index.
t=Tokenizer()
t.fit_on_texts(df['clean_sentiment'])

In [4]:
# +1 because Keras Tokenizer word indices start at 1; index 0 is reserved
# (it is the value pad_sequences fills with by default).
vocab_size = len(t.word_index)+1
# Convert every text into a list of integer word indices.
encoded_docs = t.texts_to_sequences(df['clean_sentiment'])
print(vocab_size)

7051


In [5]:
# One-hot encode the 'sentiment' column: one indicator column per distinct value.
labels = pd.get_dummies(df['sentiment'].values)
print(labels.shape)

(1000, 2)


In [6]:
# Length of the longest encoded document; used as the padded sequence length.
max_length = max(map(len, encoded_docs))
print(max_length)

580


In [7]:
# Pad every sequence to max_length; padding='post' puts the padding after the tokens.
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [8]:
# Positional train/test split: the first 80% of rows train, the rest test.
# NOTE(review): no shuffling happens before the split, so the result depends
# on the row order of the CSV — confirm the file is not sorted by label.
split_fraction = 0.8
split_idx = int(len(padded_docs)*split_fraction)
print(split_idx)

X_train, X_test = padded_docs[:split_idx], padded_docs[split_idx:]
y_train, y_test = labels[:split_idx], labels[split_idx:]

print("Training Shape: ", X_train.shape, "== Train Lables: ", y_train.shape)
print("Test Shape: ", X_test.shape, "== Test Lables: ", y_test.shape)

800
Training Shape:  (800, 580) == Train Lables:  (800, 2)
Test Shape:  (200, 580) == Test Lables:  (200, 2)


In [9]:
# Session of the Keras backend; passed to DeepExplain and used for session.run below.
current_session = K.get_session()

In [10]:
# Build, train and explain the classifier. Everything is done inside the
# DeepExplain context so the model's graph is created while `de` is active.
with DeepExplain(session=current_session) as de:  # <-- init DeepExplain context
    model = Sequential()
    # 128-dimensional embedding for each of the max_length token positions.
    model.add(Embedding(vocab_size,128,input_length=max_length))
    model.add(Flatten());
    model.add(Dense(100, activation='relu')); # input_shape=(max_words,)
    model.add(Dropout(0.5));
    # The softmax is added as a separate Activation layer so that the
    # pre-softmax output is available as model.layers[-2].output below.
    model.add(Dense(2, activation='linear'));
#     model.add(Dense(4, activation='linear'));
    model.add(Activation('softmax'));
    model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy']);
    print(model.summary());
    # NOTE(review): the test split is also used as validation data here.
    model.fit(X_train, y_train,
          batch_size=10,
          epochs=5,
          validation_data=(X_test, y_test),
          verbose=1,
          shuffle=True);

    # predict on test data (y_pred is not used by the visible cells)
    y_pred = model.predict(np.array(X_test));
    # Rebind y_test to a NumPy array; it is multiplied with a tensor below.
    y_test = np.array(y_test);
    
    # Evaluate the embedding tensor on the model input (in other words, perform the lookup)
    embedding_tensor = model.layers[0].output
    input_tensor = model.inputs[0]
    embedding_out = current_session.run(embedding_tensor, {input_tensor: X_test});

    # xs is not used by the visible cells; ys holds the one-hot test labels.
    xs = X_test;
    ys = y_test;
    # Run DeepExplain with the embedding as input:
    #   method  'elrp'
    #   target  pre-softmax output (layers[-2]) multiplied by the one-hot labels
    #   input   model.layers[1].input, i.e. the tensor fed into the Flatten layer
    #   samples the embedding values computed above for X_test
    # NOTE(review): attributions is presumably shaped like embedding_out
    # (samples, max_length, 128) — confirm from the printed shape.
    attributions = de.explain('elrp', model.layers[-2].output * ys, model.layers[1].input, embedding_out);
    print("attributions shape --- {}".format(attributions.shape));

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 580, 128)          902528    
_________________________________________________________________
flatten_1 (Flatten)          (None, 74240)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               7424100   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 202       
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
Total params: 8,326,830
Trainable params: 8,326,830
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Write the trained model's architecture and weights to the LSTM_MODEL_* paths.
save_lstm_model(model)
# model = load_lstm_model(model)

In [12]:
# Final evaluation of the model
# compile() was given metrics=['accuracy'], so scores[1] is read as accuracy here.
# NOTE(review): X_test/y_test were also the validation_data during fit.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 77.50%


In [14]:
# Cache the attributions on disk so they can be reloaded with np.load('att.npy').
# (The former bare `attributions.shape` line was dropped: it was not the last
# expression of the cell, so its value was never displayed.)
np.save('att.npy', attributions)

In [15]:
# Reload the saved attributions; find_score reads this array through the global `b`.
b = np.load('att.npy')

In [50]:
SEQUENCE_TEST = '../data/test_less.csv'
def get_test_sentence(sent_idx, path=None):
    """Look up one test-set sentence and its label, printing both.

    Args:
        sent_idx: Row label of the sentence in the test CSV. With the default
            index pandas assigns on ``read_csv`` this is the 0-based row
            number, so valid values are ``0 .. number_of_rows - 1``.
        path: CSV file to read; ``None`` (default) means ``SEQUENCE_TEST``.
            The file must have ``clean_sentiment`` and ``sentiment`` columns.

    Returns:
        A ``(clean_words, sentiment)`` tuple: the whitespace-split words of
        the row's ``clean_sentiment`` text and its ``sentiment`` value.

    Raises:
        KeyError: if ``sent_idx`` is not a row label of the CSV.
    """
    _df = pd.read_csv(SEQUENCE_TEST if path is None else path)
    print("Shape: ",_df.shape)
    sentiment = _df['sentiment'][sent_idx]
    sentence_ = _df['clean_sentiment'][sent_idx]
    print("Clean Texts:\n")
    print(sentence_)
    print("Sentiment: ", sentiment)
    return sentence_.split(), sentiment

def find_score(sent_idx):
    """Pair each word of a test sentence with its summed attribution.

    For word position ``i`` the score is ``b[sent_idx][i].sum()``, where ``b``
    is the module-level attribution array. Every word is printed next to its
    score.

    Returns:
        ``(sent_words, scores)``: the word list from ``get_test_sentence``
        and a NumPy array holding one score per word.
    """
    sent_words, _ = get_test_sentence(sent_idx)
    collected = []
    for position, token in enumerate(sent_words):
        # Sum the attribution values stored for this word position.
        token_score = b[sent_idx][position].sum()
        print(token, ": ", token_score)
        collected.append(token_score)
    return sent_words, np.array(collected)
    

In [51]:
# Preview test sentence 40 and its label (get_test_sentence prints both).
sent_words, sent_sentiment = get_test_sentence(40)

Shape:  (200, 3)
Clean Texts:

got 3m old son although seem fit body fine wasnt able sit without assistance thought 5th month probably wont fit body practical also long height seem like tip easily even straps didnt feel comfortable using without supervises would recommend
Sentiment:  0


In [52]:
from util.heatmap import html_heatmap

import codecs
import numpy as np
from IPython.display import display, HTML

In [53]:
# Words of test sentence 40 together with one summed attribution score per word.
words_, scores_ = find_score(40)

Shape:  (200, 3)
Clean Texts:

got 3m old son although seem fit body fine wasnt able sit without assistance thought 5th month probably wont fit body practical also long height seem like tip easily even straps didnt feel comfortable using without supervises would recommend
Sentiment:  0
got :  -0.05375789
3m :  0.037802503
old :  -0.24430352
son :  -0.29903167
although :  0.080530465
seem :  0.00020494871
fit :  -0.05892004
body :  -0.047177978
fine :  0.15070361
wasnt :  -0.021080611
able :  -0.1420303
sit :  0.07942906
without :  -0.3722966
assistance :  0.040475845
thought :  0.26360533
5th :  0.0012445571
month :  0.043012396
probably :  0.07722144
wont :  0.05125931
fit :  0.12899534
body :  -0.072885126
practical :  0.06041016
also :  -0.087189354
long :  -0.02296719
height :  -0.1245781
seem :  -0.14926124
like :  -0.05637505
tip :  -0.11973221
easily :  -0.010592928
even :  -0.040554233
straps :  0.026007628
didnt :  -0.0029238444
feel :  -0.0011834865
comfortable :  -0.0707767


In [56]:
# Display the raw score array returned by find_score (one entry per word).
scores_

array([-5.3757891e-02,  3.7802503e-02, -2.4430352e-01, -2.9903167e-01,
        8.0530465e-02,  2.0494871e-04, -5.8920041e-02, -4.7177978e-02,
        1.5070361e-01, -2.1080611e-02, -1.4203030e-01,  7.9429060e-02,
       -3.7229660e-01,  4.0475845e-02,  2.6360533e-01,  1.2445571e-03,
        4.3012396e-02,  7.7221438e-02,  5.1259309e-02,  1.2899534e-01,
       -7.2885126e-02,  6.0410161e-02, -8.7189354e-02, -2.2967190e-02,
       -1.2457810e-01, -1.4926124e-01, -5.6375049e-02, -1.1973221e-01,
       -1.0592928e-02, -4.0554233e-02,  2.6007628e-02, -2.9238444e-03,
       -1.1834865e-03, -7.0776701e-02,  3.2041881e-02, -3.3376150e-02,
       -6.3577071e-03,  9.4616964e-02,  6.0885005e-02], dtype=float32)

In [58]:
# Render the HTML produced by html_heatmap for the words and their scores.
# NOTE(review): presumably colours each word by its score — see util.heatmap.
display(HTML(html_heatmap(words_, scores_)))