In [1]:
import config
from model.cnn_document_model import DocumentModel
from preprocessing.utils import Preprocess, remove_empty_docs
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize                                     

import keras.backend as K

## Load pre-trained IMDB model and data

In [None]:
# Before running the code below, run these shell commands:
#   mkdir word2vec
#   cp imdb_word2vec.txt word2vec/word2vec_50_imdb.txt
#   python imdb_model.py

In [2]:
# Restore the pre-trained IMDB document model: the architecture comes from the
# JSON file and the learned weights from the matching HDF5 file.
_imdb_checkpoint = config.MODEL_DIR + '/imdb/model_04'
imdb_model = DocumentModel.load_model(_imdb_checkpoint + '.json')
imdb_model.load_model_weights(_imdb_checkpoint + '.hdf5')

# Take the classification model and compile it; later cells read
# `model.total_loss`, `model.targets` and `model.optimizer` from it.
model = imdb_model.get_classification_model()
model.compile(
    loss="binary_crossentropy",
    optimizer='rmsprop',
    metrics=["accuracy"],
)

Vocab Size = 33640  and the index of vocabulary words passed has 33638 words
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Tensor("final/Sigmoid:0", shape=(?, 1), dtype=float32)


In [6]:
# Show the compiled model's target (label) placeholders; the first entry is
# fed with the inverted pseudo-labels when gradients are computed further down.
model.targets

[<tf.Tensor 'final_target:0' shape=(?, ?) dtype=float32>]

In [7]:
import config
from dataloader.loader import Loader

# Read the IMDB training split and report its (rows, columns) shape.
train_df = Loader.load_imdb_data(directory='train')
print(train_df.shape)

# Extract reviews and sentiment labels as plain lists and drop empty documents;
# remove_empty_docs returns the filtered corpus together with its labels.
corpus, target = remove_empty_docs(
    train_df['review'].tolist(),
    train_df['sentiment'].tolist(),
)
print(len(corpus))


(25000, 2)
25000


## Preprocess input and compute document embeddings

In [8]:
# Number of sentence slots per document used by the preprocessor
# (presumably 20 sentences x 30 tokens = the 600 columns of x_train -- TODO confirm).
Preprocess.NUM_SENTENCES = 20

# Fit the preprocessor on the corpus that already had empty documents removed.
# The return value of fit() is not needed: the sequences are produced by
# transform() below.
preprocessor = Preprocess(corpus=corpus)
preprocessor.fit()

# Rebuild corpus/target from the raw frame, applying the SAME empty-document
# filter that was used for fitting. Reloading without the filter would silently
# bring empty reviews back into the training arrays.
corpus, target = remove_empty_docs(train_df['review'].tolist(),
                                   train_df['sentiment'].tolist())
corpus_to_seq = preprocessor.transform(corpus)

x_train = np.array(corpus_to_seq)
y_train = np.array(target)

# Documents and labels must stay aligned row by row for evaluation and for the
# per-document gradient lookup later in the notebook.
assert len(x_train) == len(y_train), "documents and labels are misaligned"

print(x_train.shape, y_train.shape)

Found 28654 unique tokens.
All documents processed.(25000, 600) (25000,)


In [9]:
# Score the restored model on the training data; evaluate() returns the loss
# followed by the compiled metrics (accuracy).
# NOTE(review): the recorded accuracy (~0.51) is close to chance for a binary
# task -- verify that this preprocessing matches the one used for training.
print('Evaluating Model ...')
train_metrics = model.evaluate(x_train, y_train)
print(train_metrics)

# Predicted probability of the positive class for every document.
preds = model.predict(x_train)

# Pseudo-label = inverted prediction (1 - p); used as the target when
# computing gradients w.r.t. the sentence embeddings.
pseudo_label = 1 - preds

Evaluating Model ...
[0.9458074434661865, 0.50696]


## Gradient of the loss on inverted outputs w.r.t. sentence embeddings

In [10]:
# Learned sentence embeddings for every training document.
sentence_ebd = imdb_model.get_sentence_model().predict(x_train)

# Placeholders that must be fed on every gradient evaluation, in this order:
#   0. input data, 1. per-sample loss weight, 2. labels.
input_tensors = [
    tensor_list[0]
    for tensor_list in (model.inputs, model.sample_weights, model.targets)
]

# Output tensor(s) of the sentence-embedding sub-model. Despite the name these
# are the activations at the sentence-embedding layer, not trainable weights.
weights = imdb_model.get_sentence_model().outputs

# Symbolic gradient of the total model loss w.r.t. the sentence embeddings,
# wrapped in a callable that maps fed inputs to numeric gradient arrays.
gradients = model.optimizer.get_gradients(model.total_loss, weights)
get_gradients = K.function(input_tensors, gradients)

Instructions for updating:
Use tf.cast instead.


In [27]:
# Index of the training document to summarise and how many sentences to keep.
document_number = 10
num_summary_sentences = 4

K.set_learning_phase(0)  # 0 = test/inference phase

# Single-document batch, fed in the same order as `input_tensors`.
inputs = [np.asarray([x_train[document_number]]), # X
          np.asarray([1]), # sample weights
          np.asarray([[pseudo_label[document_number][0]]]), # y (inverted prediction)
]
grad = get_gradients(inputs)

doc = corpus[document_number]
label = y_train[document_number]
prediction = preds[document_number]
print(doc, label , prediction)

sentences = sent_tokenize(doc)

# grad[0][0] is the gradient for the single document in the batch, indexed by
# sentence position (assumes row i corresponds to the i-th sent_tokenize
# sentence -- TODO confirm against Preprocess). A document with more sentences
# than gradient rows would raise IndexError, so only sentences that actually
# have a row are scored.
sentence_grads = grad[0][0]
num_scored = min(len(sentences), len(sentence_grads))

# Score = negated L2 norm of the sentence's gradient, so an ascending sort puts
# the sentences with the largest gradients first. (An alternative score is
# -|dot(gradient, sentence embedding)| using sentence_ebd[document_number][i].)
sent_score = [(i, -np.linalg.norm(sentence_grads[i])) for i in range(num_scored)]
sent_score.sort(key=lambda tup: tup[1])
summary_sentences = [i for i, _ in sent_score[:num_summary_sentences]]

for i in summary_sentences:
    print(i, sentences[i])
    


Being half-portuguese doesn't render me half-blind (nor half-prejudiced) when discussing portuguese films. Not that I get to do that very often anyway. But this film was such a rush of adrenaline! Yes, that's right - it was mostly accurate as far as history went/goes - but it pulled no punches on venturing beyond usual portuguese-film territory: things like using real locations in the middle of traffic-congested Lisbon and recruiting a real crowd to stand in for the real crowd of almost 30 years ago. And by God did they get it right! OK, to sum it up: very emotional if you've lived through it, but you'll spot minor improvements that could have been made as well as plot necessities that were. If you're just watching it randomly, you're in for a good historical romp, only of the very recent History kind and a bit more thought-proving than usual. Even by European standards, yes. 1 [0.32008418]
7 Even by European standards, yes.
1 Not that I get to do that very often anyway.
4 And by God d