In [1]:
from __future__ import print_function
import tensorflow as tf
import numpy as np
import os
from tensorflow.contrib.rnn import  GRUCell
from tensorflow.python.ops.rnn import dynamic_rnn as rnn
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
from preprocess_data import batch_generator


def get_len(dataset):
    """Compute the unpadded length of every sentence in a batch of documents.

    Args:
        dataset: 3-D array unpacked as (B, R, C); rows along the last axis
            are token-id sequences.

    Returns:
        np.ndarray of shape (R, B). Entry [r, b] is the index of the first 0
        in dataset[b, r, :] when that row ends with 0, otherwise C.
    """
    _, num_rows, max_len = dataset.shape

    def _row_length(row):
        # A trailing 0 marks the row as padded; the first 0 is then taken as
        # the end of the real tokens.
        tokens = list(row)
        return tokens.index(0) if tokens[-1] == 0 else max_len

    return np.array([[_row_length(row) for row in dataset[:, r, :]]
                     for r in range(num_rows)])

def get_len_(dataset):
    """Return a (B, C) array in which every entry equals the last-axis size H.

    Args:
        dataset: 3-D array whose shape is unpacked as (B, C, H).
    """
    dim0, dim1, dim2 = dataset.shape
    return np.full(shape=(dim0, dim1), fill_value=dim2)
# Hyper-parameters shared by the batch generator and AttentionModel.
params = {"NUM_WORDS": 10000,
          "INDEX_FROM": 2,
          "SEQUENCE_LENGTH": 8,
          "NUM_SENTENCE": 10,
          "EMBEDDING_DIM": 10,
          "HIDDEN_SIZE": 15,
          "ATTENTION_SIZE": 5,
          "KEEP_PROB": 0.8,
          "BATCH_SIZE": 25,
          "NUM_EPOCHS": 10,
          "DELTA": 0.5,
          "VOCABULARY_SIZE": 10000}
# Batches are fed into a placeholder of shape
# [BATCH_SIZE, NUM_SENTENCE, SEQUENCE_LENGTH], so `row` has to be the number of
# sentences per document, not the epoch count (the two only happened to be
# equal). NOTE(review): batch_generator lives in preprocess_data and is not
# visible here -- confirm that `row`/`column` map to the 2nd/3rd batch axes.
data = batch_generator(batch_size=params["BATCH_SIZE"], vocabulary_size=params["VOCABULARY_SIZE"],
                       row=params["NUM_SENTENCE"], column=params["SEQUENCE_LENGTH"])

In [2]:
class AttentionModel:
    """Two-level attention classifier built with TensorFlow 1.x graph ops.

    Each sentence is embedded, run through a bidirectional GRU and pooled with
    attention into a sentence vector; the sentence vectors are run through a
    second bidirectional GRU and pooled with attention into a document vector,
    which feeds a fully connected output layer.
    """

    def __init__(self, params):
        """Store hyper-parameters and create the input placeholders.

        Args:
            params: dict holding the keys read below. "NUM_CLASSES" is
                optional and defaults to 5.
        """
        self.NUM_WORDS = params["NUM_WORDS"]
        self.INDEX_FROM = params["INDEX_FROM"]
        self.SEQUENCE_LENGTH = params["SEQUENCE_LENGTH"]
        self.NUM_SENTENCE = params["NUM_SENTENCE"]
        self.EMBEDDING_DIM = params["EMBEDDING_DIM"]
        self.HIDDEN_SIZE = params["HIDDEN_SIZE"]
        self.ATTENTION_SIZE = params["ATTENTION_SIZE"]
        self.KEEP_PROB = params["KEEP_PROB"]
        self.BATCH_SIZE = params["BATCH_SIZE"]
        self.NUM_EPOCHS = params["NUM_EPOCHS"]
        self.DELTA = params["DELTA"]
        self.VOCABULARY_SIZE = params["VOCABULARY_SIZE"]
        # Width of the output layer; previously a literal 5 repeated in optimize().
        self.NUM_CLASSES = params.get("NUM_CLASSES", 5)
        self.input_words = tf.placeholder(tf.int32, [self.BATCH_SIZE, self.NUM_SENTENCE, self.SEQUENCE_LENGTH], name="input_words")
        self.target_words = tf.placeholder(tf.int32, [self.BATCH_SIZE], name="target_words")
        # One length per (sentence, document) pair, in the (sentence, batch)
        # layout that get_len() produces and train() indexes with [i, :].
        self.seq_len_pl = tf.placeholder(tf.int32, [self.NUM_SENTENCE, self.BATCH_SIZE], name="seq_len_pl")
        self.keep_prob_pl = tf.placeholder(tf.float32)
        # Populated by optimize().
        self.optimizer = None
        self.loss = None
        self.accuracy = None
        self.rnn_outputs = None

    def attention(self, inputs):
        """Pool a bidirectional RNN output over its step axis with attention.

        Args:
            inputs: the (forward, backward) output pair returned by the
                bidirectional RNN; the two are concatenated on axis 2.

        Returns:
            2-D tensor: for every batch entry, the attention-weighted sum of
            the concatenated outputs over the step axis.
        """
        # Variables live under "<current name scope>/Attention_layer1", which
        # keeps the word-level and sentence-level attention weights separate.
        name = tf.contrib.framework.get_name_scope()
        with tf.variable_scope("{}/Attention_layer1".format(name)):
            inputs = tf.concat(inputs, 2)
            # Static sizes as plain ints; both are needed for variable shapes
            # and reshapes below.
            _, steps, width = inputs.get_shape().as_list()
            # Attention mechanism
            W_omega = tf.get_variable(shape=[width, self.ATTENTION_SIZE], initializer=tf.random_normal_initializer(), name="w_omega")
            b_omega = tf.get_variable(shape=[self.ATTENTION_SIZE], initializer=tf.constant_initializer(0.0), name="b_omega")
            u_omega = tf.get_variable(shape=[self.ATTENTION_SIZE], initializer=tf.random_normal_initializer(), name="u_omega")
            # Project every step to ATTENTION_SIZE and score it against u_omega.
            u_i = tf.tanh(tf.matmul(tf.reshape(inputs, [-1, width]), W_omega) + b_omega)
            scores = tf.reshape(tf.matmul(u_i, tf.reshape(u_omega, [-1, 1])), [-1, steps])
            # Softmax over the step axis so the weights of each row sum to 1.
            # The previous exp / reduce_mean form made them sum to `steps` and
            # could overflow in exp.
            alphas = tf.nn.softmax(scores)
            si = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

        return si

    def word2sentence(self, center_words, sequence_length):
        """Encode one sentence position of the batch into sentence vectors.

        Args:
            center_words: int tensor of token ids for one sentence per
                document (a [:, i, :] slice of the input placeholder).
            sequence_length: per-document lengths passed to the RNN.

        Returns:
            The attention-pooled bidirectional GRU output for that sentence.
        """
        with tf.variable_scope("word2sentence"):
            embed_matrix = tf.get_variable(shape=[self.VOCABULARY_SIZE, self.EMBEDDING_DIM], initializer=tf.random_normal_initializer(),
                                       name="embed_matrix")
            embed = tf.nn.embedding_lookup(embed_matrix, center_words, name="embed")

            rnn_outputs, _ = bi_rnn(GRUCell(self.HIDDEN_SIZE), GRUCell(self.HIDDEN_SIZE), inputs=embed,
                                    sequence_length=sequence_length, dtype=tf.float32)
        # Called outside the inner scope on purpose: attention() derives its
        # variable scope from the surrounding name scope.
        attention_word_si = self.attention(rnn_outputs)
        return attention_word_si

    def sentence2doc(self, attention_word):
        """Encode the stacked sentence vectors into one document vector.

        Args:
            attention_word: float tensor of sentence vectors stacked on axis 1.

        Returns:
            The attention-pooled bidirectional GRU output over the sentences.
        """
        with tf.variable_scope("sentence2doc"):
            rnn_outputs, _ = bi_rnn(GRUCell(self.HIDDEN_SIZE), GRUCell(self.HIDDEN_SIZE), inputs=attention_word,
                                     dtype=tf.float32)
            attention_word_si = self.attention(rnn_outputs)
        return attention_word_si

    def optimize(self, attention_word_si):
        """Add dropout, the output layer, loss, optimizer and accuracy ops.

        Sets self.loss, self.optimizer and self.accuracy.

        Args:
            attention_word_si: 2-D float tensor of document vectors.
        """
        drop = tf.nn.dropout(attention_word_si, self.keep_prob_pl)
        with tf.variable_scope("fc"):
            W = tf.get_variable(shape=[drop.get_shape()[1].value, self.NUM_CLASSES], initializer=tf.random_normal_initializer(stddev=0.1), name="fc_weight")
            b = tf.get_variable(shape=[self.NUM_CLASSES], initializer=tf.constant_initializer(0.0), name="fc_bias")
            y_hat = tf.nn.xw_plus_b(drop, W, b)
            y_hat = tf.squeeze(y_hat)
        with tf.name_scope("loss"):
            # Labels are shifted by one before one-hot encoding; presumably
            # they arrive as 1..NUM_CLASSES -- confirm against the generator.
            target_words = tf.one_hot(self.target_words-1, depth=self.NUM_CLASSES)
            self.loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=target_words))
            self.optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(self.loss)
            # NOTE(review): this is element-wise agreement over the one-hot
            # matrix, not the fraction of correctly classified documents.
            self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), target_words), tf.float32))

    def train(self, data, num_epochs=1, num_batches=1):
        """Build the full graph and run the training loop.

        Prints "Start Training..." and, per epoch, the epoch index followed by
        the mean batch accuracy.

        Args:
            data: iterator yielding (x_batch, y_batch) pairs that fit the
                input_words / target_words placeholders.
            num_epochs: number of epochs to run (default 1, as before).
            num_batches: batches drawn from `data` per epoch (default 1, as
                before).

        Raises:
            ValueError: if num_batches is smaller than 1.
        """
        if num_batches < 1:
            raise ValueError("num_batches must be at least 1")

        sentence_vectors = []
        with tf.variable_scope("word2sentence") as scope:
            for i in range(self.NUM_SENTENCE):
                center_words = self.input_words[:, i, :]
                sequence_length = self.seq_len_pl[i, :]
                attention_word_si = self.word2sentence(center_words, sequence_length)
                sentence_vectors.append(tf.expand_dims(attention_word_si, axis=1))
                # Every sentence position after the first shares the same
                # embedding, GRU and attention variables.
                scope.reuse_variables()
        attention_word = tf.concat(sentence_vectors, axis=1)

        s = self.sentence2doc(attention_word)
        self.optimize(s)
        with tf.Session() as sess:
            saver = tf.train.Saver()
            # Initialise first and restore second: running the initializer
            # after restore would overwrite the checkpointed weights.
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

            print("Start Training...")
            for epoch in range(num_epochs):
                loss_train = 0
                accuracy_train = 0
                print("epoch: {}\t".format(epoch), end="")
                for _ in range(num_batches):
                    x_batch, y_batch = next(data)
                    seq_len = get_len(x_batch)
                    loss_tr, acc, _ = sess.run([self.loss, self.accuracy, self.optimizer],
                                               feed_dict={self.input_words: x_batch, self.target_words: y_batch,
                                                          self.seq_len_pl: seq_len, self.keep_prob_pl: self.KEEP_PROB})
                    accuracy_train += acc
                    # Exponentially smoothed loss; tracked but not reported.
                    loss_train = loss_tr * self.DELTA + loss_train * (1 - self.DELTA)
                accuracy_train /= num_batches
                print(accuracy_train)


In [3]:
# Build the model from the hyper-parameters above and train it on the batch
# iterator; train() prints the mean batch accuracy for each epoch.
model = AttentionModel(params)
model.train(data)

Start Training...
epoch: 0	0.663999974728
