In [1]:
import numpy as np
import os
import tensorflow as tf
import random
import re
from time import sleep
import math
import datetime
import time
import pickle
import layers
import functions
import decoder
import encoder
from dataset import PubMed_Dataset
import data_batcher
import models
import summarizer
import data_utils
from sys import stdout
from vocab import Vocab_Lookup

In [2]:
class Binary_Text_Classifier:
    """Sigmoid/cross-entropy binary classification head built on an existing network graph.

    The wrapped ``net`` must expose ``inputs``, ``input_lens``, ``dropout_keep_prob``,
    ``logits`` and ``max_len``. This class adds a float32 target placeholder of
    shape ``[None, 1]`` and, depending on ``mode``, the prediction, loss, accuracy
    and optimizer ops.

    Each ``_build_*`` method is idempotent, so every op is added to the graph
    exactly once no matter how many builders depend on it.
    """

    def __init__(self, net, thresh=0.5, lr=0.001, mode='train'):
        """Wire the classifier ops onto ``net``.

        Args:
            net: network object providing the tensors listed in the class docstring.
            thresh: sigmoid probability at or above which the prediction is 1.0.
            lr: learning rate handed to the Adam optimizer.
            mode: 'train' builds accuracy, loss and optimizer; 'eval' builds loss
                and accuracy; 'infer' builds predictions only.
        """
        assert mode in ['train', 'eval', 'infer'], "mode must be 'train', 'eval' or 'infer'"

        self.thresh = thresh
        self.lr = lr

        self._inputs = net.inputs
        self._input_lens = net.input_lens
        self._dropout_keep_prob = net.dropout_keep_prob
        self._logits = net.logits
        self._max_len = net.max_len

        self._targets = tf.placeholder(tf.float32, [None, 1])

        # Ops are created on demand by the builders below; None means "not built".
        self._sigmoid = None
        self._predictions = None
        self._loss = None
        self._accuracy = None
        self._gradient_norm = None
        self._optimizer = None

        if mode == 'train':
            self._build_eval_metrics()
            self._build_optimizer()
        elif mode == 'eval':
            self._build_loss()
            self._build_eval_metrics()
        elif mode == 'infer':
            self._build_predictions()

    def _build_predictions(self):
        """Create the sigmoid and thresholded 0.0/1.0 prediction ops (once)."""
        if self._predictions is not None:
            return
        self._sigmoid = tf.nn.sigmoid(self._logits)
        self._predictions = tf.to_float(self._sigmoid >= self.thresh)

    def _build_loss(self):
        """Create the mean sigmoid cross-entropy loss between logits and targets (once)."""
        if self._loss is not None:
            return
        self._build_predictions()
        self._loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=self._targets, logits=self._logits))

    def _build_eval_metrics(self):
        """Create the accuracy op: mean agreement of predictions with targets (once)."""
        if self._accuracy is not None:
            return
        self._build_predictions()
        self._accuracy = tf.reduce_mean(tf.to_float(tf.equal(self._predictions, self._targets)))

    def _build_optimizer(self):
        """Create the Adam update op over all trainable variables with global-norm clipping (once)."""
        if self._optimizer is not None:
            return
        self._build_loss()

        params = tf.trainable_variables()
        gradients, _ = tf.clip_by_global_norm(tf.gradients(self._loss, params), 1)
        # Norm of the *clipped* gradients, so the reported value never exceeds the clip threshold.
        self._gradient_norm = tf.global_norm(gradients)
        opt_func = tf.train.AdamOptimizer(learning_rate=self.lr)
        self._optimizer = opt_func.apply_gradients(zip(gradients, params))

    def train_step(self, sess, inputs, input_lens, targets, dropout_keep_prob=1.0):
        """Run one optimizer update on a batch.

        Returns:
            (loss, accuracy, grad_norm) as evaluated by ``sess.run`` for this batch.
        """
        feed_dict = {self._inputs : inputs,
                     self._input_lens : input_lens,
                     self._targets : targets,
                     self._dropout_keep_prob : dropout_keep_prob}
        run_vars = [self._loss, self._accuracy, self._gradient_norm, self._optimizer]

        loss, accuracy, grad_norm, _ = sess.run(run_vars, feed_dict=feed_dict)
        return loss, accuracy, grad_norm

    def val_step(self, sess, inputs, input_lens, targets):
        """Evaluate a batch without updating weights (dropout keep probability fed as 1.0).

        Returns:
            (loss, accuracy) for this batch.
        """
        feed_dict = {self._inputs : inputs,
                     self._input_lens : input_lens,
                     self._targets : targets,
                     self._dropout_keep_prob : 1.0}
        run_vars = [self._loss, self._accuracy]

        loss, accuracy = sess.run(run_vars, feed_dict=feed_dict)
        return loss, accuracy

    def deploy(self, sess, inputs, input_lens):
        """Return the thresholded predictions for a batch (dropout keep probability fed as 1.0)."""
        feed_dict = {self._inputs : inputs,
                     self._input_lens : input_lens,
                     self._dropout_keep_prob : 1.0}
        run_vars = self._predictions

        predictions = sess.run(run_vars, feed_dict=feed_dict)
        return predictions

In [3]:
# Every artefact of this experiment lives under ./PubMed, relative to the
# directory the notebook is launched from.
meta_dir = os.path.join(os.getcwd(), 'PubMed')
log_dir, weights_dir, params_dir, data_dir = (
    os.path.join(meta_dir, subdir)
    for subdir in ('logs', 'weights', 'params', 'data_cache')
)

In [4]:
#data = pickle.load(open("data.pickle", "rb"))
# Load the pickled vocabulary lookup. The context manager closes the file handle
# as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code — only load files produced by this project.
with open(os.path.join(meta_dir, "vocab_lookup_30000.pickle"), "rb") as vocab_file:
    vocab_lookup = pickle.load(vocab_file)

In [5]:
# Sort the cached shard files into train / val / test lists by a substring of
# their filename ('train' is checked first, then 'val', then 'test').
train_files = []
val_files = []
test_files = []
# os.listdir returns entries in arbitrary order; sorting makes the order in which
# shards are visited reproducible across runs and machines.
for filename in sorted(os.listdir(data_dir)):
    if 'train' in filename:
        train_files.append(os.path.join(data_dir, filename))
    elif 'val' in filename:
        val_files.append(os.path.join(data_dir, filename))
    elif 'test' in filename:
        test_files.append(os.path.join(data_dir, filename))

In [6]:
def data_partition_loader(partition_files):
    """Lazily yield the unpickled contents of each partition file, in order.

    Only one partition is loaded per ``next()`` call. Each file is closed as soon
    as it has been unpickled. Once every file has been yielded the generator
    finishes normally, so ``next()`` raises ``StopIteration`` and ``for`` loops
    terminate cleanly.

    Args:
        partition_files: iterable of paths to pickle files.

    Yields:
        The object stored in each pickle file.
    """
    # NOTE(review): pickle.load can execute arbitrary code — only feed trusted files.
    for partition_file in partition_files:
        with open(partition_file, 'rb') as handle:
            partition = pickle.load(handle)
        yield partition

In [7]:
# One lazy shard loader per split; the first shard of each split is pulled
# immediately so there is data available to build the batchers from.
train_partition_loader = data_partition_loader(train_files)
train_data = next(train_partition_loader)

val_partition_loader = data_partition_loader(val_files)
val_data = next(val_partition_loader)

test_partition_loader = data_partition_loader(test_files)
test_data = next(test_partition_loader)

In [8]:
batch_size = 128

# Batchers over the currently loaded shard of each split.
train_batcher, val_batcher, test_batcher = (
    data_batcher.Data_Batcher(split_data, batch_size)
    for split_data in (train_data, val_data, test_data)
)
# Batch size 1 over the validation shard: used to print single qualitative examples.
deploy_batcher = data_batcher.Data_Batcher(val_data, 1)

In [43]:
# --- Model ---
d_pad_len, s_pad_len = 150, 20   # s_pad_len is the length passed to the classifier network
embd_dim = 100
hidden_size = 256
n_layers = 4
vocab_size = vocab_lookup.num_words
dropout_keep_prob = 0.8          # fed to the graph during training steps
bidirectional = False

# --- Schedule (all counted in training iterations) ---
display_interval = 100
val_interval = deploy_interval = 1000
n_iters = 100000

# --- Optimisation / runtime ---
lr = 0.001
DEVICE = 1                       # device index used for CUDA_VISIBLE_DEVICES and the device string
USE_CUDA = True
DEBUG_MODE = False               # True disables log files, parameter dumps and checkpoints

# w2v = pickle.load(open("w2v_CNN-Dailymail_100.pickle", "rb"))

In [44]:
# No pretrained word vectors are supplied: None is passed to the network as
# `pretrained_embeddings` (how the network treats None is defined in `models` — TODO confirm).
pretrained_embeddings = None #functions.create_embeddings(vocab_lookup, w2v)

In [45]:
if USE_CUDA:
    # Expose only physical GPU number DEVICE (PCI bus order) to this process.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(DEVICE)
    # With a single GPU visible, that GPU is enumerated as index 0 inside the
    # process. '/gpu:{DEVICE}' would name a device that does not exist and would
    # only run thanks to soft placement.
    # NOTE(review): the mask only takes effect if no GPU has been initialised yet
    # in this kernel — run from a fresh kernel.
    device_name = '/gpu:0'
else:
    # The default CPU device is always index 0, regardless of DEVICE.
    device_name = '/cpu:0'

# Start from an empty graph so re-running this cell does not accumulate ops.
tf.reset_default_graph()
with tf.device(device_name):
    net = models.RNN_Classifier(1, vocab_size, s_pad_len, embedding_dim=embd_dim, hidden_size=hidden_size, n_layers=n_layers,
                                bidirectional=bidirectional, pretrained_embeddings=pretrained_embeddings, trainable_embeddings=True)
    model = Binary_Text_Classifier(net, thresh=0.5, lr=lr, mode='train')
    functions.count_params(tf.trainable_variables())
    for var in tf.trainable_variables():
        print(var)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


# Trainable Parameters: 4456385
<tf.Variable 'embeddings_layer/embeddings:0' shape=(30000, 100) dtype=float32_ref>
<tf.Variable 'encoder/multi_rnn_cell/cell_0/gru_cell/gates/kernel:0' shape=(356, 512) dtype=float32_ref>
<tf.Variable 'encoder/multi_rnn_cell/cell_0/gru_cell/gates/bias:0' shape=(512,) dtype=float32_ref>
<tf.Variable 'encoder/multi_rnn_cell/cell_0/gru_cell/candidate/kernel:0' shape=(356, 256) dtype=float32_ref>
<tf.Variable 'encoder/multi_rnn_cell/cell_0/gru_cell/candidate/bias:0' shape=(256,) dtype=float32_ref>
<tf.Variable 'encoder/multi_rnn_cell/cell_1/gru_cell/gates/kernel:0' shape=(512, 512) dtype=float32_ref>
<tf.Variable 'encoder/multi_rnn_cell/cell_1/gru_cell/gates/bias:0' shape=(512,) dtype=float32_ref>
<tf.Variable 'encoder/multi_rnn_cell/cell_1/gru_cell/candidate/kernel:0' shape=(512, 256) dtype=float32_ref>
<tf.Variable 'encoder/multi_rnn_cell/cell_1/gru_cell/candidate/bias:0' shape=(256,) dtype=float32_ref>
<tf.Variable 'encoder/multi_rnn_cell/cell_2/gru_cell/

In [None]:
def replace_words(ids, real_len, rate):
    """Corrupt a sequence by swapping random positions for random vocabulary ids.

    Each of the first ``real_len - 1`` positions is replaced independently with
    probability ``rate``; the last real position and everything after it (the
    padding) are left untouched.

    Replacement ids are drawn uniformly from ``[4, vocab_size - 1]``, where
    ``vocab_size`` is the module-level vocabulary size. ``random.randint`` is
    inclusive on both ends, so the upper bound must be ``vocab_size - 1`` to stay
    inside the vocabulary.

    Args:
        ids: sequence of token ids (padded).
        real_len: number of non-padding positions in ``ids``.
        rate: per-position replacement probability.

    Returns:
        numpy array with the same length as ``ids``.
    """
    n_editable = real_len - 1
    mask = [random.random() < rate for _ in range(n_editable)] + [False] * (len(ids) - n_editable)
    # presumably ids 0-3 are reserved special tokens, hence the lower bound of 4 — TODO confirm
    random_vals = [random.randint(4, vocab_size - 1) for _ in range(len(ids))]
    return np.where(np.array(mask, dtype=bool), random_vals, ids)

def repeat_words(ids, real_len, rate):
    """Corrupt a sequence by overwriting random positions with the token before them.

    Each of the first ``real_len - 1`` positions is selected independently with
    probability ``rate``; a selected position takes the value of its left
    neighbour in the original sequence (position 0 maps to itself). The last real
    position and the padding are never modified.

    Args:
        ids: list of token ids (padded).
        real_len: number of non-padding positions in ``ids``.
        rate: per-position selection probability.

    Returns:
        numpy array with the same length as ``ids``.
    """
    n_editable = real_len - 1
    flags = [random.random() < rate for _ in range(n_editable)]
    flags += [False] * (len(ids) - n_editable)
    # The sequence shifted right by one, with the first token duplicated.
    previous = [ids[0]] + ids[:-1]
    return np.where(np.array(flags, dtype=bool), previous, ids)

def single_repeated_word(ids, real_len, rate):
    """Corrupt a sequence by overwriting random positions with one token from the sequence.

    Each of the first ``real_len - 1`` positions is selected independently with
    probability ``rate``. A single token is then picked uniformly from the first
    ``real_len`` positions and written into every selected position. The last
    real position and the padding are never modified.

    Args:
        ids: sequence of token ids (padded).
        real_len: number of non-padding positions in ``ids``.
        rate: per-position selection probability.

    Returns:
        numpy array with the same length as ``ids``.
    """
    n_editable = real_len - 1
    flags = [random.random() < rate for _ in range(n_editable)]
    flags += [False] * (len(ids) - n_editable)
    pick = random.randint(0, n_editable)
    filler = [ids[pick] for _ in ids]
    return np.where(np.array(flags, dtype=bool), filler, ids)

def shuffle_sequence(ids, real_len):
    """Return ``ids`` with its first ``real_len - 1`` entries in random order.

    The last real position and the padding that follows keep their place.

    Args:
        ids: list of token ids (padded).
        real_len: number of non-padding positions in ``ids``.

    Returns:
        A new list of the same length as ``ids``.
    """
    split = real_len - 1
    head, tail = ids[:split], ids[split:]
    return random.sample(head, split) + tail

def get_labeled_examples(examples, rate=0.15):
    """Turn examples into (inputs, targets) pairs for the real-vs-corrupted task.

    For every example a label is drawn uniformly from {0, 1}. Label 1 keeps
    ``example.target_ids`` unchanged. Label 0 applies one of four corruptions,
    chosen uniformly: shuffle, random word replacement, neighbour repetition or
    single-word repetition.

    Args:
        examples: iterable of objects with ``target_ids`` and ``target_len``.
        rate: per-position corruption probability for the three rate-based corruptions.

    Returns:
        (inputs, targets): ``inputs`` is a list of id sequences and ``targets``
        a list of one-element lists ``[label]``.
    """
    inputs, targets = [], []
    for example in examples:
        label = random.randint(0, 1)
        if label == 1:
            input_ids = example.target_ids
        else:
            aug_type = random.randint(0, 3)
            if aug_type == 0:
                input_ids = shuffle_sequence(example.target_ids, example.target_len)
            else:
                # aug_type 1..3 selects one of the rate-based corruptions.
                corrupt = (replace_words, repeat_words, single_repeated_word)[aug_type - 1]
                input_ids = corrupt(example.target_ids, example.target_len, rate)
        inputs.append(input_ids)
        targets.append([label])
    return inputs, targets

In [None]:
# Snapshot the network's plain-Python attributes so the run can be reproduced:
# public (no leading underscore), non-callable, and not a TensorFlow object.
# The callable test is applied to the attribute VALUE; testing the key (always a
# str) would never filter anything.
params = {key: value for key, value in net.__dict__.items()
          if not key.startswith('_')
          and not callable(value)
          and 'tensorflow' not in str(type(value))}
model_name = net.__class__.__name__

if not DEBUG_MODE:
    # Make sure the output directories exist before anything is written to them.
    for out_dir in (log_dir, params_dir, weights_dir):
        os.makedirs(out_dir, exist_ok=True)

    timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d_%H%M%S')
    log_file = os.path.join(log_dir, '{}_train_log_{}.txt'.format(model_name, timestamp))
    log_description = '0.8 dropout, classifier on titles, params: {}\n'.format(params)
    # Create (or truncate) the log file before the first write.
    with open(log_file, 'w'):
        pass
    functions.write_to_log(log_description, log_file)

    params_filename = '{}_params_{}.pickle'.format(model_name, timestamp)
    with open(os.path.join(params_dir, params_filename), 'wb') as handle:
        pickle.dump(params, handle, protocol=pickle.HIGHEST_PROTOCOL)

    checkpoint_dir = os.path.join(weights_dir, '{}_checkpoints_{}'.format(model_name, timestamp))
    os.makedirs(checkpoint_dir, exist_ok=True)

# Main training loop: one optimizer step per iteration, with periodic console/file
# logging, validation (with best-loss checkpointing) and a qualitative sample.
epoch = 0
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())

    if not DEBUG_MODE:
        saver = tf.train.Saver(max_to_keep=100)

    best_val_loss = float('inf')
    for itr in range(1, n_iters+1):
        examples, ep = train_batcher.next_batch()
        # presumably ep == 1 signals that the batcher has gone through its whole
        # shard — confirm against Data_Batcher. The batch just drawn is discarded
        # and replaced by the first batch of the next shard.
        if ep == 1:
            try:
                train_data = next(train_partition_loader)
            except (StopIteration, IndexError):
                # All shards consumed: one epoch is done, restart from the first shard.
                # Only the loader's exhaustion signals are caught, so Ctrl-C and
                # genuine load errors still surface.
                epoch += 1
                train_partition_loader = data_partition_loader(train_files)
                train_data = next(train_partition_loader)
            train_batcher = data_batcher.Data_Batcher(train_data, batch_size)
            examples, ep = train_batcher.next_batch()

        inputs, targets = get_labeled_examples(examples)
        input_lens = [example.target_len for example in examples]

        train_loss, train_acc, grad_norm = model.train_step(sess, inputs, input_lens, targets,
                                                            dropout_keep_prob=dropout_keep_prob)

        if itr % display_interval == 0 or itr == 1:
            log_string = ('[%d, %5d] loss: %.3f, accuracy: %.3f, grad_norm: %.3f'
                          % (epoch, itr, train_loss, train_acc, grad_norm))

            if not DEBUG_MODE:
                functions.write_to_log(log_string, log_file)
            print(log_string)

        if itr % val_interval == 0:
            val_loss, val_acc = 0.0, 0.0
            for i in range(len(val_batcher.data) // val_batcher.batch_size):
                examples, ep = val_batcher.next_batch()
                if ep == 1:
                    try:
                        val_data = next(val_partition_loader)
                    except (StopIteration, IndexError):
                        val_partition_loader = data_partition_loader(val_files)
                        val_data = next(val_partition_loader)
                    val_batcher = data_batcher.Data_Batcher(val_data, batch_size)
                    examples, ep = val_batcher.next_batch()
                inputs, targets = get_labeled_examples(examples)
                input_lens = [example.target_len for example in examples]

                val_batch_loss, val_batch_acc = model.val_step(sess, inputs, input_lens, targets)
                # Incremental (running) mean over the batches seen so far.
                val_loss += ((val_batch_loss - val_loss)/(i+1))
                val_acc += ((val_batch_acc - val_acc)/(i+1))
                # Cap each validation pass at roughly 10000 examples.
                if (i+1)*val_batcher.batch_size >= 10000:
                    break
            log_string = ('Validation - loss: %.3f, accuracy: %.3f' % (val_loss, val_acc))

            if not DEBUG_MODE:
                functions.write_to_log(log_string, log_file)
            print(log_string)

            if not DEBUG_MODE:
                # Checkpoint only when the validation loss improves.
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    weights_prefix = '{}_weights_epoch_{}_itr_{}'.format(model_name, epoch, itr)
                    log_msg = "Weights saved in file: {}\n".format(os.path.join(checkpoint_dir, weights_prefix))
                    print(log_msg)
                    saver.save(sess, os.path.join(checkpoint_dir, weights_prefix))
                    functions.write_to_log(log_msg, log_file)

        if itr % deploy_interval == 0:
            # Qualitative check: score one validation example and each of its four corruptions.
            examples, _ = deploy_batcher.next_batch()
            example = examples[0]
            shuffled_input = shuffle_sequence(example.target_ids, example.target_len)
            replaced_input = replace_words(example.target_ids, example.target_len, 0.15)
            repeated_input = repeat_words(example.target_ids, example.target_len, 0.15)
            single_repeated_input = single_repeated_word(example.target_ids, example.target_len, 0.15)
            inputs = [example.target_ids, shuffled_input, replaced_input, repeated_input, single_repeated_input]
            input_lens = [example.target_len]*5

            replaced_sentence = ' '.join([vocab_lookup.convert_id2word(idx) for idx in replaced_input])
            shuffled_sentence = ' '.join([vocab_lookup.convert_id2word(idx) for idx in shuffled_input])
            repeated_sentence = ' '.join([vocab_lookup.convert_id2word(idx) for idx in repeated_input])
            single_repeated_sentence = ' '.join([vocab_lookup.convert_id2word(idx) for idx in single_repeated_input])
            predictions = model.deploy(sess, inputs, input_lens)

            log_string = ('INPUT:\n{}\nSHUFFLED:\n{}\nREPLACED:\n{}\nREPEATED:\n{}\nSINGLE REPEAT:\n{}\nMODEL: {}'
                          .format(example.target_text, shuffled_sentence, replaced_sentence, repeated_sentence,
                                  single_repeated_sentence, predictions))
            if not DEBUG_MODE:
                functions.write_to_log(log_string, log_file)
            print(log_string)

[0,     1] loss: 0.688, accuracy: 0.500, grad_norm: 0.696
[0,   100] loss: 0.695, accuracy: 0.547, grad_norm: 0.711
[0,   200] loss: 0.664, accuracy: 0.594, grad_norm: 0.863
[0,   300] loss: 0.674, accuracy: 0.570, grad_norm: 0.892
[0,   400] loss: 0.581, accuracy: 0.680, grad_norm: 0.737
[0,   500] loss: 0.652, accuracy: 0.609, grad_norm: 0.979
[0,   600] loss: 0.649, accuracy: 0.562, grad_norm: 1.000
[0,   700] loss: 0.627, accuracy: 0.625, grad_norm: 0.714
[0,   800] loss: 0.653, accuracy: 0.594, grad_norm: 0.939
[0,   900] loss: 0.641, accuracy: 0.641, grad_norm: 0.563
[0,  1000] loss: 0.663, accuracy: 0.656, grad_norm: 1.000
Validation - loss: 0.590, accuracy: 0.663
Weights saved in file: /home/ai2-leia/Documents/code/paul/deep_NLP/Summarization/PubMed/weights/RNN_Classifier_checkpoints_20171117_114517/RNN_Classifier_weights_epoch_0_itr_1000

INPUT:
Six-microns-under acts upstream of Draper in the glial phagocytosis of apoptotic neurons.
SHUFFLED:
microns UNK apoptotic neurons act