In [10]:
# Copyright 2018 @Jacob Su Wang. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import sys
sys.path.insert(0, "/work/04233/sw33286/AIDA-SCRIPTS")

import os
import time
import random
import shutil
import dill
import numpy as np

import tensorflow as tf

from helpers import Indexer, checkpoint_model
from itertools import chain, product
from collections import defaultdict, Counter


def causal_conv1d(inputs, 
                  filters,
                  kernel_size,
                  strides=1, 
                  dilation_rate=1, 
                  activation=None, 
                  use_bias=True, 
                  kernel_initializer=None, 
                  bias_initializer=tf.zeros_initializer(), 
                  kernel_regularizer=None, 
                  bias_regularizer=None, 
                  activity_regularizer=None, 
                  trainable=True, 
                  name=None,
                  reuse=None,
                  **kwargs):
    """1-D convolution that only looks at the current and earlier positions.

    The input is zero-padded on the left of axis 1 by
    (kernel_size - 1) * dilation_rate and then convolved with 'valid' padding
    in 'channels_last' layout, i.e. inputs are treated as
    <batch, time, channels>.

    All named arguments are forwarded unchanged to `tf.layers.conv1d`;
    `reuse` is an addition of this wrapper's author so that layers can be
    shared. Extra `**kwargs` are accepted but not used.

    Returns:
        The output tensor of `tf.layers.conv1d` applied to the padded input.
    """
    left_pad = dilation_rate * (kernel_size - 1)
    # A plain Python list (rather than a tf.constant) is used for the paddings
    # so that the padded tensor's shape stays fully specified.
    paddings = [[0, 0], [left_pad, 0], [0, 0]]
    padded_inputs = tf.pad(inputs, paddings)

    conv_options = dict(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding='valid',
        data_format='channels_last',
        dilation_rate=dilation_rate,
        activation=activation,
        use_bias=use_bias,
        kernel_initializer=kernel_initializer,
        bias_initializer=bias_initializer,
        kernel_regularizer=kernel_regularizer,
        bias_regularizer=bias_regularizer,
        activity_regularizer=activity_regularizer,
        trainable=trainable,
        name=name,
        reuse=reuse,
    )
    return tf.layers.conv1d(padded_inputs, **conv_options)

def temporal_block(inputs, 
                   filters, 
                   kernel_size,
                   strides,
                   dilation_rate,
                   conv_block_name,
                   rate,
                   training=True):
    """Residual block: two (causal conv -> batch norm -> dropout) stages plus a skip path.

    Args:
        inputs: input tensor; assumed <batch, time, channels> -- TODO confirm
            against callers.
        filters: number of output channels of every convolution in the block.
        kernel_size: kernel width of the two causal convolutions.
        strides: stride of the two causal convolutions. NOTE(review): the skip
            path always uses the default stride, so values other than 1
            presumably break the residual addition -- confirm before use.
        dilation_rate: dilation of the two causal convolutions.
        conv_block_name: prefix used to name every layer created here
            ('-conv1', '-bn1', '-conv2', '-bn2', '-id').
        rate: dropout rate.
        training: whether dropout and batch norm run in training mode.

    Returns:
        relu(transformed + identity_conv(inputs)), where identity_conv is a
        kernel-size-1 convolution that maps `inputs` to `filters` channels.
    """
    hidden = inputs
    for stage in ('1', '2'):
        hidden = causal_conv1d(hidden,
                               filters,
                               kernel_size,
                               strides,
                               dilation_rate,
                               activation=tf.nn.relu,
                               name=conv_block_name + '-conv' + stage)
        # `training` is forwarded so batch norm follows the same mode as
        # dropout. The explicit scope gives the batch-norm variables a stable
        # name, like the conv layers, so they can be found again when the
        # block is rebuilt under a reusing variable scope.
        hidden = tf.contrib.layers.batch_norm(hidden,
                                              is_training=training,
                                              scope=conv_block_name + '-bn' + stage)
        hidden = tf.layers.dropout(hidden,
                                   rate=rate,
                                   training=training)

    # Skip path: 1x1 convolution so the channel count matches `filters`.
    shortcut = tf.layers.conv1d(inputs,
                                filters=filters,
                                kernel_size=1,
                                name=conv_block_name + '-id')

    # residual link: relu(transformed + identity_conv(original))
    return tf.nn.relu(hidden + shortcut)

def fully_connected(inputs, 
                    out_dim,
                    activation=None,
                    initializer=None,
                    name=None):
    """Apply a dense layer with `out_dim` units via `tf.layers.dense`.

    Args:
        inputs: tensor handed to the dense layer.
        out_dim: number of output units.
        activation: optional activation function.
        initializer: passed on as the layer's `kernel_initializer`.
        name: layer name.

    Returns:
        Whatever `tf.layers.dense` returns for these arguments.
    """
    dense_options = {
        'activation': activation,
        'kernel_initializer': initializer,
        'name': name,
    }
    return tf.layers.dense(inputs, out_dim, **dense_options)

def vdcnn(inputs,
          blocks_list,
          kernel_size,
          rate,
          block_names_list,
          training=True):
    """Encode a sequence with stacked temporal blocks, pooling and two dense layers.

    Args:
        inputs: input tensor; assumed <batch, time, channels> -- TODO confirm
            against callers.
        blocks_list: list of groups; each group is a list of filter counts,
            one temporal block per entry. The i-th block of a group uses
            dilation rate 2**i.
        kernel_size: kernel width of every temporal block.
        rate: dropout rate of every temporal block.
        block_names_list: names parallel to `blocks_list`, one per block.
        training: forwarded to every temporal block; defaults to True, which
            is the mode the blocks were always built in before this parameter
            existed.

    Returns:
        The last position along axis 1 of the final dense layer's output,
        i.e. a <batch, fc_dim> tensor where fc_dim is half the last block's
        filter count.
    """
    outputs = inputs
    for blocks, block_names in zip(blocks_list, block_names_list):
        for i, filters in enumerate(blocks):
            outputs = temporal_block(outputs,
                                     filters,
                                     kernel_size,
                                     strides=1,
                                     dilation_rate=2 ** i,
                                     conv_block_name=block_names[i],
                                     rate=rate,
                                     training=training)
        # Halve the time axis after every group of blocks.
        outputs = tf.layers.max_pooling1d(outputs, pool_size=2, strides=2)

    fc_dim = blocks_list[-1][-1] // 2
    for i in range(2):
        outputs = fully_connected(outputs,
                                  out_dim=fc_dim,
                                  activation=tf.nn.relu,
                                  initializer=tf.contrib.layers.xavier_initializer(),
                                  name=str(i)) # NB: weight layers must be named to be reused!!

    return outputs[:, -1, :]

class PairwiseVDCNN:
    """Siamese classifier that scores a pair of token-id sequences.

    Both inputs are embedded with one shared embedding matrix, encoded by the
    same `vdcnn` tower (weights shared through variable-scope reuse) and scored
    with a bilinear form: score = sigmoid(x1 . W_bi . x2).

    Config keys read: data_dir, info_path, model_dir, model_name,
    init_with_glove, load_from_saved, vocab_size, emb_size, batch_size,
    max_len, kernel_size, dropout_rate, learning_rate.

    Depending on `load_from_saved` the constructor either restores a saved
    graph from `model_dir` or builds and initializes a new one.
    """
    
    def __init__(self, config):
        
        self.config = config
        self.data_dir = self.config['data_dir']
        self.info_path = self.config['info_path']
        self.FILENAMES = os.listdir(self.data_dir)
            # each file is a '.p' filename.
        # NOTE(review): dill, like pickle, can execute arbitrary code while
        # loading; only point info_path at trusted files.
        with open(self.info_path, 'rb') as info_file:
            self.indexer, self.word2emb = dill.load(info_file)
            # indexer: Indexer object, word<->index mapping.
            # word2emb: word->glove-embedding mapping (300D). 
        self.model_dir = self.config['model_dir']
        self.model_name = self.config['model_name']
        self.init_with_glove = self.config['init_with_glove']
        
        self.load_from_saved = self.config['load_from_saved']
        
        self.vocab_size = self.config['vocab_size']
        self.emb_size = self.config['emb_size']
        self.batch_size = self.config['batch_size']
        self.max_len = self.config['max_len']
        self.kernel_size = self.config['kernel_size']
        self.dropout_rate = self.config['dropout_rate']
        self.learning_rate = self.config['learning_rate']

        if self.init_with_glove:
            # Row i holds the embedding of the word the indexer maps to i.
            # float32 matches the dtype of the embedding variable.
            self.glove_embs = np.array(
                [self.word2emb[self.indexer.get_object(i)]
                 for i in range(len(self.indexer))],
                dtype=np.float32)
        else:
            del self.word2emb

        if self.load_from_saved:
            self.__load_saved_graph()
            print('Model loaded for continued training!')
        else:
            self.__build_new_graph()  
            print('New model built for training!')
            
    def __build_new_graph(self):
        """Create a fresh graph and session, build every sub-graph and initialize variables."""

        tf.reset_default_graph()
        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=sess_config)
        
        self.__init_place_holders()
        self.__init_embeddings()
        self.__run_vdcnn()
        self.__run_score_and_predictions()
        self.__run_accuracy()
        self.__run_optimization()
        
        self.sess.run(tf.global_variables_initializer())
        if self.init_with_glove:
            # The assign op must be run explicitly, and after the global
            # initializer, otherwise the embeddings keep their random init.
            self.sess.run(self.glove_init,
                          feed_dict={self.glove_placeholder: self.glove_embs})
        self.saver = tf.train.Saver()        
        
    def __init_place_holders(self):    
        """Declare the two sequence inputs and the label input."""
        
        self.input_x1 = tf.placeholder(tf.int32, [None, self.max_len], name='input_x1') # <bc,mt>
        self.input_x2 = tf.placeholder(tf.int32, [None, self.max_len], name='input_x2')
        self.input_y = tf.placeholder(tf.int32, [None], name='input_y')
    
    def __init_embeddings(self):
        """Create the embedding matrix and look up both inputs in it."""
        
        with tf.variable_scope('Embeddings'):
            self.embeddings = tf.get_variable('embeddings', [self.vocab_size, self.emb_size], 
                                         initializer=tf.contrib.layers.xavier_initializer())
            if self.init_with_glove:
                # The GloVe matrix is fed through a placeholder so the array
                # is not stored inside the graph definition as a constant.
                self.glove_placeholder = tf.placeholder(
                    tf.float32, [self.vocab_size, self.emb_size], name='glove_embs')
                self.glove_init = self.embeddings.assign(self.glove_placeholder)
            self.input_x1_embedded = tf.nn.embedding_lookup(self.embeddings, self.input_x1) 
                # <bc,mt,emb>
            self.input_x2_embedded = tf.nn.embedding_lookup(self.embeddings, self.input_x2)
            
    def __run_vdcnn(self):
        """Encode both embedded inputs with one VDCNN tower whose variables are shared."""
        
        tower_options = dict(blocks_list=[[256,256],[512,512]],
                             kernel_size=self.kernel_size,
                             rate=self.dropout_rate,
                             block_names_list=[['l-256-1','l-256-2'],
                                               ['l-512-1','l-512-2']])
        with tf.variable_scope('VDCNN') as scope:
            self.input_x1_vdcnn = vdcnn(self.input_x1_embedded, **tower_options)
                # <bc,out-dim>
            # Second tower reuses the variables created by the first one.
            scope.reuse_variables()  
            self.input_x2_vdcnn = vdcnn(self.input_x2_embedded, **tower_options)

    def __run_score_and_predictions(self):
        """Bilinear pair score: logits, their sigmoid ('scores') and rounded predictions."""
        
        # 256 = output width of the towers (half of the last block's 512 filters).
        W_bi = tf.get_variable('W_bi', [256,256],
                               initializer=tf.contrib.layers.xavier_initializer())
        # Row-wise x1 . W_bi . x2; this equals the diagonal of
        # (x1 W_bi) x2^T without building the full batch-by-batch matrix.
        projected_x1 = tf.matmul(self.input_x1_vdcnn, W_bi)
        self.logits = tf.reduce_sum(projected_x1 * self.input_x2_vdcnn, axis=1, name='logits')
        self.scores = tf.nn.sigmoid(self.logits, name='scores')
        self.predictions = tf.cast(tf.round(self.scores), tf.int32, name='predictions')  
        
    
    def __run_accuracy(self):
        """Fraction of predictions that equal the labels."""

        with tf.name_scope('Accuracy'):
            correct_predictions = tf.equal(self.predictions, self.input_y)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')
    
    def __run_optimization(self):
        """Mean sigmoid cross-entropy loss and an Adam training op."""

        with tf.name_scope('Loss'):
            # The loss applies the sigmoid itself, so it must receive the raw
            # logits and not the already-squashed scores.
            losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(self.input_y, tf.float32), 
                                                             logits=self.logits)
            self.loss = tf.reduce_mean(losses, name='loss')  

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        grads_and_vars = optimizer.compute_gradients(self.loss)
        self.train_op = optimizer.apply_gradients(grads_and_vars, global_step=self.global_step, name='train_op')    
    
    def __load_saved_graph(self):
        """Restore graph and weights from `model_dir` and re-bind the tensors used for training."""
        
        tf.reset_default_graph()
        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=sess_config)
        self.saver = tf.train.import_meta_graph(self.model_dir + self.model_name + '.meta')
        self.saver.restore(self.sess, tf.train.latest_checkpoint(self.model_dir))
        self.graph = tf.get_default_graph() 
        
        self.input_x1 = self.graph.get_tensor_by_name('input_x1:0')
        self.input_x2 = self.graph.get_tensor_by_name('input_x2:0')
        self.input_y = self.graph.get_tensor_by_name('input_y:0')
        
        self.scores = self.graph.get_tensor_by_name('scores:0')
        self.predictions = self.graph.get_tensor_by_name('predictions:0')
        self.loss = self.graph.get_tensor_by_name('Loss/loss:0')
        self.accuracy = self.graph.get_tensor_by_name('Accuracy/accuracy:0')
        self.global_step = self.graph.get_tensor_by_name('global_step:0')
        self.train_op = self.graph.get_tensor_by_name('train_op:0')
        
class DataBatcher:
    """Builds labelled sentence-pair batches from one two-document file.

    Each data file is expected to unpack into (doc_a, doc_b, <ignored>), where
    a document is a sequence of sentences and a sentence is a list of token
    ids -- presumably produced by the project's preprocessing; verify there.
    """
    
    def __init__(self, data_dir, batch_size, max_len):
        """
        Args:
            data_dir: directory that holds the data files.
            batch_size: target number of pairs per batch; batches are built in
                groups of four, so the actual size is 4 * (batch_size // 4).
            max_len: every sentence is cut or zero-padded to this length.
        """
        
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.max_len = max_len
        
    def __procrustize(self, sent):
        """Return `sent` truncated or right-padded with zeros to exactly `max_len` items."""
        
        s_len = len(sent)
        if s_len >= self.max_len:
            return sent[:self.max_len]
        return sent + [0] * (self.max_len - s_len)
    
    def get_batch(self, filename):
        """Sample a batch of sentence pairs from the file `filename` in `data_dir`.

        Every group of four pairs is drawn from the document combinations
        (a,a), (a,b), (b,a), (b,b) and labelled 1, 0, 0, 1: a pair is positive
        when both sentences come from the same document.

        Returns:
            (batch_x1, batch_x2, batch_y) as numpy arrays.

        Raises:
            Whatever opening/unpickling the file raises, and IndexError from
            `random.choice` when a document is empty.
        """
        
        # os.path.join works with or without a trailing separator on data_dir;
        # the `with` block closes the file even if unpickling fails.
        # NOTE(review): dill, like pickle, can execute arbitrary code while
        # loading; only read trusted files.
        with open(os.path.join(self.data_dir, filename), 'rb') as doc_file:
            doc_a, doc_b, _ = dill.load(doc_file)
        
        batch_x1, batch_x2, batch_y = [], [], []
        labels = [1,0,0,1]
        for _ in range(self.batch_size//4): # 4 entries added per iteration.
            doc_pairs = product([doc_a, doc_b], [doc_a, doc_b])
            for label, (da, db) in zip(labels, doc_pairs):
                batch_x1.append(self.__procrustize(random.choice(da)))
                batch_x2.append(self.__procrustize(random.choice(db)))
                batch_y.append(label)
                
        return np.array(batch_x1), np.array(batch_x2), np.array(batch_y)  
    

def train_pairwise_vdcnn(model_config,
                         track_dir,
                         session_id,
                         new_track,
                         n_epoch,
                         train_size,
                         verbose,
                         save_freq):
    """Train a PairwiseVDCNN and log running loss/accuracy to text files.

    Args:
        model_config: config dict handed to `PairwiseVDCNN`.
        track_dir: directory prefix for the log files (concatenated with
            `session_id`, so it should end with a path separator).
        session_id: base name of the log files.
        new_track: truthy -> overwrite the log files, falsy -> append.
        n_epoch: number of epochs.
        train_size: number of data files sampled (without replacement) per
            epoch; one file yields one training step.
        verbose: write a log line every `verbose` steps.
        save_freq: checkpoint the model every `save_freq` steps.

    Writes `<track_dir><session_id>.txt` while training and
    `<track_dir><session_id>-final.txt` once all epochs finish. A
    KeyboardInterrupt stops training, prints 'Stopped!' and skips the final
    summary.
    """
    
    clf = PairwiseVDCNN(model_config)
 
    dat = DataBatcher(clf.data_dir, clf.batch_size, clf.max_len)
    
    log_path = track_dir + session_id + '.txt'
    log_mode = 'w' if new_track else 'a'
    with open(log_path, log_mode) as f:
        f.write('\n\n== NEW SESSION ==\n\n')
    loss_track, accuracy_track = [], []
    start = time.time()    
    try:
        for e in range(n_epoch):
            with open(log_path, 'a') as f:
                f.write('Epoch '+str(e+1)+'\n')
            file_indices = np.random.choice(len(clf.FILENAMES),
                                            size=train_size, replace=False)
            random.shuffle(file_indices)
            curr_loss_track, curr_accuracy_track = [], []
            for file_idx in file_indices:
                try: # handle bad files should there be any.
                    batch_x1,batch_x2,batch_y = dat.get_batch(clf.FILENAMES[file_idx])
                except Exception:
                    # Skip unreadable/malformed files. `Exception` (not a bare
                    # except) lets KeyboardInterrupt reach the handler below.
                    continue
                fd = {clf.input_x1:batch_x1, clf.input_x2:batch_x2, clf.input_y:batch_y}
                _,step,loss_,accuracy_ = clf.sess.run([clf.train_op,clf.global_step,
                                                       clf.loss,clf.accuracy], feed_dict=fd)
                curr_loss_track.append(loss_)
                curr_accuracy_track.append(accuracy_)
                if step % save_freq == 0:
                    checkpoint_model(clf.model_dir, clf.model_dir+clf.model_name,
                                     clf.saver, clf.sess)
                if step % verbose == 0:
                    # Averages cover the steps since the previous log line.
                    avg_loss = np.mean(curr_loss_track)
                    avg_acc = np.mean(curr_accuracy_track)
                    loss_track.append(avg_loss)
                    accuracy_track.append(avg_acc)
                    with open(log_path, 'a') as f:
                        f.write('loss & accuracy at step {}: <{:.5f}, {:.2f}> (time elapsed = {:.2f} secs)\n'.format(
                            step, avg_loss, avg_acc, time.time()-start))
                    start = time.time()
                    curr_loss_track, curr_accuracy_track = [], []
        with open(track_dir+session_id+'-final.txt', log_mode) as f:
            f.write('final avg loss & accuracy: <{:.5f}, {:.5f}>'.format(np.mean(loss_track),
                                                                         np.mean(accuracy_track)))
    except KeyboardInterrupt:
        print('Stopped!')   
        

def str2bool(value):
    """Parse a command-line boolean.

    argparse's `type=bool` calls bool() on the raw string, so every non-empty
    value -- including '0' and 'False' -- becomes True. This converter maps the
    usual spellings explicitly and rejects anything else; argparse reports the
    ValueError as an invalid argument value.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if text in ('0', 'false', 'f', 'no', 'n', 'off', ''):
        return False
    raise ValueError('expected a boolean value, got {!r}'.format(value))


if __name__ == "__main__":
    
    import argparse
    parser = argparse.ArgumentParser()
    # model config
    parser.add_argument('--data_dir', type=str, default="/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_code/")
    parser.add_argument('--info_path', type=str, default="/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_info/indexer_word2emb_100k.p")
    parser.add_argument('--model_dir', type=str, default="/work/04233/sw33286/AIDA-CNN-MODEL-SAVE/our-model-no-context/")
    parser.add_argument('--model_name', type=str, default="vdcnn-no-context")
    # Boolean flags take an explicit value, e.g. `--load_from_saved 1` or
    # `--init_with_glove false`.
    parser.add_argument('--init_with_glove', type=str2bool, default=True) 
    parser.add_argument('--load_from_saved', type=str2bool, default=False)
    parser.add_argument('--vocab_size', type=int, default=100001)
    parser.add_argument('--emb_size', type=int, default=300)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--max_len', type=int, default=32)
    parser.add_argument('--kernel_size', type=int, default=3)
    parser.add_argument('--dropout_rate', type=float, default=0.2)
    parser.add_argument('--learning_rate', type=float, default=1e-5)
    # train config
    parser.add_argument('--track_dir', type=str, default="/work/04233/sw33286/AIDA-TRACKS/sentence-tracks-cnn/no-context/")
    parser.add_argument('--session_id', type=str, default='0000')
    parser.add_argument('--new_track', type=str2bool, default=True)
    parser.add_argument('--n_epoch', type=int, default=1)
    parser.add_argument('--train_size', type=int, default=10)
    parser.add_argument('--verbose', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=5)
    args = parser.parse_args()
    
    # Model settings are passed to the classifier as one dict; the remaining
    # arguments configure the training loop.
    config = {'data_dir':args.data_dir, 'info_path':args.info_path, 
              'model_dir':args.model_dir, 'model_name':args.model_name,
              'init_with_glove':args.init_with_glove, 'load_from_saved':args.load_from_saved,
              'vocab_size':args.vocab_size, 'emb_size':args.emb_size,
              'batch_size':args.batch_size, 'max_len':args.max_len,
              'kernel_size':args.kernel_size, 'dropout_rate':args.dropout_rate,
              'learning_rate':args.learning_rate}
    
    train_pairwise_vdcnn(model_config=config,
                         track_dir=args.track_dir,
                         session_id=args.session_id,
                         new_track=args.new_track,
                         n_epoch=args.n_epoch,
                         train_size=args.train_size,
                         verbose=args.verbose,
                         save_freq=args.save_freq)
        

In [3]:
!pwd
!ls

/home/04233/sw33286/DEV-CNN
[DEMO] TCN (MNIST).ipynb	 FIGS
[DEMO] TCN (nyt).ipynb		 pairwise_vdcnn.py
[DEVELOPMENT] TCN (MNIST).ipynb  tcn_mnist_demo.py
[DEVELOPMENT] TCN (nyt).ipynb


### Tests

In [26]:
# train new model

!python3 pairwise_vdcnn.py --train_size=10 --verbose 1 --save_freq 5

I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcurand.so.8.0 locally
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties: 
name: Tesla K40m
major: 3 minor: 5 memoryClockRate (GHz) 0.745
pciBusID 0000:08:00.0
Total memory: 11.17GiB
Free memory: 6.96GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0:   Y 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K40m, pci bus

In [27]:
# load from saved model

!python3 pairwise_vdcnn.py --train_size=10 --verbose 1 --save_freq 5 --load_from_saved 1

I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcurand.so.8.0 locally
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties: 
name: Tesla K40m
major: 3 minor: 5 memoryClockRate (GHz) 0.745
pciBusID 0000:08:00.0
Total memory: 11.17GiB
Free memory: 6.96GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0:   Y 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K40m, pci bus