In [0]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
the_path = '/content/gdrive/My Drive/master_thesis/'
! ls gdrive/My\ Drive/master_thesis

aisin_gioro_models		     glove.840B.300d.txt
codes				     model
data				     papers
extended_relation_descriptions.json  relation_descriptions.json


In [0]:
# download multiNLI data
# ! wget "https://www.nyu.edu/projects/bowman/multinli/multinli_0.9.zip"
# ! unzip multinli_0.9.zip
# ! cp -r multinli_0.9 gdrive/My\ Drive/master_thesis/data/

In [0]:
# logger
import datetime
import sys
import json

class Logger(object):
    """
    A logging that doesn't leave logs open between writes, so as to allow AFS synchronization.
    """

    # Level constants
    DEBUG = 0
    INFO = 1
    WARNING = 2
    ERROR = 3

    def __init__(self, log_path=None, json_log_path=None, min_print_level=0, min_file_level=0):
        """
        log_path: The full path for the log file to write. The file will be appended to if it exists.
        min_print_level: Only messages with level above this level will be printed to stderr.
        min_file_level: Only messages with level above this level will be written to disk.
        """
        self.log_path = log_path
        self.json_log_path = json_log_path
        self.min_print_level = min_print_level
        self.min_file_level = min_file_level

    def Log(self, message, level=INFO):
        if level >= self.min_print_level:
            # Write to STDERR
            sys.stderr.write("[%i] %s\n" % (level, message))
        if self.log_path and level >= self.min_file_level:
            # Write to the log file then close it
            with open(self.log_path, 'a') as f:
                datetime_string = datetime.datetime.now().strftime(
                    "%y-%m-%d %H:%M:%S")
                f.write("%s [%i] %s\n" % (datetime_string, level, message))

    def LogJSON(self, message_obj, level=INFO):
        if self.json_log_path and level >= self.min_file_level:
            with open(self.json_log_path, 'w') as f:
                print >>f, json.dumps(message_obj)
        else:
            sys.stderr.write('WARNING: No JSON log filename.')


In [0]:
import numpy as np
import re
import random
import json
import collections
import pickle
import os
import importlib

NLI_LABEL_MAP = {
    "entailment": 0,
    "contradiction": 1,
    "hidden": 0
}

LABEL_MAP = {
    "entailment": 0,
    "neutral": 1,
    "contradiction": 1,
    "hidden": 0
}

# parameter

FIXED_PARAMETERS = {}

FIXED_PARAMETERS["seq_length"] = 50
FIXED_PARAMETERS["batch_size"] = 16
FIXED_PARAMETERS["keep_rate"] = 0.9
FIXED_PARAMETERS["learning_rate"] = 0.0004
FIXED_PARAMETERS["word_embedding_dim"] = 300
FIXED_PARAMETERS["hidden_embedding_dim"] = 300
FIXED_PARAMETERS["emb_train"] = True

FIXED_PARAMETERS["embedding_data_path"] = the_path + "glove.840B.300d.txt"
FIXED_PARAMETERS["log_path"] = the_path + "model/"
FIXED_PARAMETERS["ckpt_path"] = the_path + "model/"

FIXED_PARAMETERS["training_mnli"] = the_path + "data/multinli_0.9/multinli_0.9_train.jsonl"
FIXED_PARAMETERS["dev_matched"] = the_path + "data/multinli_0.9/multinli_0.9_dev_matched.jsonl"
FIXED_PARAMETERS["dev_mismatched"] = the_path + "data/multinli_0.9/multinli_0.9_dev_mismatched.jsonl"
FIXED_PARAMETERS["training_uwre"] = the_path + "data/uwre/train.0"
FIXED_PARAMETERS["dev_uwre"] = the_path + "data/uwre/dev.0"
FIXED_PARAMETERS["test_uwre"] = the_path + "data/uwre/test.0"


FIXED_PARAMETERS["description_num"] = 15
FIXED_PARAMETERS["model_type"] = 'mcim'
FIXED_PARAMETERS["model_name"] = 'pretrain_mlp_15'

modname = FIXED_PARAMETERS["model_name"] #'pre_train_mlp'
logpath = os.path.join(FIXED_PARAMETERS["log_path"], modname) + ".log"
logger = Logger(logpath)


PADDING = "<PAD>"
UNKNOWN = "<UNK>"


In [0]:
# blocks
"""

Functions and components that can be slotted into tensorflow models.

TODO: Write functions for various types of attention.

"""

import tensorflow as tf


def length(sequence):
    """
    Get true length of sequences (without padding), and mask for true-length in max-length.

    Input of shape: (batch_size, max_seq_length, hidden_dim)
    Output shapes, 
    length: (batch_size)
    mask: (batch_size, max_seq_length, 1)
    """
    populated = tf.sign(tf.abs(sequence))
    length = tf.cast(tf.reduce_sum(populated, axis=1), tf.int32)
    mask = tf.cast(tf.expand_dims(populated, -1), tf.float32)
    return length, mask


def reader(inputs, lengths, output_size, contexts=(None, None), scope=None):
    """Dynamic bi-LSTM reader; can be conditioned with initial state of other rnn.
    Args:
        inputs (tensor): The inputs into the bi-LSTM
        lengths (tensor): The lengths of the sequences
        output_size (int): Size of the LSTM state of the reader.
        context (tensor=None, tensor=None): Tuple of initial (forward, backward) states
                                  for the LSTM
        scope (string): The TensorFlow scope for the reader.
        drop_keep_drop (float=1.0): The keep probability for dropout.
    Returns:
        Outputs (tensor): The outputs from the bi-LSTM.
        States (tensor): The cell states from the bi-LSTM.
    """
    with tf.variable_scope(scope or "reader") as varscope:
        cell_fw = tf.contrib.rnn.LSTMCell(output_size, initializer=tf.contrib.layers.xavier_initializer())
        cell_bw = tf.contrib.rnn.LSTMCell(output_size, initializer=tf.contrib.layers.xavier_initializer())
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw,
            cell_bw,
            inputs,
            sequence_length=lengths,
            initial_state_fw=contexts[0],
            initial_state_bw=contexts[1],
            dtype=tf.float32
        )

        return outputs, states


def biLSTM(inputs, dim, seq_len, name):
    """
    A Bi-Directional LSTM layer. Returns forward and backward hidden states as a tuple, and cell states as a tuple.

    Ouput of hidden states: [(batch_size, max_seq_length, hidden_dim), (batch_size, max_seq_length, hidden_dim)]
    Same shape for cell states.
    """
    with tf.name_scope(name):
        with tf.variable_scope('forward' + name):
            lstm_fwd = tf.contrib.rnn.LSTMCell(num_units=dim)
        with tf.variable_scope('backward' + name):
            lstm_bwd = tf.contrib.rnn.LSTMCell(num_units=dim)

        hidden_states, cell_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fwd, cell_bw=lstm_bwd, inputs=inputs, sequence_length=seq_len, dtype=tf.float32, scope=name)

    return hidden_states, cell_states


def LSTM(inputs, dim, seq_len, name):
    """
    An LSTM layer. Returns hidden states and cell states as a tuple.

    Ouput shape of hidden states: (batch_size, max_seq_length, hidden_dim)
    Same shape for cell states.
    """
    with tf.name_scope(name):
        cell = tf.contrib.rnn.LSTMCell(num_units=dim)
        hidden_states, cell_states = tf.nn.dynamic_rnn(cell, inputs=inputs, sequence_length=seq_len, dtype=tf.float32, scope=name)

    return hidden_states, cell_states


def last_output(output, true_length):
    """
    To get the last hidden layer form a dynamically unrolled RNN.
    Input of shape (batch_size, max_seq_length, hidden_dim).

    true_length: Tensor of shape (batch_size). Such a tensor is given by the length() function.
    Output of shape (batch_size, hidden_dim).
    """
    max_length = int(output.get_shape()[1])
    length_mask = tf.expand_dims(tf.one_hot(true_length-1, max_length, on_value=1., off_value=0.), -1)
    last_output = tf.reduce_sum(tf.multiply(output, length_mask), 1)
    return last_output


def masked_softmax(scores, mask):
    """
    Used to calculcate a softmax score with true sequence length (without padding), rather than max-sequence length.

    Input shape: (batch_size, max_seq_length, hidden_dim). 
    mask parameter: Tensor of shape (batch_size, max_seq_length). Such a mask is given by the length() function.
    """
    numerator = tf.exp(tf.subtract(scores, tf.reduce_max(scores, 1, keepdims=True))) * mask
    denominator = tf.reduce_sum(numerator, 1, keep_dims=True)
    weights = tf.div(numerator, denominator)
    return weights


In [0]:
# models/msim
class msim(object):
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train, description_num):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length
        self.description_num = description_num

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 2], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([2], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, mask_prem = length(self.premise_x)
        hyp_seq_lengths, mask_hyp = length(self.hypothesis_x)


        ### First cbiLSTM layer ###
        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        hypothesis_outs, c2 = biLSTM(hypothesis_in, dim=self.dim, seq_len=hyp_seq_lengths, name='hypothesis')
        # calculate premise based on the condition of hypothesis
        with tf.variable_scope("conditional_first_premise_layer") as fstPremise_scope:
            premise_outs, c1 = reader(premise_in, prem_seq_lengths, self.dim, c2, scope=fstPremise_scope)

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        premise_list = tf.unstack(premise_bi, axis=1)
        hypothesis_list = tf.unstack(hypothesis_bi, axis=1)


        ### Attention ###

        scores_all = []
        premise_attn = []
        alphas = []

        for i in range(self.sequence_length):

            scores_i_list = []
            for j in range(self.sequence_length):
                score_ij = tf.reduce_sum(tf.multiply(premise_list[i], hypothesis_list[j]), 1, keep_dims=True)
                scores_i_list.append(score_ij)

            scores_i = tf.stack(scores_i_list, axis=1)
            alpha_i = masked_softmax(scores_i, mask_hyp)
            a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
            premise_attn.append(a_tilde_i)

            scores_all.append(scores_i)
            alphas.append(alpha_i)

        scores_stack = tf.stack(scores_all, axis=2)
        scores_list = tf.unstack(scores_stack, axis=1)

        hypothesis_attn = []
        betas = []
        for j in range(self.sequence_length):
            scores_j = scores_list[j]
            beta_j = masked_softmax(scores_j, mask_prem)
            b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
            hypothesis_attn.append(b_tilde_j)

            betas.append(beta_j)

        # Make attention-weighted sentence representations into one tensor,
        premise_attns = tf.stack(premise_attn, axis=1) # (?, 50, 600)
        hypothesis_attns = tf.stack(hypothesis_attn, axis=1) # (?, 50, 600)

        # For making attention plots,
        self.alpha_s = tf.stack(alphas, axis=2) # (?, 50, 50, 1)
        self.beta_s = tf.stack(betas, axis=2) # (?, 50, 50, 1)


        ### Subcomponent Inference ###

        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
        m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul], 2) 

        ### Inference Composition ###

        v2_outs, c4 = biLSTM(m_b, dim=self.dim, seq_len=hyp_seq_lengths, name='v2') # hypothesis
        # same to hypothesis premise part, calculate v1 based on v2 during Inference Composition
        with tf.variable_scope("conditional_inference_composition-v1") as v1_scope:
            v1_outs, c3 = reader(m_a, prem_seq_lengths, self.dim, c4, scope=v1_scope) # premise

        v1_bi = tf.concat(v1_outs, axis=2) # (?, 50, 600)
        v2_bi = tf.concat(v2_outs, axis=2) # (?, 50, 600)


        ### Pooling Layer ###
        v_1_sum = tf.reduce_sum(v1_bi, 1) # 整列求和 (?, 600) 把每句话的50个单词省略了?
        v_1_ave = tf.div(v_1_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1)) # (?, 600)

        v_2_sum = tf.reduce_sum(v2_bi, 1) # 整列求和 (?, 600)
        v_2_ave = tf.div(v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1)) # (?, 600)

        v_1_max = tf.reduce_max(v1_bi, 1) # 整列求和 (?, 600)
        v_2_max = tf.reduce_max(v2_bi, 1) # 整列求和 (?, 600)


        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

        # MLP layer
        h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)
        h_fold_mlp = tf.reshape(h_mlp, [-1, self.description_num, self.dim])
        h_mean_mlp = tf.reduce_mean(h_fold_mlp, 1)

        # Dropout applied to classifier
        h_drop = tf.nn.dropout(h_mean_mlp, self.keep_rate_ph)

        # Get prediction
        self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl


        # Define the cost function
        self.total_cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits)) # 一个数字


In [0]:
# data_processing functions

def tokenize(string):
    string = re.sub(r'\(|\)', '', string)
    return string.split()
  
def loadEmbedding_rand(path, word_indices):
    """
    Load GloVe embeddings. Doing a random normal initialization for OOV words.
    """
    n = len(word_indices)
    m = 300
    emb = np.empty((n, m), dtype=np.float32)

    emb[:,:] = np.random.normal(size=(n,m))

    # Explicitly assign embedding of <PAD> to be zeros.
    emb[0:2, :] = np.zeros((1,m), dtype="float32")
    
    with open(path, 'r') as f:
        for i, line in enumerate(f):
            if 300 != None:
                if i >= 300:
                    break
            
            s = line.split()
            if s[0] in word_indices:
                if len(s) > 301:
                    tail = s[len(s)-300:]                
                    head = [s[0]]
                    s = head + tail
                    # print(head)
                emb[word_indices[s[0]], :] = np.asarray(s[1:]) 
    return emb

def load_nli_data(path):
    """
    Load MultiNLI or SNLI data.
    If the "snli" parameter is set to True, a genre label of snli will be assigned to the data. 
    """
    data = []
    with open(path) as f:
        for line in f:
            loaded_example = json.loads(line)
            if loaded_example["gold_label"] not in NLI_LABEL_MAP:
                continue
            loaded_example["label"] = NLI_LABEL_MAP[loaded_example["gold_label"]]
            data.append(loaded_example)
        random.seed(1)
        random.shuffle(data)
    return data

  
  
def load_uwre_data(path):
    """
    Load UWRE data.
    "uwre" is set to "genre". 
    """
    data = []
    with open(path) as f:
        for line in f:
            loaded_example = json.loads(line)
            if loaded_example["gold_label"] not in LABEL_MAP:
                continue
            loaded_example["label"] = LABEL_MAP[loaded_example["gold_label"]]
            loaded_example["genre"] = "uwre"
            data.append(loaded_example)
        random.seed(1)
        random.shuffle(data)
    return data

def build_uwre_dictionary(uwre_training_datasets, relation_descriptions):
    """
    Extract vocabulary and build dictionary.
    """
    word_counter = collections.Counter()
    for i, dataset in enumerate(uwre_training_datasets):
        for example in dataset:
            word_counter.update(tokenize(example['sentence']))
    
    for relation in relation_descriptions:
        for description in relation_descriptions[relation]:
            word_counter.update(tokenize(description))
    
    vocabulary = set([word for word in word_counter])
    vocabulary = list(vocabulary)
    vocabulary = [PADDING, UNKNOWN] + vocabulary

    word_indices = dict(zip(vocabulary, range(len(vocabulary))))

    return word_indices
  
def build_dictionary(multi_nli_datasets, uwre_training_datasets, relation_descriptions):
    """
    Extract vocabulary and build dictionary.
    """
    word_counter = collections.Counter()
    for i, dataset in enumerate(multi_nli_datasets):
        for example in dataset:
            word_counter.update(tokenize(example['sentence1_binary_parse']))
            word_counter.update(tokenize(example['sentence2_binary_parse']))
            
    for i, dataset in enumerate(uwre_training_datasets):
        for example in dataset:
            word_counter.update(tokenize(example['sentence']))
    
    for relation in relation_descriptions:
        for description in relation_descriptions[relation]:
            word_counter.update(tokenize(description))
    
    vocabulary = set([word for word in word_counter])
    vocabulary = list(vocabulary)
    vocabulary = [PADDING, UNKNOWN] + vocabulary

    word_indices = dict(zip(vocabulary, range(len(vocabulary))))

    return word_indices
  
def uwre_sentences_to_padded_index_sequences(word_indices, datasets):
    """
    Annotate datasets with feature vectors. Adding right-sided padding.
    """
    for i, dataset in enumerate(datasets):
        for example in dataset:
            example['sentence' + '_index_sequence'] = np.zeros((FIXED_PARAMETERS["seq_length"]), dtype=np.int32)

            token_sequence = tokenize(example['sentence'])
            padding = FIXED_PARAMETERS["seq_length"] - len(token_sequence)

            for i in range(FIXED_PARAMETERS["seq_length"]):
                if i >= len(token_sequence):
                    index = word_indices[PADDING]
                else:
                    if token_sequence[i] in word_indices:
                        index = word_indices[token_sequence[i]]
                    else:
                        index = word_indices[UNKNOWN]
                example['sentence' + '_index_sequence'][i] = index

def descriptions_to_padded_index_sequences(word_indices, relation_descriptions):
    padded_relation_descriptions = {}

    for relation in relation_descriptions:
        descriptions = np.zeros((len(relation_descriptions[relation]), FIXED_PARAMETERS["seq_length"]), dtype=np.int32)
        for j,description in enumerate(relation_descriptions[relation]):
            token_sequence = tokenize(description)
            padding = FIXED_PARAMETERS["seq_length"] - len(token_sequence)

            for i in range(FIXED_PARAMETERS["seq_length"]):
                if i >= len(token_sequence):
                    index = word_indices[PADDING]
                else:
                    if token_sequence[i] in word_indices:
                        index = word_indices[token_sequence[i]]
                    else:
                        index = word_indices[unknown]
                descriptions[j][i] = index
        padded_relation_descriptions[relation] = descriptions
    return padded_relation_descriptions
  

def sentences_to_padded_index_sequences(word_indices, datasets):
    """
    Annotate datasets with feature vectors. Adding right-sided padding. 
    """
    for i, dataset in enumerate(datasets):
        for example in dataset:
            for sentence in ['sentence1_binary_parse', 'sentence2_binary_parse']:
                example[sentence + '_index_sequence'] = np.zeros((FIXED_PARAMETERS["seq_length"]), dtype=np.int32)

                token_sequence = tokenize(example[sentence])
                padding = FIXED_PARAMETERS["seq_length"] - len(token_sequence)

                for i in range(FIXED_PARAMETERS["seq_length"]):
                    if i >= len(token_sequence):
                        index = word_indices[PADDING]
                    else:
                        if token_sequence[i] in word_indices:
                            index = word_indices[token_sequence[i]]
                        else:
                            index = word_indices[UNKNOWN]
                    example[sentence + '_index_sequence'][i] = index



In [0]:
training_mnli = load_nli_data(FIXED_PARAMETERS["training_mnli"])
dev_matched = load_nli_data(FIXED_PARAMETERS["dev_matched"])
dev_mismatched = load_nli_data(FIXED_PARAMETERS["dev_mismatched"])
# training_uwre = load_uwre_data(FIXED_PARAMETERS["training_uwre"])
dev_uwre = load_uwre_data(FIXED_PARAMETERS["dev_uwre"])
# test_uwre = load_uwre_data(FIXED_PARAMETERS["test_uwre"])


In [0]:
with open(the_path + "extended_relation_descriptions.json", 'r') as file:
    relation_descriptions = json.load(file)

In [0]:
dictpath = os.path.join(FIXED_PARAMETERS["log_path"], 'dictionary') + ".p"

if not os.path.isfile(dictpath):
  print("create new dictionary")
  word_indices = build_dictionary([training_mnli], [training_uwre], relation_descriptions)
  pickle.dump(word_indices, open(dictpath, "wb"))
else:
  print("load exist dictionary")
  word_indices = pickle.load(open(dictpath, "rb"))

load exist dictionary


In [0]:
sentences_to_padded_index_sequences(word_indices, [training_mnli,dev_matched, dev_mismatched])
uwre_sentences_to_padded_index_sequences(word_indices, [dev_uwre])
padded_relation_descriptions = descriptions_to_padded_index_sequences(word_indices, relation_descriptions)
loaded_embeddings = loadEmbedding_rand(FIXED_PARAMETERS["embedding_data_path"], word_indices)

In [0]:
# class model classifier

learning_rate =  FIXED_PARAMETERS["learning_rate"]
display_epoch_freq = 1
display_step_freq = 50
embedding_dim = FIXED_PARAMETERS["word_embedding_dim"]
dim = FIXED_PARAMETERS["hidden_embedding_dim"]
batch_size = FIXED_PARAMETERS["batch_size"]
emb_train = FIXED_PARAMETERS["emb_train"]
keep_rate = FIXED_PARAMETERS["keep_rate"]
sequence_length = FIXED_PARAMETERS["seq_length"]
description_num = int(FIXED_PARAMETERS["description_num"])


In [0]:
logger.Log("Building model from %s.py" %(FIXED_PARAMETERS["model_type"]))

[1] Building model from mlp_esim.py


In [0]:
my_model = msim(seq_length=sequence_length, emb_dim=embedding_dim,hidden_dim=dim, embeddings=loaded_embeddings,emb_train=emb_train, description_num=description_num)
optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9, beta2=0.999).minimize(my_model.total_cost)

W0727 19:37:39.366694 139950971316096 deprecation.py:506] From <ipython-input-7-04d88b008a3e>:27: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0727 19:37:40.378069 139950971316096 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0727 19:37:40.379876 139950971316096 deprecation.py:323] From <ipython-input-6-af3a5211d4f8>:66: LSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
Th

In [0]:
logger.Log("Initializing variables")
init = tf.global_variables_initializer()
sess = None
saver = tf.train.Saver()

[1] Initializing variables


In [0]:
def get_uwre_minibatch(dataset, start_index, end_index):
    indices = range(start_index, end_index)
    premise_list = []
    hypothesis_list = []

    for i in indices:
        relation = dataset[i]['relation']

        premise_instance = dataset[i]['sentence_index_sequence']
        premise = [premise_instance] * description_num
        premise_list.append(premise)

        hypothesis_len = len(padded_relation_descriptions[relation])
        hypothesis_ind = np.random.choice(hypothesis_len, description_num, replace=False)
        hypothesis = padded_relation_descriptions[relation][hypothesis_ind]
        hypothesis_list.append(hypothesis)

    premise_vectors = np.vstack(premise_list)
    hypothesis_vectors = np.vstack(hypothesis_list)
    genres = [dataset[i]['genre'] for i in indices]
    labels = [dataset[i]['label'] for i in indices]

    return premise_vectors, hypothesis_vectors, labels, genres

def get_minibatch(dataset, start_index, end_index):
    indices = range(start_index, end_index)
    premise_vectors = np.vstack([[dataset[i]['sentence1_binary_parse_index_sequence']] * description_num for i in indices])
    hypothesis_vectors = np.vstack([[dataset[i]['sentence2_binary_parse_index_sequence']] * description_num for i in indices])
    genres = [dataset[i]['genre'] for i in indices]
    labels = [dataset[i]['label'] for i in indices]
    return premise_vectors, hypothesis_vectors, labels, genres

def restore(sess, best=True):
    if True:
        path = os.path.join(FIXED_PARAMETERS["ckpt_path"], modname) + ".ckpt_best"
    else:
        path = os.path.join(FIXED_PARAMETERS["ckpt_path"], modname) + ".ckpt"
    sess = tf.Session()
    sess.run(init)
    saver.restore(sess, path)
    logger.Log("Model restored from file: %s" % path)

def uwre_classify(model, sess, examples):
    # This classifies a list of examples
    total_batch = int(len(examples) / batch_size)
    logits = np.empty(2)
    genres = []
    for i in range(total_batch):
        minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, minibatch_genres = get_uwre_minibatch(examples, batch_size * i, batch_size * (i + 1))
        feed_dict = {model.premise_x: minibatch_premise_vectors,
                            model.hypothesis_x: minibatch_hypothesis_vectors,
                            model.y: minibatch_labels,
                            model.keep_rate_ph: 1.0}
        genres += minibatch_genres
        logit, cost = sess.run([model.logits, model.total_cost], feed_dict)
        logits = np.vstack([logits, logit])
    return genres, np.argmax(logits[1:], axis=1), cost

def classify(model, sess, examples):
    # This classifies a list of examples
    total_batch = int(len(examples) / batch_size)
    logits = np.empty(2)
    genres = []
    for i in range(total_batch):
        minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, minibatch_genres = get_minibatch(examples,
                                batch_size * i, batch_size * (i + 1))
        feed_dict = {model.premise_x: minibatch_premise_vectors,
                            model.hypothesis_x: minibatch_hypothesis_vectors,
                            model.y: minibatch_labels,
                            model.keep_rate_ph: 1.0}
        genres += minibatch_genres
        logit, cost = sess.run([model.logits, model.total_cost], feed_dict)
        logits = np.vstack([logits, logit])

    return genres, np.argmax(logits[1:], axis=1), cost
  
def evaluate_classifier(classifier, model, sess, eval_set, batch_size):
    """
    Function to get accuracy and cost of the model, evaluated on a chosen dataset.

    classifier: the model's classfier, it should return genres, logit values, and cost for a given minibatch of the evaluation dataset
    eval_set: the chosen evaluation set, for eg. the dev-set
    batch_size: the size of minibatches.
    """
    correct = 0
    genres, hypotheses, cost = classifier(model, sess, eval_set)
    cost = cost / batch_size
    full_batch = int(len(eval_set) / batch_size) * batch_size
    for i in range(full_batch):
        hypothesis = hypotheses[i]
        if hypothesis == eval_set[i]['label']:
            correct += 1
    return correct / float(len(eval_set)), cost


In [0]:
sess = tf.Session()
sess.run(init)

step = 0
epoch = 0
best_dev_mat = 0.
best_mtrain_acc = 0.
last_train_acc = [.001, .001, .001, .001, .001]
best_step = 0

dev_mat = dev_matched
dev_mismat = dev_mismatched

# Restore most recent checkpoint if it exists. 
# Also restore values for best dev-set accuracy and best training-set accuracy
ckpt_file = os.path.join(FIXED_PARAMETERS["ckpt_path"], modname) + ".ckpt"
if os.path.isfile(ckpt_file + ".meta"):
    if os.path.isfile(ckpt_file + "_best.meta"):
        saver.restore(sess, (ckpt_file + "_best"))
        best_dev_mat, dev_cost_mat = evaluate_classifier(classify, my_model, sess, dev_mat, batch_size)
        best_dev_mismat, dev_cost_mismat = evaluate_classifier(classify, my_model, sess, dev_mismat, batch_size)
        best_mtrain_acc, mtrain_cost = evaluate_classifier(classify, my_model, sess, training_mnli[0:5000], batch_size)
        logger.Log("Restored best matched-dev acc: %f\n Restored best mismatched-dev acc: %f\n \
                   Restored best MulitNLI train acc: %f" %(best_dev_mat, best_dev_mismat, best_mtrain_acc))


    saver.restore(sess, ckpt_file)
    logger.Log("Model restored from file: %s" % ckpt_file)

# Combine MultiNLI and SNLI data. Alpha has a default value of 0, if we want to use SNLI data, it must be passed as an argument.

### Training cycle
logger.Log("Start Training...")


W0727 19:39:35.955064 139950971316096 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
[1] Restored best matched-dev acc: 0.685894
 Restored best mismatched-dev acc: 0.695808
                    Restored best MulitNLI train acc: 0.689200
[1] Model restored from file: /content/gdrive/My Drive/master_thesis/model/pretrain_mlp_15.ckpt
[1] Start Training...


In [0]:
while True:
    training_data = training_mnli
    random.shuffle(training_data)
    avg_cost = 0.
    total_batch = int(len(training_data) / batch_size)

    # Loop over all batches in epoch
    for i in range(total_batch):
        # Assemble a minibatch of the next B examples
        minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, minibatch_genres = get_minibatch(
            training_data, batch_size * i, batch_size * (i + 1))

        # Run the optimizer to take a gradient step, and also fetch the value of the 
        # cost function for logging
        feed_dict = {my_model.premise_x: minibatch_premise_vectors,
                        my_model.hypothesis_x: minibatch_hypothesis_vectors,
                        my_model.y: minibatch_labels,
                        my_model.keep_rate_ph: keep_rate}
        _, c = sess.run([optimizer, my_model.total_cost], feed_dict)

        # Since a single epoch can take a  ages for larger models (ESIM),
        # we'll print  accuracy every 50 steps
        if step % display_step_freq == 0:
            dev_acc_mat, dev_cost_mat = evaluate_classifier(classify, my_model, sess, dev_mat, batch_size)
            dev_acc_mismat, dev_cost_mismat = evaluate_classifier(classify, my_model, sess, dev_mismat, batch_size)

            dev_acc_uwre, dev_cost_uwre = evaluate_classifier(uwre_classify, my_model, sess, dev_uwre, batch_size)

            mtrain_acc, mtrain_cost = evaluate_classifier(classify, my_model, sess, training_mnli[0:5000], batch_size)

            logger.Log("Step: %i\t uwre dev acc: %f\t uwre dev cost %f\t" \
                    %(step, dev_acc_uwre, dev_cost_uwre))
            logger.Log("Step: %i\t Dev-matched acc: %f\t Dev-mismatched acc: %f\t \
                    MultiNLI train acc: %f" %(step, dev_acc_mat,
                        dev_acc_mismat, mtrain_acc))
            logger.Log("Step: %i\t Dev-matched cost: %f\t Dev-mismatched cost: %f\t \
                    MultiNLI train cost: %f" %(step, dev_cost_mat,
                        dev_cost_mismat, mtrain_cost))

        if step % 500 == 0:
            saver.save(sess, ckpt_file)
            best_test = 100 * (1 - best_dev_mat / dev_acc_mat)
            if best_test > 0.04:
                saver.save(sess, ckpt_file + "_best")
                best_dev_mat = dev_acc_mat
                best_mtrain_acc = mtrain_acc
                best_step = step
                logger.Log("Checkpointing with new best matched-dev accuracy: %f" %(best_dev_mat))

        step += 1

        # Compute average loss
        avg_cost += c / (total_batch * batch_size)

    # Display some statistics about the epoch
    if epoch % display_epoch_freq == 0:
        logger.Log("Epoch: %i\t Avg. Cost: %f" %(epoch+1, avg_cost))

    epoch += 1
    last_train_acc[(epoch % 5) - 1] = mtrain_acc

    # Early stopping
    progress = 1000 * (sum(last_train_acc)/(5 * min(last_train_acc)) - 1)

    if (progress < 0.1) or (step > best_step + 30000):
        logger.Log("Best matched-dev accuracy: %s" %(best_dev_mat))
        logger.Log("MultiNLI Train accuracy: %s" %(best_mtrain_acc))
        completed = True
        break


[1] Step: 0	 uwre dev acc: 0.492437	 uwre dev cost 0.044546	
[1] Step: 0	 Dev-matched acc: 0.688434	 Dev-mismatched acc: 0.696255	                     MultiNLI train acc: 0.701800
[1] Step: 0	 Dev-matched cost: 0.031693	 Dev-mismatched cost: 0.032678	                     MultiNLI train cost: 0.039820
[1] Checkpointing with new best matched-dev accuracy: 0.688434
[1] Step: 50	 uwre dev acc: 0.484034	 uwre dev cost 0.048499	
[1] Step: 50	 Dev-matched acc: 0.665571	 Dev-mismatched acc: 0.681486	                     MultiNLI train acc: 0.727400
[1] Step: 50	 Dev-matched cost: 0.040901	 Dev-mismatched cost: 0.028750	                     MultiNLI train cost: 0.043138
[1] Step: 100	 uwre dev acc: 0.490756	 uwre dev cost 0.065204	
[1] Step: 100	 Dev-matched acc: 0.643156	 Dev-mismatched acc: 0.643742	                     MultiNLI train acc: 0.726600
[1] Step: 100	 Dev-matched cost: 0.083094	 Dev-mismatched cost: 0.028649	                     MultiNLI train cost: 0.051149
[1] Step: 150	 uwre de

In [0]:
saver.save(sess, ckpt_file + "_best")