# Zero shot sequence labelling
Bidirectional LSTM with attention

https://github.com/marekrei/sequence-labeler

In [1]:
import tensorflow as tf
import numpy
import collections
import re
import configparser

## 1. Configuration Setup

### 1.1 Parse the configuration file using Python's built-in `configparser`

In [2]:
def parse_config(config_section, config_path):
    """
    Read one section of an INI-style file into an ordered dictionary.

    Each value is converted to the most specific type that fits:

    - missing or blank value  -> None
    - "true" / "false" (any case) -> bool
    - anything int() accepts (including signed values such as "-32") -> int
    - anything float() accepts -> float
    - everything else -> str, unchanged

    Args:
        config_section: name of the section to read.
        config_path: path of the configuration file.

    Returns:
        collections.OrderedDict mapping each option name to its converted
        value, in the order configparser yields them.

    Note:
        configparser raises NoSectionError from ``items()`` when the section
        is absent, which is also what happens when the file cannot be read.
    """
    config_parser = configparser.ConfigParser(allow_no_value=True)
    config_parser.read(config_path)
    config = collections.OrderedDict()
    for key, value in config_parser.items(config_section):
        if value is None or not value.strip():
            config[key] = None
            continue
        if value.lower() in ("true", "false"):
            config[key] = config_parser.getboolean(config_section, key)
            continue
        # Try int before float so that signed integers ("-1", "+5") stay
        # integers instead of silently turning into -1.0 / 5.0.
        for convert in (int, float):
            try:
                config[key] = convert(value)
            except ValueError:
                continue
            break
        else:
            # Neither numeric conversion worked: keep the raw string.
            config[key] = value
    return config

def is_float(value):
    """
    Tell whether `value` can be converted with float().

    Returns True when float(value) succeeds and False when it raises
    ValueError. Any other exception is not caught.
    """
    try:
        float(value)
    except ValueError:
        return False
    else:
        return True

In [3]:
# Load the [config] section of 'fcepublic.conf' (path is relative to the
# working directory) into the module-level `config` dictionary used below.
config = parse_config("config", 'fcepublic.conf')
# config

## 2. Read input data and preprocess it

### 2.1 Read input data

In [4]:
def read_input_files(file_paths, max_sentence_length=-1):
    """
    Load sentences from one or more whitespace-separated column files.

    `file_paths` is a comma-separated list of paths. Every non-blank line is
    one token, split on whitespace into columns (callers treat the first
    column as the word and the last as the label). Blank lines separate
    sentences. Every token line must have at least two columns and the same
    number of columns as all lines read before it, across all files.

    Sentences longer than `max_sentence_length` are dropped; a value <= 0
    disables the length filter.

    Returns a list of sentences, each a list of column lists.
    """
    def within_limit(tokens):
        # A non-positive limit means "keep everything".
        return max_sentence_length <= 0 or len(tokens) <= max_sentence_length

    sentences = []
    expected_columns = None
    for file_path in file_paths.strip().split(","):
        with open(file_path, "r") as handle:
            current = []
            for raw_line in handle:
                columns = raw_line.strip().split()
                if columns:
                    assert(len(columns) >= 2)
                    assert(expected_columns is None or len(columns) == expected_columns)
                    expected_columns = len(columns)
                    current.append(columns)
                    continue
                # Blank line: close the sentence in progress, if there is one.
                if current:
                    if within_limit(current):
                        sentences.append(current)
                    current = []
            # The file may end without a trailing blank line.
            if current and within_limit(current):
                sentences.append(current)
    return sentences

# Load each dataset whose path is set in the config; the rest stay None.
data_train = data_dev = data_test = None

# Only the training set is filtered by sentence length.
if config["path_train"] is not None and len(config["path_train"]) > 0:
    data_train = read_input_files(
        config["path_train"],
        config["max_train_sent_length"],
    )

if config["path_dev"] is not None and len(config["path_dev"]) > 0:
    data_dev = read_input_files(config["path_dev"])

# "path_test" may hold several colon-separated entries; all of them are
# merged into a single list of sentences.
if config["path_test"] is not None and len(config["path_test"]) > 0:
    data_test = []
    for path_test in config["path_test"].strip().split(":"):
        data_test.extend(read_input_files(path_test))

View the raw data

In [5]:
# Print the number of training sentences and three samples (indices 0, 7, 50).
# Requires data_train to be loaded and to contain at least 51 sentences.
print('data_train:\n-----------\nNumber of sentences: {}\nSample sentences:\n{}\n\n{}\n\n{}'
      .format(len(data_train), data_train[0], data_train[7], data_train[50]))

data_train:
-----------
Number of sentences: 28731
Sample sentences:
[['Dear', 'c'], ['Sir', 'c'], ['or', 'c'], ['Madam', 'c'], [',', 'c']]

[['You', 'c'], ['promised', 'c'], ['a', 'c'], ['perfect', 'c'], ['evening', 'c'], ['but', 'c'], ['it', 'c'], ['became', 'c'], ['a', 'c'], ['big', 'c'], ['disastrous', 'i'], ['!', 'c']]

[['If', 'c'], ['weather', 'i'], ['is', 'c'], ['hot', 'c'], ['then', 'c'], ['we', 'c'], ['do', 'c'], ["n't", 'c'], ['have', 'c'], ['to', 'c'], ['wear', 'c'], ['under', 'i'], ['wear', 'i'], ['because', 'c'], ['very', 'c'], ['thin', 'c'], ['and', 'c'], ['light', 'c'], ['clothes', 'c'], ['will', 'c'], ['support', 'c'], ['our', 'c'], ['bodies', 'c'], ['.', 'c']]


In [6]:
data_train

[[['Dear', 'c'], ['Sir', 'c'], ['or', 'c'], ['Madam', 'c'], [',', 'c']],
 [['I', 'c'],
  ['am', 'c'],
  ['writing', 'c'],
  ['in', 'c'],
  ['order', 'c'],
  ['to', 'c'],
  ['express', 'c'],
  ['my', 'c'],
  ['disappointment', 'c'],
  ['about', 'i'],
  ['your', 'c'],
  ['musical', 'c'],
  ['show', 'c'],
  ['"', 'c'],
  ['Over', 'c'],
  ['the', 'c'],
  ['Rainbow', 'c'],
  ['"', 'c'],
  ['.', 'c']],
 [['I', 'c'],
  ['saws', 'i'],
  ['the', 'c'],
  ['show', 'c'],
  ["'s", 'c'],
  ['advertisement', 'c'],
  ['hanging', 'c'],
  ['up', 'c'],
  ['of', 'i'],
  ['a', 'c'],
  ['wall', 'c'],
  ['in', 'c'],
  ['London', 'c'],
  ['where', 'c'],
  ['I', 'c'],
  ['was', 'c'],
  ['spending', 'c'],
  ['my', 'c'],
  ['holiday', 'c'],
  ['with', 'c'],
  ['some', 'c'],
  ['friends', 'c'],
  ['.', 'c']],
 [['I', 'c'],
  ['convinced', 'c'],
  ['them', 'c'],
  ['to', 'c'],
  ['go', 'c'],
  ['there', 'c'],
  ['with', 'c'],
  ['me', 'c'],
  ['because', 'c'],
  ['I', 'c'],
  ['had', 'c'],
  ['heard', 'c'],
  ['go

### 2.2 Convert data into batches

Converts the raw data into batches (with the default config, at most 32 sentences per batch). Each batch contains the indices of its sentences rather than the sentences themselves.

In [7]:
def create_batches_of_sentence_ids(sentences, batch_equal_size, max_batch_size):
    """
    Group sentence indices into batches.

    Args:
        sentences: sequence of sentences; only len(sentence) is used.
        batch_equal_size: if True, every batch contains only sentences of the
            same length. Sentences are grouped by length, in order of first
            appearance of each length.
        max_batch_size: if positive, the maximum number of sentences per
            batch. If zero or negative, abs(max_batch_size) is a word budget:
            in equal-length mode each batch gets
            floor(budget / sentence_length) sentences (at least one); otherwise
            a batch is closed as soon as
            (sentences in batch) * (longest sentence in batch) reaches the
            budget.

    Returns:
        A list of batches, each a list of indices into `sentences`.
    """
    batches_of_sentence_ids = []
    # NOTE: kept as `== True` so that only True (or 1) selects this branch,
    # exactly as before.
    if batch_equal_size == True:
        # sentence length -> [ids of the sentences with that length]
        sentence_ids_by_length = collections.OrderedDict()
        for sentence_id, sentence in enumerate(sentences):
            sentence_ids_by_length.setdefault(len(sentence), []).append(sentence_id)

        # Slice each same-length group into consecutive chunks of batch_size.
        for sentence_length, sentence_ids in sentence_ids_by_length.items():
            if max_batch_size > 0:
                batch_size = max_batch_size
            else:
                # A sentence longer than the word budget (or a zero budget)
                # used to yield batch_size == 0, which made range() raise
                # ValueError; a zero-length sentence divided by zero.
                # Always put at least one sentence in a batch.
                batch_size = max(1, int((-1.0 * max_batch_size) / max(sentence_length, 1)))

            for start in range(0, len(sentence_ids), batch_size):
                batches_of_sentence_ids.append(sentence_ids[start:start + batch_size])
    else:
        # Same idea, but sentences keep their original order and a batch may
        # mix different lengths.
        current_batch = []
        max_sentence_length = 0
        for sentence_id, sentence in enumerate(sentences):
            current_batch.append(sentence_id)
            max_sentence_length = max(max_sentence_length, len(sentence))
            if max_batch_size > 0:
                batch_is_full = len(current_batch) >= max_batch_size
            else:
                batch_is_full = len(current_batch) * max_sentence_length >= (-1 * max_batch_size)
            if batch_is_full:
                batches_of_sentence_ids.append(current_batch)
                current_batch = []
                max_sentence_length = 0
        # Leftover sentences form a final, smaller batch.
        if len(current_batch) > 0:
            batches_of_sentence_ids.append(current_batch)
    return batches_of_sentence_ids

In [8]:
# Show the two batching options from the config, then split the training set
# into batches of sentence indices using those options.
print('Current config:\nbatch_equal_size: {}\nmax_batch_size: {}'.format(config['batch_equal_size'], config['max_batch_size']))
batches_of_sentence_ids = create_batches_of_sentence_ids(data_train, config['batch_equal_size'], config['max_batch_size'])

Current config:
batch_equal_size: False
max_batch_size: 32


Example: with this config each batch holds 32 sentence indices (a final leftover batch may be shorter).

In [22]:
# First 3 example
# Print the first three batches. Note that `i` here is a whole batch (a list
# of sentence indices), not an integer index.
for i in batches_of_sentence_ids[:3]:
    print('length: {}\n{}\n'.format(len(i), i))

length: 32
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]

length: 32
[32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]

length: 32
[64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95]



## 3. Core code in `labeler.py`

- `build_vocabs`
- `construct_network`

In [10]:
class SequenceLabeler(object):
    """
    Word-level sequence labeller built on a bidirectional LSTM (TensorFlow 1.x
    graph API), with an optional character-level bidirectional LSTM, optional
    language-modelling auxiliary losses and an optional CRF output layer.

    Typical use: create the object with a config dictionary, call
    build_vocabs() to create the token/label vocabularies, call
    construct_network() to build the graph, then use
    create_input_dictionary_for_batch() to produce a feed_dict per batch.
    """

    def __init__(self, config):
        """Store the config dictionary; vocabularies are filled by build_vocabs()."""
        self.config = config

        # Placeholder tokens for out-of-vocabulary words and characters.
        self.UNK = "<unk>"
        self.CUNK = "<cunk>"

        self.word2id = None
        self.char2id = None
        self.label2id = None
        self.singletons = None
        # Words found in the embedding file; only set by build_vocabs() when
        # config["vocab_only_embedded"] is True and an embedding path is given.
        self.embedding_vocab = None

    def build_vocabs(self, data_train, data_dev, data_test, embedding_path=None):
        """
        Build char2id, word2id, label2id and the set of singleton words.

        Args:
            data_train: list of sentences; each sentence is a list of tokens
                and each token a sequence whose first item is the word and
                whose last item is the label.
            data_dev, data_test: same format or None. They contribute to the
                character and word vocabularies only when
                config["vocab_include_devtest"] is truthy.
            embedding_path: optional path to a whitespace-separated embedding
                file (word followed by values). Only read when
                config["vocab_only_embedded"] is True, in which case word2id
                is restricted to the words present in that file (plus UNK).

        Ids are assigned from most to least frequent; id 0 is reserved for
        CUNK in char2id and UNK in word2id. Labels are always taken from the
        training data only. Prints the vocabulary sizes.
        """
        data_source = list(data_train)
        if self.config["vocab_include_devtest"]:
            if data_dev is not None:
                data_source += data_dev
            if data_test is not None:
                data_source += data_test

        # Create "char_counter" dict containing key:value = char:count pairs
        char_counter = collections.Counter()
        for sentence in data_source:
            for word in sentence:
                char_counter.update(word[0])

        # Create character to index dictionary, ordered from most common to least common
        # Contains key:value = char:id pairs
        self.char2id = collections.OrderedDict([(self.CUNK, 0)])
        for char, count in char_counter.most_common():
            if char not in self.char2id:
                self.char2id[char] = len(self.char2id)

        # Create "word_counter" dict containing key:value = word:count pairs.
        # Words are normalised the same way translate2id() normalises them.
        word_counter = collections.Counter()
        for sentence in data_source:
            for word in sentence:
                w = word[0]
                if self.config["lowercase"] == True:
                    w = w.lower()
                if self.config["replace_digits"] == True:
                    w = re.sub(r'\d', '0', w)
                word_counter[w] += 1

        # Create word to index dictionary, ordered from most common to least common
        # Contains key:value = word:id pairs
        self.word2id = collections.OrderedDict([(self.UNK, 0)])
        for word, count in word_counter.most_common():
            if self.config["min_word_freq"] <= 0 or count >= self.config["min_word_freq"]:
                if word not in self.word2id:
                    self.word2id[word] = len(self.word2id)

        # Words seen exactly once; translate2id() can randomly map them to UNK.
        self.singletons = set([word for word in word_counter if word_counter[word] == 1])

        label_counter = collections.Counter()
        for sentence in data_train: #this one only based on training data
            for word in sentence:
                label_counter[word[-1]] += 1
        self.label2id = collections.OrderedDict()
        for label, count in label_counter.most_common():
            if label not in self.label2id:
                self.label2id[label] = len(self.label2id)

        if embedding_path is not None and self.config["vocab_only_embedded"] == True:
            self.embedding_vocab = set([self.UNK])
            with open(embedding_path, 'r') as f:
                for line in f:
                    line_parts = line.strip().split()
                    # Skip lines with at most two fields (e.g. a header line).
                    if len(line_parts) <= 2:
                        continue
                    w = line_parts[0]
                    if self.config["lowercase"] == True:
                        w = w.lower()
                    if self.config["replace_digits"] == True:
                        w = re.sub(r'\d', '0', w)
                    self.embedding_vocab.add(w)
            # Re-number the surviving words so ids stay contiguous.
            word2id_revised = collections.OrderedDict()
            for word in self.word2id:
                # Fixed: this used the undefined bare name `embedding_vocab`,
                # which raised NameError as soon as this branch ran.
                if word in self.embedding_vocab and word not in word2id_revised:
                    word2id_revised[word] = len(word2id_revised)
            self.word2id = word2id_revised

        print("n_words: " + str(len(self.word2id)))
        print("n_chars: " + str(len(self.char2id)))
        print("n_labels: " + str(len(self.label2id)))
        print("n_singletons: " + str(len(self.singletons)))

    def construct_network(self):
        """
        Build the TensorFlow graph.

        Creates the input placeholders, the word embedding lookup, the optional
        character-level bidirectional LSTM (merged into the word representation
        by "concat" or "attention"), input dropout, the word-level bidirectional
        LSTM, an optional hidden layer and the output layer `self.scores`.
        The loss is accumulated in `self.loss` and `self.train_op` is created
        by construct_optimizer().

        Requires build_vocabs() to have been called (vocabulary sizes are
        read here). The tf.Print / print calls only emit debugging output.

        NOTE(review): when config["crf_on_top"] is True, `self.predictions`
        and `self.probabilities` are not created by this method.
        """

        # These will be populated with tensorflow API 'feed_dict'
        self.word_ids = tf.placeholder(tf.int32, [None, None], name="word_ids")
        self.char_ids = tf.placeholder(tf.int32, [None, None, None], name="char_ids")
        self.sentence_lengths = tf.placeholder(tf.int32, [None], name="sentence_lengths")
        self.word_lengths = tf.placeholder(tf.int32, [None, None], name="word_lengths")
        self.label_ids = tf.placeholder(tf.int32, [None, None], name="label_ids")
        self.learningrate = tf.placeholder(tf.float32, name="learningrate")
        self.is_training = tf.placeholder(tf.int32, name="is_training")
        print('self.word_ids: {}'.format(self.word_ids))

        self.loss = 0.0
        input_tensor = None
        input_vector_size = 0

        self.initializer = None
        if self.config["initializer"] == "normal":
            self.initializer = tf.random_normal_initializer(mean=0.0, stddev=0.1)
        elif self.config["initializer"] == "glorot":
            self.initializer = tf.glorot_uniform_initializer()
        elif self.config["initializer"] == "xavier":
            self.initializer = tf.glorot_normal_initializer()
        else:
            raise ValueError("Unknown initializer")

        # ---------------------
        # End of initialization
        # ---------------------

        # Word embedding matrix of shape
        #   [number of words in word2id, config["word_embedding_size"]]
        # initialised with zeros when config["emb_initial_zero"] is True and
        # with self.initializer otherwise. It is trainable (fine-tuned) only
        # when config["train_embeddings"] is True.
        # NOTE(review): an earlier comment said these are preloaded from GloVe;
        # no pre-trained vectors are loaded in this method.
        self.word_embeddings = tf.get_variable("word_embeddings",
            shape=[len(self.word2id), self.config["word_embedding_size"]],
            initializer=(tf.zeros_initializer() if self.config["emb_initial_zero"] == True else self.initializer),
            trainable=(True if self.config["train_embeddings"] == True else False))
        print('self.word_embeddings: {}'.format(self.word_embeddings))

        # This provides a way to lookup the word embeddings based on an input list of word indices
        # tf.nn.embedding_lookup does slicing similar to numpy matrix slicing
        # (https://stackoverflow.com/questions/34870614/what-does-tf-nn-embedding-lookup-function-do)
        input_tensor = tf.nn.embedding_lookup(self.word_embeddings, self.word_ids)
        input_tensor = tf.Print(input_tensor, [tf.shape(input_tensor), input_tensor], 'input_tensor: ', summarize = 10)
        print('input_tensor: {}'.format(input_tensor))

        input_vector_size = self.config["word_embedding_size"]

        # The character-level component is built only when both sizes are positive.
        if self.config["char_embedding_size"] > 0 and self.config["char_recurrent_size"] > 0:

            # Variables created in this scope get the name prefix "chars/".
            # The control dependency checks at run time that the last dimension
            # of char_ids equals the largest value in word_lengths.
            with tf.variable_scope("chars"), tf.control_dependencies( [ tf.assert_equal(tf.shape(self.char_ids)[2], tf.reduce_max(self.word_lengths), message="Char dimensions don't match") ] ):

                # Similar to self.word_embeddings
                self.char_embeddings = tf.get_variable("char_embeddings",
                    shape=[len(self.char2id), self.config["char_embedding_size"]],
                    initializer=self.initializer,
                    trainable=True)

                # Similar to word input_tensor
                char_input_tensor = tf.nn.embedding_lookup(self.char_embeddings, self.char_ids)
                char_input_tensor = tf.Print(char_input_tensor, [tf.shape(char_input_tensor), char_input_tensor], 'char_input_tensor: ', summarize = 10)
                print('char_input_tensor: {}'.format(char_input_tensor))

                # Merge the first two dimensions so every word becomes one
                # sequence of characters for the character LSTM.
                s = tf.shape(char_input_tensor)
                char_input_tensor = tf.reshape(char_input_tensor, shape=[s[0]*s[1], s[2], self.config["char_embedding_size"]])
                _word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]])
                char_input_tensor = tf.Print(char_input_tensor, [tf.shape(char_input_tensor), char_input_tensor], 'char_input_tensor: ', summarize = 10)
                print('char_input_tensor: {}'.format(char_input_tensor))

                char_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(self.config["char_recurrent_size"],
                    use_peepholes=self.config["lstm_use_peepholes"],
                    state_is_tuple=True,
                    initializer=self.initializer,
                    reuse=False)
                char_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(self.config["char_recurrent_size"],
                    use_peepholes=self.config["lstm_use_peepholes"],
                    state_is_tuple=True,
                    initializer=self.initializer,
                    reuse=False)

                char_lstm_outputs = tf.nn.bidirectional_dynamic_rnn(char_lstm_cell_fw, char_lstm_cell_bw, char_input_tensor, sequence_length=_word_lengths, dtype=tf.float32, time_major=False)

                # Only the second element of each direction's final state tuple
                # is used; the per-step outputs are discarded.
                _, ((_, char_output_fw), (_, char_output_bw)) = char_lstm_outputs
                char_output_tensor = tf.concat([char_output_fw, char_output_bw], axis=-1)
                char_output_tensor = tf.Print(char_output_tensor, [tf.shape(char_output_tensor), char_output_tensor], 'char_output_tensor: ')
                print('char_ouput_tensor: {}'.format(char_output_tensor))

                # Restore the two leading dimensions that were merged above.
                char_output_tensor = tf.reshape(char_output_tensor, shape=[s[0], s[1], 2 * self.config["char_recurrent_size"]])
                char_output_tensor = tf.Print(char_output_tensor, [tf.shape(char_output_tensor), char_output_tensor], 'char_output_tensor: ')
                print('char_ouput_tensor: {}'.format(char_output_tensor))

                char_output_vector_size = 2 * self.config["char_recurrent_size"]
                char_output_tensor = tf.Print(char_output_tensor, [tf.shape(char_output_tensor), char_output_tensor], 'char_output_tensor: ')
                print('char_ouput_tensor: {}'.format(char_output_tensor))

                if self.config["lmcost_char_gamma"] > 0.0:
                    self.loss += self.config["lmcost_char_gamma"] * self.construct_lmcost(char_output_tensor, char_output_tensor, self.sentence_lengths, self.word_ids, "separate", "lmcost_char_separate")
                if self.config["lmcost_joint_char_gamma"] > 0.0:
                    self.loss += self.config["lmcost_joint_char_gamma"] * self.construct_lmcost(char_output_tensor, char_output_tensor, self.sentence_lengths, self.word_ids, "joint", "lmcost_char_joint")

                # Optional dense layer on top of the character representation.
                if self.config["char_hidden_layer_size"] > 0:

                    # The "attention" method needs the character vector to have
                    # the word embedding size, so that size overrides the
                    # configured hidden layer size.
                    char_hidden_layer_size = self.config["word_embedding_size"] if self.config["char_integration_method"] == "attention" else self.config["char_hidden_layer_size"]

                    # input = char_output_tensor
                    # output dim = char_hidden_layer_size
                    char_output_tensor = tf.layers.dense(char_output_tensor, char_hidden_layer_size, activation=tf.tanh, kernel_initializer=self.initializer)
                    char_output_tensor = tf.Print(char_output_tensor, [tf.shape(char_output_tensor), char_output_tensor], 'char_output_tensor: ')
                    print('char_ouput_tensor: {}'.format(char_output_tensor))

                    char_output_vector_size = char_hidden_layer_size

                if self.config["char_integration_method"] == "concat":

                    # combines character and word embeddings by concatenation
                    input_tensor = tf.concat([input_tensor, char_output_tensor], axis=-1)
                    input_tensor = tf.Print(input_tensor, [tf.shape(input_tensor), input_tensor], 'input_tensor: ')
                    print('input_tensor: {}'.format(input_tensor))

                    input_vector_size += char_output_vector_size

                elif self.config["char_integration_method"] == "attention":
                    assert(char_output_vector_size == self.config["word_embedding_size"]), "This method requires the char representation to have the same size as word embeddings"
                    # Cosine cost between the character vector and the
                    # (gradient-stopped) word embedding; UNK words and padding
                    # positions contribute zero.
                    static_input_tensor = tf.stop_gradient(input_tensor)
                    is_unk = tf.equal(self.word_ids, self.word2id[self.UNK])
                    char_output_tensor_normalised = tf.nn.l2_normalize(char_output_tensor, 2)
                    static_input_tensor_normalised = tf.nn.l2_normalize(static_input_tensor, 2)
                    cosine_cost = 1.0 - tf.reduce_sum(tf.multiply(char_output_tensor_normalised, static_input_tensor_normalised), axis=2)
                    is_padding = tf.logical_not(tf.sequence_mask(self.sentence_lengths, maxlen=tf.shape(self.word_ids)[1]))
                    cosine_cost_unk = tf.where(tf.logical_or(is_unk, is_padding), x=tf.zeros_like(cosine_cost), y=cosine_cost)
                    self.loss += self.config["char_attention_cosine_cost"] * tf.reduce_sum(cosine_cost_unk)
                    # Sigmoid gate that mixes the word and character vectors
                    # element-wise.
                    attention_evidence_tensor = tf.concat([input_tensor, char_output_tensor], axis=2)
                    attention_output = tf.layers.dense(attention_evidence_tensor, self.config["word_embedding_size"], activation=tf.tanh, kernel_initializer=self.initializer)
                    attention_output = tf.layers.dense(attention_output, self.config["word_embedding_size"], activation=tf.sigmoid, kernel_initializer=self.initializer)
                    input_tensor = tf.multiply(input_tensor, attention_output) + tf.multiply(char_output_tensor, (1.0 - attention_output))

                elif self.config["char_integration_method"] == "none":
                    input_tensor = input_tensor
                else:
                    raise ValueError("Unknown char integration method")

        # Evaluates to config["dropout_input"] when is_training is 1 and to 1.0
        # when it is 0; passed as the second argument of tf.nn.dropout
        # (keep_prob in TF 1.x).
        dropout_input = self.config["dropout_input"] * tf.cast(self.is_training, tf.float32) + (1.0 - tf.cast(self.is_training, tf.float32))
        input_tensor =  tf.nn.dropout(input_tensor, dropout_input, name="dropout_word")
        input_tensor = tf.Print(input_tensor, [tf.shape(input_tensor), input_tensor], 'input_tensor: ')
        print('input_tensor: {}'.format(input_tensor))

        word_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(self.config["word_recurrent_size"],
            use_peepholes=self.config["lstm_use_peepholes"],
            state_is_tuple=True,
            initializer=self.initializer,
            reuse=False)
        word_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(self.config["word_recurrent_size"],
            use_peepholes=self.config["lstm_use_peepholes"],
            state_is_tuple=True,
            initializer=self.initializer,
            reuse=False)

        # Run-time check: the second dimension of word_ids must equal the
        # longest sentence length in the batch.
        with tf.control_dependencies([tf.assert_equal(tf.shape(self.word_ids)[1], tf.reduce_max(self.sentence_lengths), message="Sentence dimensions don't match")]):
            (lstm_outputs_fw, lstm_outputs_bw), _ = tf.nn.bidirectional_dynamic_rnn(word_lstm_cell_fw, word_lstm_cell_bw, input_tensor, sequence_length=self.sentence_lengths, dtype=tf.float32, time_major=False)

        dropout_word_lstm = self.config["dropout_word_lstm"] * tf.cast(self.is_training, tf.float32) + (1.0 - tf.cast(self.is_training, tf.float32))
        lstm_outputs_fw =  tf.nn.dropout(lstm_outputs_fw, dropout_word_lstm)
        lstm_outputs_bw =  tf.nn.dropout(lstm_outputs_bw, dropout_word_lstm)

        if self.config["lmcost_lstm_gamma"] > 0.0:
            self.loss += self.config["lmcost_lstm_gamma"] * self.construct_lmcost(lstm_outputs_fw, lstm_outputs_bw, self.sentence_lengths, self.word_ids, "separate", "lmcost_lstm_separate")
        if self.config["lmcost_joint_lstm_gamma"] > 0.0:
            self.loss += self.config["lmcost_joint_lstm_gamma"] * self.construct_lmcost(lstm_outputs_fw, lstm_outputs_bw, self.sentence_lengths, self.word_ids, "joint", "lmcost_lstm_joint")

        processed_tensor = tf.concat([lstm_outputs_fw, lstm_outputs_bw], 2)
        processed_tensor_size = self.config["word_recurrent_size"] * 2

        if self.config["hidden_layer_size"] > 0:
            processed_tensor = tf.layers.dense(processed_tensor, self.config["hidden_layer_size"], activation=tf.tanh, kernel_initializer=self.initializer)
            processed_tensor_size = self.config["hidden_layer_size"]

        # One unnormalised score per label for every token position.
        self.scores = tf.layers.dense(processed_tensor, len(self.label2id), activation=None, kernel_initializer=self.initializer, name="output_ff")

        if self.config["crf_on_top"] == True:
            crf_num_tags = self.scores.get_shape()[2].value
            self.crf_transition_params = tf.get_variable("output_crf_transitions", [crf_num_tags, crf_num_tags], initializer=self.initializer)
            log_likelihood, self.crf_transition_params = tf.contrib.crf.crf_log_likelihood(self.scores, self.label_ids, self.sentence_lengths, transition_params=self.crf_transition_params)
            self.loss += self.config["main_cost"] * tf.reduce_sum(-log_likelihood)
        else:
            self.probabilities = tf.nn.softmax(self.scores)
            self.predictions = tf.argmax(self.probabilities, 2)
            loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.scores, labels=self.label_ids)
            # Drop the loss at padding positions before summing.
            mask = tf.sequence_mask(self.sentence_lengths, maxlen=tf.shape(self.word_ids)[1])
            loss_ = tf.boolean_mask(loss_, mask)
            self.loss += self.config["main_cost"] * tf.reduce_sum(loss_)

        self.train_op = self.construct_optimizer(self.config["opt_strategy"], self.loss, self.learningrate, self.config["clip"])


    def construct_lmcost(self, input_tensor_fw, input_tensor_bw, sentence_lengths, target_ids, lmcost_type, name):
        """
        Build a language-modelling auxiliary cost inside variable scope `name`.

        Target ids greater than or equal to (vocab size - 1) are all mapped to
        (vocab size - 1), where vocab size is the smaller of len(word2id) and
        config["lmcost_max_vocab_size"].

        lmcost_type:
            "separate": the forward tensor at position t predicts the word at
                t+1 and the backward tensor at position t predicts the word at
                t-1; the two costs are added.
            "joint": the forward tensor at t-1 concatenated with the backward
                tensor at t+1 predicts the word at t.

        Returns the summed cost tensor. Raises ValueError for any other
        lmcost_type.
        """
        with tf.variable_scope(name):
            lmcost_max_vocab_size = min(len(self.word2id), self.config["lmcost_max_vocab_size"])
            target_ids = tf.where(tf.greater_equal(target_ids, lmcost_max_vocab_size-1), x=(lmcost_max_vocab_size-1)+tf.zeros_like(target_ids), y=target_ids)
            cost = 0.0
            if lmcost_type == "separate":
                lmcost_fw_mask = tf.sequence_mask(sentence_lengths, maxlen=tf.shape(target_ids)[1])[:,1:]
                lmcost_bw_mask = tf.sequence_mask(sentence_lengths, maxlen=tf.shape(target_ids)[1])[:,:-1]
                lmcost_fw = self._construct_lmcost(input_tensor_fw[:,:-1,:], lmcost_max_vocab_size, lmcost_fw_mask, target_ids[:,1:], name=name+"_fw")
                lmcost_bw = self._construct_lmcost(input_tensor_bw[:,1:,:], lmcost_max_vocab_size, lmcost_bw_mask, target_ids[:,:-1], name=name+"_bw")
                cost += lmcost_fw + lmcost_bw
            elif lmcost_type == "joint":
                joint_input_tensor = tf.concat([input_tensor_fw[:,:-2,:], input_tensor_bw[:,2:,:]], axis=-1)
                lmcost_mask = tf.sequence_mask(sentence_lengths, maxlen=tf.shape(target_ids)[1])[:,1:-1]
                cost += self._construct_lmcost(joint_input_tensor, lmcost_max_vocab_size, lmcost_mask, target_ids[:,1:-1], name=name+"_joint")
            else:
                raise ValueError("Unknown lmcost_type: " + str(lmcost_type))
            return cost


    def _construct_lmcost(self, input_tensor, lmcost_max_vocab_size, lmcost_mask, target_ids, name):
        """
        Build one language-model head: a tanh hidden layer of size
        config["lmcost_hidden_layer_size"], a linear layer over
        `lmcost_max_vocab_size` classes and a softmax cross-entropy against
        `target_ids`. Positions where `lmcost_mask` is False contribute zero.
        Returns the summed loss.
        """
        with tf.variable_scope(name):
            lmcost_hidden_layer = tf.layers.dense(input_tensor, self.config["lmcost_hidden_layer_size"], activation=tf.tanh, kernel_initializer=self.initializer)
            lmcost_output = tf.layers.dense(lmcost_hidden_layer, lmcost_max_vocab_size, activation=None, kernel_initializer=self.initializer)
            lmcost_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=lmcost_output, labels=target_ids)
            lmcost_loss = tf.where(lmcost_mask, lmcost_loss, tf.zeros_like(lmcost_loss))
            return tf.reduce_sum(lmcost_loss)


    def construct_optimizer(self, opt_strategy, loss, learningrate, clip):
        """
        Create the training op for `loss`.

        opt_strategy is one of "adadelta", "adam" or "sgd"; anything else
        raises ValueError. When `clip` is positive, gradients are clipped by
        global norm to `clip` before being applied.
        """
        optimizer = None
        if opt_strategy == "adadelta":
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=learningrate)
        elif opt_strategy == "adam":
            optimizer = tf.train.AdamOptimizer(learning_rate=learningrate)
        elif opt_strategy == "sgd":
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=learningrate)
        else:
            raise ValueError("Unknown optimisation strategy: " + str(opt_strategy))

        if clip > 0.0:
            grads, vs     = zip(*optimizer.compute_gradients(loss))
            grads, gnorm  = tf.clip_by_global_norm(grads, clip)
            train_op = optimizer.apply_gradients(zip(grads, vs))
        else:
            train_op = optimizer.minimize(loss)
        return train_op

    def translate2id(self, token, token2id, unk_token, lowercase=False, replace_digits=False, singletons=None, singletons_prob=0.0):
        """
        Map `token` to its id in `token2id`.

        The token is optionally lowercased and has every digit replaced by
        '0' before the lookup. Unknown tokens map to the id of `unk_token`;
        if `unk_token` is None a ValueError is raised instead. When
        `singletons` is given, a known token that is in `singletons` is mapped
        to the `unk_token` id with probability `singletons_prob`.
        """
        if lowercase == True:
            token = token.lower()
        if replace_digits == True:
            token = re.sub(r'\d', '0', token)

        token_id = None
        if singletons is not None and token in singletons and token in token2id and unk_token is not None and numpy.random.uniform() < singletons_prob:
            token_id = token2id[unk_token]
        elif token in token2id:
            token_id = token2id[token]
        elif unk_token is not None:
            token_id = token2id[unk_token]
        else:
            raise ValueError("Unable to handle value, no UNK token: " + str(token))
        return token_id

    def create_input_dictionary_for_batch(self, batch, is_training, learningrate):
        """
        Convert a batch of sentences into a feed_dict for the placeholders
        created by construct_network().

        Args:
            batch: non-empty list of sentences in the build_vocabs() format.
            is_training: True enables random singleton-to-UNK replacement
                (config["singletons_prob"]); the value is also fed to the
                is_training placeholder.
            learningrate: value fed to the learningrate placeholder.

        Arrays are zero-padded to the longest sentence in the batch and to the
        longest word in the batch, the latter capped at
        config["allowed_word_length"] when that is positive.
        """

        sentence_lengths = numpy.array([len(sentence) for sentence in batch])
        max_sentence_length = sentence_lengths.max()
        # Length of the longest word anywhere in the batch.
        max_word_length = numpy.array([ numpy.array([len(word[0]) for word in sentence]).max() for sentence in batch ]).max()

        if self.config["allowed_word_length"] > 0 and self.config["allowed_word_length"] < max_word_length:
            max_word_length = min(max_word_length, self.config["allowed_word_length"])

        word_ids = numpy.zeros((len(batch), max_sentence_length), dtype=numpy.int32)
        char_ids = numpy.zeros((len(batch), max_sentence_length, max_word_length), dtype=numpy.int32)
        word_lengths = numpy.zeros((len(batch), max_sentence_length), dtype=numpy.int32)
        label_ids = numpy.zeros((len(batch), max_sentence_length), dtype=numpy.int32)

        singletons = self.singletons if is_training == True else None
        singletons_prob = self.config["singletons_prob"] if is_training == True else 0.0
        for i in range(len(batch)):
            for j in range(len(batch[i])):
                word = batch[i][j][0]
                word_ids[i][j] = self.translate2id(word, self.word2id, self.UNK, lowercase=self.config["lowercase"], replace_digits=self.config["replace_digits"], singletons=singletons, singletons_prob=singletons_prob)
                label_ids[i][j] = self.translate2id(batch[i][j][-1], self.label2id, None)
                # Fixed: the stored length must be capped like char_ids is.
                # Otherwise, whenever a word is truncated, max(word_lengths)
                # exceeds char_ids' last dimension and the graph's
                # "Char dimensions don't match" assertion fails.
                num_chars = min(len(word), max_word_length)
                word_lengths[i][j] = num_chars
                for k in range(num_chars):
                    char_ids[i][j][k] = self.translate2id(word[k], self.char2id, self.CUNK)

        input_dictionary = {self.word_ids: word_ids, self.char_ids: char_ids, self.sentence_lengths: sentence_lengths, self.word_lengths: word_lengths, self.label_ids: label_ids, self.learningrate: learningrate, self.is_training: is_training}
        return input_dictionary

### 3.1 Create an instance

In [11]:
# Only stores the config; vocabularies and the graph are built in later cells.
labeler = SequenceLabeler(config)

In [12]:
# Build the char/word/label vocabularies. config['preload_vectors'] is passed
# as embedding_path, which is only read when config['vocab_only_embedded'] is True.
labeler.build_vocabs(data_train, data_dev, data_test, config['preload_vectors'])

n_words: 13470
n_chars: 97
n_labels: 2
n_singletons: 6626


In [13]:
#labeler.word2id
#labeler.char2id

### 3.2 Construct network

In [14]:
# Build the TensorFlow graph; the call prints the intermediate tensors and
# their (partially unknown) shapes as they are created (see output below).
labeler.construct_network()

self.word_ids: Tensor("word_ids:0", shape=(?, ?), dtype=int32)
self.word_embeddings: <tf.Variable 'word_embeddings:0' shape=(13470, 300) dtype=float32_ref>
input_tensor: Tensor("Print:0", shape=(?, ?, 300), dtype=float32)
char_input_tensor: Tensor("chars/Print:0", shape=(?, ?, ?, 100), dtype=float32)
char_input_tensor: Tensor("chars/Print_1:0", shape=(?, ?, 100), dtype=float32)
char_ouput_tensor: Tensor("chars/Print_2:0", shape=(?, 200), dtype=float32)
char_ouput_tensor: Tensor("chars/Print_3:0", shape=(?, ?, 200), dtype=float32)
char_ouput_tensor: Tensor("chars/Print_4:0", shape=(?, ?, 200), dtype=float32)
char_ouput_tensor: Tensor("chars/Print_5:0", shape=(?, ?, 50), dtype=float32)
input_tensor: Tensor("chars/Print_6:0", shape=(?, ?, 350), dtype=float32)
input_tensor: Tensor("Print_1:0", shape=(?, ?, 350), dtype=float32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [15]:
labeler.word_embeddings

<tf.Variable 'word_embeddings:0' shape=(13470, 300) dtype=float32_ref>

In [16]:
labeler.char_embeddings

<tf.Variable 'chars/char_embeddings:0' shape=(97, 100) dtype=float32_ref>

In [17]:
labeler.char_ids

<tf.Tensor 'char_ids:0' shape=(?, ?, ?) dtype=int32>

### 3.3 Populate `feed_dict` dictionary for tensorflow placeholders 

In [18]:
# Sample batch: the training sentences selected by the first group of ids.
batch = [data_train[sentence_id] for sentence_id in batches_of_sentence_ids[0]]

# Show the chosen ids first, then every sentence with its position in the batch.
print('Chosen batch: (shape [32, 1])\n{}\n'.format(batches_of_sentence_ids[0]))
print('Actual batch: (shape [32, ?, 2])')
for position, sentence in enumerate(batch):
    print('  {}: {}\n'.format(position, sentence))

Chosen batch: (shape [32, 1])
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]

Actual batch: (shape [32, ?, 2])
  0: [['Dear', 'c'], ['Sir', 'c'], ['or', 'c'], ['Madam', 'c'], [',', 'c']]

  1: [['I', 'c'], ['am', 'c'], ['writing', 'c'], ['in', 'c'], ['order', 'c'], ['to', 'c'], ['express', 'c'], ['my', 'c'], ['disappointment', 'c'], ['about', 'i'], ['your', 'c'], ['musical', 'c'], ['show', 'c'], ['"', 'c'], ['Over', 'c'], ['the', 'c'], ['Rainbow', 'c'], ['"', 'c'], ['.', 'c']]

  2: [['I', 'c'], ['saws', 'i'], ['the', 'c'], ['show', 'c'], ["'s", 'c'], ['advertisement', 'c'], ['hanging', 'c'], ['up', 'c'], ['of', 'i'], ['a', 'c'], ['wall', 'c'], ['in', 'c'], ['London', 'c'], ['where', 'c'], ['I', 'c'], ['was', 'c'], ['spending', 'c'], ['my', 'c'], ['holiday', 'c'], ['with', 'c'], ['some', 'c'], ['friends', 'c'], ['.', 'c']]

  3: [['I', 'c'], ['convinced', 'c'], ['them', 'c'], ['to', 'c'], ['go', 'c'], ['there', 'c'

Explanation of `input_dictionary` format:

`self.word_ids`:

Each number is a word index that can be looked up in `self.word2id`. For example, 58 = Dear, 291 = Sir, 57 = or, etc. Positions beyond the end of a sentence are left as 0 (padding).

`self.char_ids`:

Similarly, each number is a character index that can be looked up in `self.char2id`. For example, 'dear' is encoded as 32 = d, 1 = e, 4 = a, etc.

In [19]:
# Build the feed_dict for the sample batch, then print each placeholder
# together with the value (and its shape, where it has one) that will be fed.
input_dictionary = labeler.create_input_dictionary_for_batch(batch, is_training = True, learningrate = config["learningrate"])
for key, value in input_dictionary.items():
    # Values without a `shape` attribute (e.g. the plain Python `is_training`
    # flag) are reported as 'NA'. `getattr` with a default only covers the
    # missing-attribute case, unlike a bare `except:` which would also hide
    # unrelated errors and KeyboardInterrupt.
    shape = getattr(value, 'shape', 'NA')
    print('key: {}\nvalue: shape={}\n{}\n'.format(key, shape, value))

key: Tensor("word_ids:0", shape=(?, ?), dtype=int32)
value: shape=(32, 35)
[[   58   291    57 ...     0     0     0]
 [    2    39   120 ...     0     0     0]
 [    2 13443     3 ...     0     0     0]
 ...
 [    8   166    92 ...     0     0     0]
 [   94   206    98 ...     0     0     0]
 [   65    78    21 ...     0     0     0]]

key: Tensor("char_ids:0", shape=(?, ?, ?), dtype=int32)
value: shape=(32, 35, 14)
[[[32  1  4 ...  0  0  0]
  [29  6  8 ...  0  0  0]
  [ 3  8  0 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]]

 [[21  0  0 ...  0  0  0]
  [ 4 14  0 ...  0  0  0]
  [16  8  6 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]]

 [[21  0  0 ...  0  0  0]
  [ 7  4 16 ...  0  0  0]
  [ 2  9  1 ...  0  0  0]
  ...
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]
  [ 0  0  0 ...  0  0  0]]

 ...

 [[21  5  0 ...  0  0  0]
  [ 9  6  7 ...  0  0  0]
  [10  1  2 ...  0  0  0]
  

In [20]:
input_dictionary[labeler.word_ids]

array([[   58,   291,    57, ...,     0,     0,     0],
       [    2,    39,   120, ...,     0,     0,     0],
       [    2, 13443,     3, ...,     0,     0,     0],
       ...,
       [    8,   166,    92, ...,     0,     0,     0],
       [   94,   206,    98, ...,     0,     0,     0],
       [   65,    78,    21, ...,     0,     0,     0]])

### 3.4 Run a batch
Refer to `labeler.process_batch`

In [21]:
# Run the graph once on the sample batch in a fresh session.
is_training = True

# Loss, predictions and probabilities are always fetched; `train_op` is added
# only when training (presumably the optimiser step -- confirm against the
# network construction code).
fetches = [labeler.loss, labeler.predictions, labeler.probabilities]
if is_training:
    fetches.append(labeler.train_op)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Only the first three results are kept; the value returned for
    # `train_op`, if it was fetched, is discarded by the slice.
    cost, predicted_labels_, predicted_probs_ = sess.run(fetches, feed_dict=input_dictionary)[:3]

In [23]:
labeler.word2id

OrderedDict([('<unk>', 0),
             ('.', 1),
             ('i', 2),
             ('the', 3),
             (',', 4),
             ('to', 5),
             ('and', 6),
             ('you', 7),
             ('in', 8),
             ('a', 9),
             ('of', 10),
             ('it', 11),
             ('was', 12),
             ('that', 13),
             ('is', 14),
             ('for', 15),
             ('my', 16),
             ('have', 17),
             ('we', 18),
             ('be', 19),
             ('at', 20),
             ('would', 21),
             ('but', 22),
             ('your', 23),
             ('because', 24),
             ("n't", 25),
             ('me', 26),
             ('like', 27),
             ('very', 28),
             ('not', 29),
             ('this', 30),
             ('are', 31),
             ('with', 32),
             ('will', 33),
             ('on', 34),
             ('about', 35),
             ('as', 36),
             ('all', 37),
             ('do', 38),