In [1]:
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


In [2]:
import os
import time
import sys

In [3]:
tf.reset_default_graph()

# data and embedding utils

In [4]:
UNK = "$UNK$"
NUM = "$NUM$"
NONE = "O"

In [5]:
data_dir = "../data"

In [6]:
def load_vocab(fname):
    result = {}
    with open(fname, 'r') as f:
        for idx, word in enumerate(f):
            word = word.strip()
            result[word] = idx
    return result

In [7]:
vocab_words = load_vocab(os.path.join(data_dir, "words.txt"))
vocab_tags = load_vocab(os.path.join(data_dir, "tags.txt"))
vocab_chars = load_vocab(os.path.join(data_dir, "chars.txt"))

In [8]:
nwords = len(vocab_words)
nchars = len(vocab_chars)
ntags = len(vocab_tags)

In [9]:
embeddings_filename = os.path.join(data_dir, "glove.6B.300d.trimmed.npz")
with np.load(embeddings_filename) as data:
    train_embeddings = data["embeddings"]

In [10]:
class CoNLLDataset(object):
    """Class that iterates over CoNLL Dataset

    __iter__ method yields a tuple (words, tags)
        words: list of raw words
        tags: list of raw tags

    If processing_word and processing_tag are not None,
    optional preprocessing is appplied

    Example:
        ```python
        data = CoNLLDataset(filename)
        for sentence, tags in data:
            pass
        ```

    """
    def __init__(self, filename, processing_word=None, processing_tag=None,
                 max_iter=None):
        """
        Args:
            filename: path to the file
            processing_words: (optional) function that takes a word as input
            processing_tags: (optional) function that takes a tag as input
            max_iter: (optional) max number of sentences to yield

        """
        self.filename = filename
        self.processing_word = processing_word
        self.processing_tag = processing_tag
        self.max_iter = max_iter
        self.length = None


    def __iter__(self):
        niter = 0
        with open(self.filename) as f:
            words, tags = [], []
            for line in f:
                line = line.strip()
                if (len(line) == 0 or line.startswith("-DOCSTART-")):
                    if len(words) != 0:
                        niter += 1
                        if self.max_iter is not None and niter > self.max_iter:
                            break
                        yield words, tags
                        words, tags = [], []
                else:
                    ls = line.split(' ')
                    word, tag = ls[0],ls[-1]
                    if self.processing_word is not None:
                        word = self.processing_word(word)
                    if self.processing_tag is not None:
                        tag = self.processing_tag(tag)
                    words += [word]
                    tags += [tag]


    def __len__(self):
        """Iterates once over the corpus to set and store length"""
        if self.length is None:
            self.length = 0
            for _ in self:
                self.length += 1

        return self.length

In [11]:
def preprocessor_f(vocab_words=None, vocab_chars=None,
                    lowercase=False, chars=False, allow_unk=True):
    """Return lambda function that transform a word (string) into list,
    or tuple of (list, id) of int corresponding to the ids of the word and
    its corresponding characters.

    Args:
        vocab: dict[word] = idx

    Returns:
        f("cat") = ([12, 4, 32], 12345)
                 = (list of char ids, word id)

    """
    def f(word):
        # 0. get chars of words
        if vocab_chars is not None and chars == True:
            char_ids = []
            for char in word:
                # ignore chars out of vocabulary
                if char in vocab_chars:
                    char_ids += [vocab_chars[char]]

        # 1. preprocess word
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM

        # 2. get id of word
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            else:
                if allow_unk:
                    word = vocab_words[UNK]
                else:
                    raise Exception("Unknow key is not allowed. Check that "\
                                    "your vocab (tags?) is correct")

        # 3. return tuple char ids, word id
        if vocab_chars is not None and chars == True:
            return char_ids, word
        else:
            return word

    return f

In [12]:
def _pad_sequences(sequences, pad_tok, max_length):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with

    Returns:
        a list of list where each sublist has same length
    """
    sequence_padded, sequence_length = [], []

    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0)
        sequence_padded +=  [seq_]
        sequence_length += [min(len(seq), max_length)]

    return sequence_padded, sequence_length


def pad_sequences(sequences, pad_tok, nlevels=1):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
        nlevels: "depth" of padding, for the case where we have characters ids

    Returns:
        a list of list where each sublist has same length

    """
    if nlevels == 1:
        max_length = max(map(lambda x : len(x), sequences))
        sequence_padded, sequence_length = _pad_sequences(sequences,
                                            pad_tok, max_length)

    elif nlevels == 2:
        max_length_word = max([max(map(lambda x: len(x), seq))
                               for seq in sequences])
        sequence_padded, sequence_length = [], []
        for seq in sequences:
            # all words are same length now
            sp, sl = _pad_sequences(seq, pad_tok, max_length_word)
            sequence_padded += [sp]
            sequence_length += [sl]

        max_length_sentence = max(map(lambda x : len(x), sequences))
        sequence_padded, _ = _pad_sequences(sequence_padded,
                [pad_tok]*max_length_word, max_length_sentence)
        sequence_length, _ = _pad_sequences(sequence_length, 0,
                max_length_sentence)

    return sequence_padded, sequence_length

In [13]:
def get_chunk_type(tok, idx_to_tag):
    """
    Args:
        tok: id of token, ex 4
        idx_to_tag: dictionary {4: "B-PER", ...}

    Returns:
        tuple: "B", "PER"

    """
    tag_name = idx_to_tag[tok]
    tag_class = tag_name.split('-')[0]
    tag_type = tag_name.split('-')[-1]
    return tag_class, tag_type

def get_chunks(seq, tags):
    """Given a sequence of tags, group entities and their position

    Args:
        seq: [4, 4, 0, 0, ...] sequence of labels
        tags: dict["O"] = 4

    Returns:
        list of (chunk_type, chunk_start, chunk_end)

    Example:
        seq = [4, 5, 0, 3]
        tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3}
        result = [("PER", 0, 2), ("LOC", 3, 4)]

    """
    default = tags[NONE]
    idx_to_tag = {idx: tag for tag, idx in tags.items()}
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        # End of a chunk 1
        if tok == default and chunk_type is not None:
            # Add a chunk.
            chunk = (chunk_type, chunk_start, i)
            chunks.append(chunk)
            chunk_type, chunk_start = None, None

        # End of a chunk + start of a chunk!
        elif tok != default:
            tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag)
            if chunk_type is None:
                chunk_type, chunk_start = tok_chunk_type, i
            elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)
                chunk_type, chunk_start = tok_chunk_type, i
        else:
            pass

    # end condition
    if chunk_type is not None:
        chunk = (chunk_type, chunk_start, len(seq))
        chunks.append(chunk)

    return chunks

In [14]:
class Progbar(object):
    """Progbar class copied from keras (https://github.com/fchollet/keras/)

    Displays a progress bar.
    Small edit : added strict arg to update
    # Arguments
        target: Total number of steps expected.
        interval: Minimum visual progress update interval (in seconds).
    """

    def __init__(self, target, width=30, verbose=1):
        self.width = width
        self.target = target
        self.sum_values = {}
        self.unique_values = []
        self.start = time.time()
        self.total_width = 0
        self.seen_so_far = 0
        self.verbose = verbose

    def update(self, current, values=[], exact=[], strict=[]):
        """
        Updates the progress bar.
        # Arguments
            current: Index of current step.
            values: List of tuples (name, value_for_last_step).
                The progress bar will display averages for these values.
            exact: List of tuples (name, value_for_last_step).
                The progress bar will display these values directly.
        """

        for k, v in values:
            if k not in self.sum_values:
                self.sum_values[k] = [v * (current - self.seen_so_far),
                                      current - self.seen_so_far]
                self.unique_values.append(k)
            else:
                self.sum_values[k][0] += v * (current - self.seen_so_far)
                self.sum_values[k][1] += (current - self.seen_so_far)
        for k, v in exact:
            if k not in self.sum_values:
                self.unique_values.append(k)
            self.sum_values[k] = [v, 1]

        for k, v in strict:
            if k not in self.sum_values:
                self.unique_values.append(k)
            self.sum_values[k] = v

        self.seen_so_far = current

        now = time.time()
        if self.verbose == 1:
            prev_total_width = self.total_width
            sys.stdout.write("\b" * prev_total_width)
            sys.stdout.write("\r")

            numdigits = int(np.floor(np.log10(self.target))) + 1
            barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
            bar = barstr % (current, self.target)
            prog = float(current)/self.target
            prog_width = int(self.width*prog)
            if prog_width > 0:
                bar += ('='*(prog_width-1))
                if current < self.target:
                    bar += '>'
                else:
                    bar += '='
            bar += ('.'*(self.width-prog_width))
            bar += ']'
            sys.stdout.write(bar)
            self.total_width = len(bar)

            if current:
                time_per_unit = (now - self.start) / current
            else:
                time_per_unit = 0
            eta = time_per_unit*(self.target - current)
            info = ''
            if current < self.target:
                info += ' - ETA: %ds' % eta
            else:
                info += ' - %ds' % (now - self.start)
            for k in self.unique_values:
                if type(self.sum_values[k]) is list:
                    info += ' - %s: %.4f' % (k,
                        self.sum_values[k][0] / max(1, self.sum_values[k][1]))
                else:
                    info += ' - %s: %s' % (k, self.sum_values[k])

            self.total_width += len(info)
            if prev_total_width > self.total_width:
                info += ((prev_total_width-self.total_width) * " ")

            sys.stdout.write(info)
            sys.stdout.flush()

            if current >= self.target:
                sys.stdout.write("\n")

        if self.verbose == 2:
            if current >= self.target:
                info = '%ds' % (now - self.start)
                for k in self.unique_values:
                    info += ' - %s: %.4f' % (k,
                        self.sum_values[k][0] / max(1, self.sum_values[k][1]))
                sys.stdout.write(info + "\n")

    def add(self, n, values=[]):
        self.update(self.seen_so_far+n, values)

# define graph

### placeholders

In [15]:
# placeholders

# shape = (batch size, max length of sentence in batch)
word_ids = tf.placeholder(tf.int32, shape=[None, None],
                name="word_ids")

# shape = (batch size)
sequence_lengths = tf.placeholder(tf.int32, shape=[None],
                name="sequence_lengths")

# shape = (batch size, max length of sentence, max length of word)
char_ids = tf.placeholder(tf.int32, shape=[None, None, None],
                name="char_ids")

# shape = (batch_size, max_length of sentence)
word_lengths_v = tf.placeholder(tf.int32, shape=[None, None],
                name="word_lengths")

# shape = (batch size, max length of sentence in batch)
labels = tf.placeholder(tf.int32, shape=[None, None],
                name="labels")

# hyper parameters
dropout = tf.placeholder(dtype=tf.float32, shape=[],
                name="dropout")
lr_var = tf.placeholder(dtype=tf.float32, shape=[],
                name="lr")

In [16]:

dim_char = 100
hidden_size_char = 100
hidden_size_lstm = 300
nepochs          = 15
conf_dropout     = 0.5
batch_size       = 20
lr_method        = "adam"
lr_val               = 0.001
lr_decay         = 0.9
clip             = -1 # if negative, no clipping
nepoch_no_imprv_conf  = 3

### embeddings

- [LSTM Cell](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/LSTMCell)
- [Embedding lookup](https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup)

In [17]:
# word embeddings

with tf.variable_scope("words"):
    _word_embeddings = tf.Variable(
            train_embeddings,
            name="_word_embeddings",
            dtype=tf.float32,
            trainable=False)
    # https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup
    word_embeddings = tf.nn.embedding_lookup(_word_embeddings,
            word_ids, name="word_embeddings")

with tf.variable_scope("chars"):
    
    # get char embeddings matrix
    _char_embeddings = tf.get_variable(
            name="_char_embeddings",
            dtype=tf.float32,
            shape=[nchars, dim_char])
    char_embeddings = tf.nn.embedding_lookup(_char_embeddings,
            char_ids, name="char_embeddings")

    # put the time dimension on axis=1
    s = tf.shape(char_embeddings)
    char_embeddings = tf.reshape(char_embeddings,
            shape=[s[0]*s[1], s[-2], dim_char])
    word_lengths = tf.reshape(word_lengths_v, shape=[s[0]*s[1]])

    # bi lstm on chars
    # https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/LSTMCell
    cell_fw = tf.contrib.rnn.LSTMCell(hidden_size_char,
            state_is_tuple=True)
    cell_bw = tf.contrib.rnn.LSTMCell(hidden_size_char,
            state_is_tuple=True)
    _output = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, char_embeddings,
            sequence_length=word_lengths, dtype=tf.float32)

    # read and concat output
    _, ((_, output_fw), (_, output_bw)) = _output
    output = tf.concat([output_fw, output_bw], axis=-1)

    # shape = (batch size, max sentence length, char hidden size)
    output = tf.reshape(output,
            shape=[s[0], s[1], 2*hidden_size_char])
    word_embeddings = tf.concat([word_embeddings, output], axis=-1)

word_embeddings =  tf.nn.dropout(word_embeddings, conf_dropout)

### logits

In [18]:
"""Defines self.logits

For each word in each sentence of the batch, it corresponds to a vector
of scores, of dimension equal to the number of tags.
"""
with tf.variable_scope("bi-lstm"):
    cell_fw = tf.contrib.rnn.LSTMCell(hidden_size_lstm)
    cell_bw = tf.contrib.rnn.LSTMCell(hidden_size_lstm)
    (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, word_embeddings,
            sequence_length=sequence_lengths, dtype=tf.float32)
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.nn.dropout(output, conf_dropout)

with tf.variable_scope("proj"):
    W = tf.get_variable("W", dtype=tf.float32,
            shape=[2*hidden_size_lstm, ntags])

    b = tf.get_variable("b", shape=[ntags],
            dtype=tf.float32, initializer=tf.zeros_initializer())

    nsteps = tf.shape(output)[1]
    output = tf.reshape(output, [-1, 2*hidden_size_lstm])
    pred = tf.matmul(output, W) + b
    logits = tf.reshape(pred, [-1, nsteps, ntags])

In [19]:

# pred
# no-op for CRF

### loss

In [20]:
log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
        logits, labels, sequence_lengths)
#self.trans_params = trans_params # need to evaluate it for decoding
loss = tf.reduce_mean(-log_likelihood)

# for tensorboard
tf.summary.scalar("loss", loss)

<tf.Tensor 'loss:0' shape=() dtype=string>

### train op

In [21]:
"""Defines self.train_op that performs an update on a batch

Args:
    lr_method: (string) sgd method, for example "adam"
    lr: (tf.placeholder) tf.float32, learning rate
    loss: (tensor) tf.float32 loss to minimize
    clip: (python float) clipping of gradient. If < 0, no clipping

"""
_lr_m = lr_method.lower() # lower to make sure

train_op = None
with tf.variable_scope("train_step"):
    if _lr_m == 'adam': # sgd method
        optimizer = tf.train.AdamOptimizer(lr_var)
    elif _lr_m == 'adagrad':
        optimizer = tf.train.AdagradOptimizer(lr_var)
    elif _lr_m == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(lr_var)
    elif _lr_m == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(lr_var)
    else:
        raise NotImplementedError("Unknown method {}".format(_lr_m))

    if clip > 0: # gradient clipping if clip is positive
        grads, vs     = zip(*optimizer.compute_gradients(loss))
        grads, gnorm  = tf.clip_by_global_norm(grads, clip)
        train_op = optimizer.apply_gradients(zip(grads, vs))
    else:
        train_op = optimizer.minimize(loss)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


# initialize session

In [22]:
sess = tf.Session()
saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())

# Train

In [23]:
def minibatches(data, minibatch_size):
    """
    Args:
        data: generator of (sentence, tags) tuples
        minibatch_size: (int)

    Yields:
        list of tuples

    """
    x_batch, y_batch = [], []
    for (x, y) in data:
        if len(x_batch) == minibatch_size:
            yield x_batch, y_batch
            x_batch, y_batch = [], []

        if type(x[0]) == tuple:
            x = zip(*x)
        x_batch += [x]
        y_batch += [y]

    if len(x_batch) != 0:
        yield x_batch, y_batch

############################################################     
def get_feed_dict(words, labels_val=None, lr_val=None, dropout_val=None):
    """Given some data, pad it and build a feed dictionary

    Args:
        words: list of sentences. A sentence is a list of ids of a list of
            words. A word is a list of ids
        labels_val: list of ids
        lr: (float) learning rate
        dropout: (float) keep prob

    Returns:
        dict {placeholder: value}

    """
    # perform padding of the given data
    char_ids_feed, word_ids_feed = zip(*words)
    word_ids_feed, sequence_lengths_feed = pad_sequences(word_ids_feed, 0)
    char_ids_feed, word_lengths_feed = pad_sequences(char_ids_feed, pad_tok=0,
        nlevels=2)
    # keys are the tensorflow vars
    feed = {
            word_ids: word_ids_feed,
            sequence_lengths: sequence_lengths_feed,
            char_ids: char_ids_feed,
            word_lengths_v: word_lengths_feed
        }
    if labels_val is not None:
        labels_val, _ = pad_sequences(labels_val, 0)
        feed[labels] = labels_val

    if lr_val is not None:
        feed[lr_var] = lr_val

    if dropout is not None:
        feed[dropout] = dropout_val

    return feed, sequence_lengths_feed
    
############################################################

def predict_batch(words):
    """
    Args:
        words: list of sentences

    Returns:
        labels_pred: list of labels for each sentence
        sequence_length

    """
    fd, sequence_lengths = get_feed_dict(words, dropout_val=1.0)
    
    # get tag scores and transition params of CRF
    viterbi_sequences = []
    logits_v, trans_params_v = sess.run(
            [logits, trans_params], feed_dict=fd)
    #print(logits_v)
    # iterate over the sentences because no batching in vitervi_decode
    for logit, sequence_length in zip(logits_v, sequence_lengths):
        logit = logit[:sequence_length] # keep only the valid steps
        viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
                logit, trans_params_v)
        viterbi_sequences += [viterbi_seq]

    return viterbi_sequences, sequence_lengths


    
############################################################
def run_evaluate(test):
    """Evaluates performance on test set

    Args:
        test: dataset that yields tuple of (sentences, tags)

    Returns:
        metrics: (dict) metrics["acc"] = 98.4, ...

    """
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.
    for words, labels in minibatches(test, batch_size):
        labels_pred, sequence_lengths = predict_batch(words)

        for lab, lab_pred, length in zip(labels, labels_pred,
                                         sequence_lengths):
            lab      = lab[:length]
            lab_pred = lab_pred[:length]
            accs    += [a==b for (a, b) in zip(lab, lab_pred)]

            lab_chunks      = set(get_chunks(lab, vocab_tags))
            lab_pred_chunks = set(get_chunks(lab_pred,
                                            vocab_tags))

            correct_preds += len(lab_chunks & lab_pred_chunks)
            total_preds   += len(lab_pred_chunks)
            total_correct += len(lab_chunks)

    p   = correct_preds / total_preds if correct_preds > 0 else 0
    r   = correct_preds / total_correct if correct_preds > 0 else 0
    f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
    acc = np.mean(accs)

    return {"acc": 100*acc, "f1": 100*f1}


############################################################
def run_epoch(train, dev, epoch, file_writer, merged):
    """Performs one complete pass over the train set and evaluate on dev

    Args:
        train: dataset that yields tuple of sentences, tags
        dev: dataset
        epoch: (int) index of the current epoch

    Returns:
        f1: (python float), score to select model on, higher is better

    """
    # progbar stuff for logging
    nbatches = (len(train) + batch_size - 1) // batch_size
    prog = Progbar(target=nbatches)

    # iterate over dataset
    for i, (words, labels) in enumerate(minibatches(train, batch_size)):
        fd, _ = get_feed_dict(words, labels, lr_val=lr_val,
                dropout_val=conf_dropout)

        _, train_loss, summary = sess.run(
                [train_op, loss, merged], feed_dict=fd)

        prog.update(i + 1, [("train loss", train_loss)])

        # tensorboard
        if i % 10 == 0:
            file_writer.add_summary(summary, epoch*nbatches + i)

    metrics = run_evaluate(dev)
    msg = " - ".join(["{} {:04.2f}".format(k, v)
            for k, v in metrics.items()])
    print(msg)

    return metrics["f1"]

In [24]:
def save_session(sess, saver, dir_model):
    """Saves session = weights"""
    if not os.path.exists(dir_model):
        os.makedirs(dir_model)
    saver.save(sess, dir_model)


In [25]:
word_preprocessor = preprocessor_f(vocab_words=vocab_words, vocab_chars=vocab_chars, lowercase=True, chars=True)
tag_preprocessor = preprocessor_f(vocab_words=vocab_tags, lowercase=False, allow_unk=False)

dev   = CoNLLDataset(os.path.join(data_dir, "eng.testa"), word_preprocessor,
                     tag_preprocessor, None)
train = CoNLLDataset(os.path.join(data_dir, "eng.train"), word_preprocessor,
                     tag_preprocessor, None)

In [26]:
best_score = 0
nepoch_no_imprv = 0 # for early stopping

# tensorboard
merged      = tf.summary.merge_all()
dir_output = "./results/test/"
dir_model  = dir_output + "model.weights/"
path_log   = dir_output + "log.txt"
file_writer = tf.summary.FileWriter(dir_output, sess.graph)




In [None]:
print(merged), print(lr_var)

Tensor("Merge/MergeSummary:0", shape=(), dtype=string)
Tensor("lr:0", shape=(), dtype=float32)


(None, None)

In [None]:

for epoch in range(nepochs):
    print("Epoch {:} out of {:}".format(epoch + 1,
                nepochs))

    score = run_epoch(train, dev, epoch, file_writer, merged)
    lr_val *= lr_decay # decay learning rate

    # early stopping and saving best parameters
    if score >= best_score:
        nepoch_no_imprv = 0
        save_session(sess, saver, dir_model)
        best_score = score
        print("- new best score!")
    else:
        nepoch_no_imprv += 1
        if nepoch_no_imprv >= nepoch_no_imprv_conf:
            print("- early stopping {} epochs without "\
                    "improvement".format(nepoch_no_imprv))
            break

Epoch 1 out of 15
acc 97.23 - f1 83.21
- new best score!
Epoch 2 out of 15
150/703 [=====>........................] - ETA: 233s - train loss: 1.4334