In [1]:
import functools
import json
import logging
from pathlib import Path
import sys

import numpy as np
import tensorflow as tf
from tf_metrics import precision, recall, f1

In [2]:
DATADIR = './data'

In [3]:
# Logging
Path('results').mkdir(exist_ok=True)
tf.logging.set_verbosity(logging.INFO)
handlers = [
    logging.FileHandler('results/main.log'),
    logging.StreamHandler(sys.stdout)
]
logging.getLogger('tensorflow').handlers = handlers

In [4]:
def parse_fn(line_words, line_tags):
    '''
    Parses and converts words to bytes for a sentence.
    Asserts that tag sequence length is equal to word sequence length
    '''
    words = [word.encode() for word in line_words.strip().split()]
    tags = [tag.encode() for tag in line_tags.strip().split()]
    assert(len(words) == len(tags))
    return (words, len(words)), tags

def generator_fn(words, tags):
    '''
    Opens sentences and tags file, reads, parses it and streams it
    '''
    with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
        for line_words, line_tags in zip(f_words, f_tags):
            yield parse_fn(line_words, line_tags)

In [5]:
def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    '''
    Input function for tf estimator. Responsible for dataset preprocessing
    '''
    params = params if params is not None else {}
    shapes = (([None], ()), [None])
    types = ((tf.string, tf.int32), tf.string)
    defaults = (('<pad>', 0), 'O')

    dataset = tf.data.Dataset.from_generator(
         functools.partial(generator_fn, words, tags),
         output_shapes=shapes, output_types=types)

    if shuffle_and_repeat:
         dataset = dataset.shuffle(params['buffer']).repeat(params['epochs'])

    dataset = (dataset
                .padded_batch(params.get('batch_size', 20), shapes, defaults)
                .prefetch(1))
    return dataset

In [6]:
def model_fn(features, labels, mode, params):
    # For serving, features are a bit different
    if isinstance(features, dict):
        features = features['words'], features['nwords']

    # Read vocabs and inputs
    dropout = params['dropout']
    words, nwords = features
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    vocab_words = tf.contrib.lookup.index_table_from_file(
        params['words'], num_oov_buckets=params['num_oov_buckets'])
    with Path(params['tags']).open() as f:
        indices = [idx for idx, tag in enumerate(f) if tag.strip() != 'O']
        num_tags = len(indices) + 1

    # Word Embeddings
    word_ids = vocab_words.lookup(words)
    glove = np.load(params['glove'])['embeddings']  # np.array
    variable = np.vstack([glove, [[0.]*params['dim']]])
    variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
    embeddings = tf.nn.embedding_lookup(variable, word_ids)
    embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)

    # LSTM
    t = tf.transpose(embeddings, perm=[1, 0, 2])
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
    output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)
    output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.transpose(output, perm=[1, 0, 2])
    output = tf.layers.dropout(output, rate=dropout, training=training)

    # CRF
    logits = tf.layers.dense(output, num_tags)
    crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
    pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Predictions
        reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_file(
            params['tags'])
        pred_strings = reverse_vocab_tags.lookup(tf.to_int64(pred_ids))
        predictions = {
            'pred_ids': pred_ids,
            'tags': pred_strings
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        # Loss
        vocab_tags = tf.contrib.lookup.index_table_from_file(params['tags'])
        tags = vocab_tags.lookup(labels)
        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
            logits, tags, nwords, crf_params)
        loss = tf.reduce_mean(-log_likelihood)

        # Metrics
        weights = tf.sequence_mask(nwords)
        metrics = {
            'acc': tf.metrics.accuracy(tags, pred_ids, weights),
            'precision': precision(tags, pred_ids, num_tags, indices, weights),
            'recall': recall(tags, pred_ids, num_tags, indices, weights),
            'f1': f1(tags, pred_ids, num_tags, indices, weights),
        }
        for metric_name, op in metrics.items():
            tf.summary.scalar(metric_name, op[1])

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=metrics)

        elif mode == tf.estimator.ModeKeys.TRAIN:
            train_op = tf.train.AdamOptimizer().minimize(
                loss, global_step=tf.train.get_or_create_global_step())
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, train_op=train_op)


In [7]:
params = {
    'dim': 300,
    'dropout': 0.5,
    'num_oov_buckets': 1,
    'epochs': 25,
    'batch_size': 20,
    'buffer': 15000,
    'lstm_size': 100,
    'words': str(Path(DATADIR, 'vocab.words.txt')),
    'chars': str(Path(DATADIR, 'vocab.chars.txt')),
    'tags': str(Path(DATADIR, 'vocab.tags.txt')),
    'glove': str(Path(DATADIR, 'glove.npz'))
}
with Path('results/params.json').open('w') as f:
        json.dump(params, f, indent=4, sort_keys=True)
def fwords(name):
        return str(Path(DATADIR, '{}.words.txt'.format(name)))
def ftags(name):
        return str(Path(DATADIR, '{}.tags.txt'.format(name)))
    
cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
estimator = tf.estimator.Estimator(model_fn, 'results/model', cfg, params)
# Estimator, train and evaluate
train_inpf = functools.partial(input_fn, fwords('train'), ftags('train'),
                                   params, shuffle_and_repeat=True)
eval_inpf = functools.partial(input_fn, fwords('testa'), ftags('testa'))

cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
estimator = tf.estimator.Estimator(model_fn, 'results/model', cfg, params)

train_spec = tf.estimator.TrainSpec(input_fn=train_inpf)
eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)


Using config: {'_session_config': None, '_num_ps_replicas': 0, '_task_id': 0, '_service': None, '_save_checkpoints_steps': None, '_tf_random_seed': None, '_model_dir': 'results/model', '_is_chief': True, '_save_checkpoints_secs': 120, '_save_summary_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001C2AE9661D0>, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_num_worker_replicas': 1, '_master': '', '_evaluation_master': '', '_task_type': 'worker', '_log_step_count_steps': 100, '_global_id_in_cluster': 0}
Using config: {'_session_config': None, '_num_ps_replicas': 0, '_task_id': 0, '_service': None, '_save_checkpoints_steps': None, '_tf_random_seed': None, '_model_dir': 'results/model', '_is_chief': True, '_save_checkpoints_secs': 120, '_save_summary_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001C2AE966080>, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Done calling model_fn.
Create CheckpointSaverHook.
Graph was finalized.
Running local_init_op.
Done running local_init_op.
Saving checkpoints for 1 into results/model\model.ckpt.
loss = 80.2449, step = 1
global_step/sec: 2.1945
loss = 17.5772, step = 101 (45.571 sec)
global_step/sec: 2.20937
loss = 21.867, step = 201 (45.262 sec)
Saving checkpoints for 222 into results/model\model.ckpt.
Loss for final step: 12.3363.
Calling model_fn.
Done calling model_fn.
Starting evaluation at 2019-04-01-11:27:33
Graph was finalized.
Restoring parameters from results/model\model.ckpt-222
Running local_init_op.
Done running local_init_op.
Evaluation [10/100]
Evaluation [20/100]
Evaluation [30/100]
Evaluation [40/100]
Evaluation [50/100]
Evaluation [60/100]
Evaluation [70/100]
Evaluation [80/100]
Evaluation [90/100]
Finished evaluation at 2019-04-01-11:27:50
Saving dict for global step 222: acc = 0.873012, f1 = 0.857378, global_step = 222, loss = 10.0478, precision = 0.858353, recall = 0.856405
Calling

Evaluation [60/100]
Evaluation [70/100]
Evaluation [80/100]
Evaluation [90/100]
Finished evaluation at 2019-04-01-11:45:13
Saving dict for global step 1778: acc = 0.946108, f1 = 0.941679, global_step = 1778, loss = 4.45696, precision = 0.944483, recall = 0.938892
Calling model_fn.
Done calling model_fn.
Create CheckpointSaverHook.
Graph was finalized.
Restoring parameters from results/model\model.ckpt-1778
Running local_init_op.
Done running local_init_op.
Saving checkpoints for 1779 into results/model\model.ckpt.
loss = 5.53828, step = 1779
global_step/sec: 2.05652
loss = 7.49457, step = 1879 (48.629 sec)
global_step/sec: 2.44413
loss = 6.89012, step = 1979 (40.910 sec)
Saving checkpoints for 2000 into results/model\model.ckpt.
Loss for final step: 4.64681.
Calling model_fn.
Done calling model_fn.
Starting evaluation at 2019-04-01-11:47:24
Graph was finalized.
Restoring parameters from results/model\model.ckpt-2000
Running local_init_op.
Done running local_init_op.
Evaluation [10/100]

KeyboardInterrupt: 

In [21]:
LINE = ' Nutanix is an enterprise storage company, Nutanix raised $101 million in a Series D round of financing in 2014 valuing the company at approximately $1 billion and It reached the $1 billion valuation in four years from its launch faster than many other companies who have reached the same valuation.'
DATADIR = './data'
PARAMS = './results/params.json'
MODELDIR = './results/model'
def pretty_print(line, preds):
    words = line.strip().split()
    noun_words = []
    for x in range(0, len(words)):
        if preds[x].decode('utf-8') != 'O':
            noun_words.append(words[x])
            
    lengths = [max(len(w), len(p)) for w, p in zip(noun_words, preds)]
    padded_words = [w + (l - len(w)) * ' ' for w, l in zip(noun_words, lengths)]
    padded_preds = [p.decode() + (l - len(p)) * ' ' for p, l in zip(preds, lengths)]
    print('words: {}'.format(' '.join(padded_words)))
    print('preds: {}'.format(' '.join(padded_preds)))


def predict_input_fn(line):
    # Words
    words = [w.encode() for w in line.strip().split()]
    nwords = len(words)

    # Wrapping in Tensors
    words = tf.constant([words], dtype=tf.string)
    nwords = tf.constant([nwords], dtype=tf.int32)

    return (words, nwords), None

with Path(PARAMS).open() as f:
    params = json.load(f)

    params['words'] = str(Path(DATADIR, 'vocab.words.txt'))
    params['chars'] = str(Path(DATADIR, 'vocab.chars.txt'))
    params['tags'] = str(Path(DATADIR, 'vocab.tags.txt'))
    params['glove'] = str(Path(DATADIR, 'glove.npz'))

    estimator = tf.estimator.Estimator(model_fn, MODELDIR, params=params)
    predict_inpf = functools.partial(predict_input_fn, LINE)
    for pred in estimator.predict(predict_inpf):
        pretty_print(LINE, pred['tags'])
        break

Using default config.
Using config: {'_session_config': None, '_num_ps_replicas': 0, '_task_id': 0, '_service': None, '_save_checkpoints_steps': None, '_tf_random_seed': None, '_model_dir': './results/model', '_is_chief': True, '_save_checkpoints_secs': 600, '_save_summary_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001C2B62BC898>, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_num_worker_replicas': 1, '_master': '', '_evaluation_master': '', '_task_type': 'worker', '_log_step_count_steps': 100, '_global_id_in_cluster': 0}
Input graph does not use tf.data.Dataset or contain a QueueRunner. That means predict yields forever. This is probably a mistake.
Calling model_fn.
Done calling model_fn.
Graph was finalized.
Restoring parameters from ./results/model\model.ckpt-2000
Running local_init_op.
Done running local_init_op.
words: Nutanix an enterprise storage company, Nutanix $101 million a    Series D round financin