In [35]:
from os import path, makedirs, rename, remove
from time import time
import argparse
import pickle
from pathlib import Path
import numpy as np
import functools
import json
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
import codecs

In [30]:
MODEL_DIRECTORY_CLASSIFICATION = 'estimator_text_classify/perceptron_model'
DATA_DIRECTORY = 'estimator_text_classify/data'
QUERY_FILENAME = 'sample_query.txt'
WORD_METADATA_FILENAME = 'word_metadata.tsv'
MAX_VOCABULARY_SIZE = 1000000
VERBOSITY = 'info'
WORDS_FEATURE = 'words'  # Name of the input words feature.
LENGTHS_FEATURE = 'lengths'
VOCAB_PROCESSOR_FILENAME = 'vocab_processor.pickle'

def process_vocabulary(train_sentences, test_sentences,
                       reuse=True, vocabulary_processor=None, extend=False, sequence_lengths=False):
    """Map words to integers, and then map sentences to integer sequences of length flags.max_doc_len, by truncating and
       padding as needed. This leads to an integer matrix of data which is what TensorFlow can work with. The processor
       is then saved to disk in a file determined by flags.

    Args:
       reuse: if True load the vocabulary_processor is loaded from disk if the file exists.
       vocabulary_processor: if not None, and it was not loaded from disk, the passed vocabulary_processor is used.
       extend: if True the vocabulary processor (loaded or passed) is extended.
       sequence_lengths: Whether to list the length of each document.
    """
    
    vocabulary_processor_path = path.join(MODEL_DIRECTORY_CLASSIFICATION, VOCAB_PROCESSOR_FILENAME)
    # If vocabulary_processor gets created/altered save it.
    if reuse and path.isfile(vocabulary_processor_path):
        vocabulary_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(vocabulary_processor_path)
        save_vocab_processor = extend
    elif vocabulary_processor is None:
        vocabulary_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
        vocabulary_processor.fit(train_sentences)
        save_vocab_processor = True
    elif extend:
        vocabulary_processor.vocabulary_.freeze(False)
        vocabulary_processor.fit(train_sentences)
        save_vocab_processor = True
    else:
        save_vocab_processor = False

    if train_sentences is not None:
        train_bow = np.array(list(vocabulary_processor.transform(train_sentences)))
    else:
        train_bow = None
    if test_sentences is not None:
        test_bow = np.array(list(vocabulary_processor.transform(test_sentences)))
    else:
        test_bow = None
    n_words = len(vocabulary_processor.vocabulary_)
    print('Number of words in vocabulary: %d' % n_words)

    if save_vocab_processor:
        if not path.isdir(MODEL_DIRECTORY):
            makedirs(MODEL_DIRECTORY)
        vocabulary_processor.save(vocabulary_processor_path)

    if sequence_lengths:
        def calculate_lengths(arr):
            return arr.shape[1] - (arr != 0)[:, ::-1].argmax(axis=1)
        train_lengths = calculate_lengths(train_bow) if train_bow is not None else None
        test_lengths = calculate_lengths(test_bow) if test_bow is not None else None
    else:
        train_lengths = test_lengths = None

    return train_bow, test_bow, train_lengths, test_lengths, vocabulary_processor, n_words
    

def estimator_spec_for_softmax_classification(logits, labels, mode, params):
    """Returns EstimatorSpec instance for softmax classification."""
    predicted_class = tf.argmax(logits, 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'class': predicted_class,
                'prob': tf.nn.softmax(logits)
            })

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    if mode == tf.estimator.ModeKeys.TRAIN:
        with tf.name_scope('OptimizeLoss'):
            optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
            train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # mode == EVAL
    eval_metric_ops = {
        'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_class)
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

def bag_of_words_perceptron_model(features, labels, mode, params):
    """Perceptron architecture"""
    with tf.variable_scope('Perceptron'):
        bow_column = tf.feature_column.categorical_column_with_identity(
            WORDS_FEATURE, num_buckets=params.n_words)
        # Maps sequences of integers < params.n_words
        # to params.output_dim dimensional real-valued vectors
        # by taking the mean over the word (i.e. integer index) embedding values.
        bow_embedding_column = tf.feature_column.embedding_column(
            bow_column, dimension=params.output_dim)
        logits = tf.feature_column.input_layer(
            features,
            feature_columns=[bow_embedding_column])

    return estimator_spec_for_softmax_classification(logits, labels, mode, params)

def input_fn(x, y=None, lengths=None, batch_size=None, num_epochs=None, shuffle=False):
    """Generic input function to be used as the input_fn arguments for Experiment or directly with Estimators."""
    if batch_size is None and x is not None:
        batch_size = len(x)
    x_dict = {WORDS_FEATURE: x}
    if lengths is not None:
        x_dict['LENGTHS_FEATURE'] = lengths
    return tf.estimator.inputs.numpy_input_fn(
        x_dict,
        y,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle)

def predict(x_data, x_lengths, model_fn, output_dim):
    """Performs classification on the given x_data using the model given by model_fn."""
    hparams = tf.contrib.training.HParams(
        n_words=MAX_VOCABULARY_SIZE,
        output_dim=output_dim,
    )

    run_config = tf.contrib.learn.RunConfig()
    run_config = run_config.replace(model_dir=MODEL_DIRECTORY_CLASSIFICATION)
    predictions = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params=hparams
    ).predict(input_fn(x_data, lengths=x_lengths, num_epochs=1))
    return [p['class'] for p in predictions]

output_dim = 14
with open(QUERY_FILENAME, 'r', encoding='utf-8') as txt:
    data = txt.read()

queries = np.array(data.split('.'))
#queries = np.loadtxt(data, delimiter='.')
classes_filename = path.join(DATA_DIRECTORY, 'classes.txt')
classes = pd.read_csv(classes_filename, header=None, names=['class'])
_, x_query, _, query_lengths, _, _ = process_vocabulary(
        None, queries, reuse=True, sequence_lengths=False)
classifications = predict(x_query, query_lengths, bag_of_words_perceptron_model, output_dim)
for i, query in enumerate(queries):
    print('The model classifies "{}" as a member of the class {}.'.format(
           query, classes['class'][classifications[i]]))

Number of words in vocabulary: 822383
The model classifies "The BBC produced spoof on the â€œReal Housewivesâ€ TV programmes, which has a comedic Islamic State twist, has been criticised by Leftists and Muslims who claim the sketch is offensive" as a member of the class Film.
The model classifies " " as a member of the class Artist.


In [41]:

DATADIR = 'sequence_tagging_ner/data'
PARAMS = 'sequence_tagging_ner/results/params.json'
MODELDIR = 'sequence_tagging_ner/results/model'
def model_fn(features, labels, mode, params):
    # For serving, features are a bit different
    if isinstance(features, dict):
        features = features['words'], features['nwords']

    # Read vocabs and inputs
    dropout = params['dropout']
    words, nwords = features
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    vocab_words = tf.contrib.lookup.index_table_from_file(
        params['words'], num_oov_buckets=params['num_oov_buckets'])
    with Path(params['tags']).open() as f:
        indices = [idx for idx, tag in enumerate(f) if tag.strip() != 'O']
        num_tags = len(indices) + 1

    # Word Embeddings
    word_ids = vocab_words.lookup(words)
    glove = np.load(params['glove'])['embeddings']  # np.array
    variable = np.vstack([glove, [[0.]*params['dim']]])
    variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
    embeddings = tf.nn.embedding_lookup(variable, word_ids)
    embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)

    # LSTM
    t = tf.transpose(embeddings, perm=[1, 0, 2])
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
    output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)
    output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.transpose(output, perm=[1, 0, 2])
    output = tf.layers.dropout(output, rate=dropout, training=training)

    # CRF
    logits = tf.layers.dense(output, num_tags)
    crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
    pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Predictions
        reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_file(
            params['tags'])
        pred_strings = reverse_vocab_tags.lookup(tf.to_int64(pred_ids))
        predictions = {
            'pred_ids': pred_ids,
            'tags': pred_strings
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        # Loss
        vocab_tags = tf.contrib.lookup.index_table_from_file(params['tags'])
        tags = vocab_tags.lookup(labels)
        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
            logits, tags, nwords, crf_params)
        loss = tf.reduce_mean(-log_likelihood)

        # Metrics
        weights = tf.sequence_mask(nwords)
        metrics = {
            'acc': tf.metrics.accuracy(tags, pred_ids, weights),
            'precision': precision(tags, pred_ids, num_tags, indices, weights),
            'recall': recall(tags, pred_ids, num_tags, indices, weights),
            'f1': f1(tags, pred_ids, num_tags, indices, weights),
        }
        for metric_name, op in metrics.items():
            tf.summary.scalar(metric_name, op[1])

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=metrics)

        elif mode == tf.estimator.ModeKeys.TRAIN:
            train_op = tf.train.AdamOptimizer().minimize(
                loss, global_step=tf.train.get_or_create_global_step())
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, train_op=train_op)
        
def pretty_print(line, preds):
    words = line.strip().split()
    noun_words = []
    for x in range(0, len(words)):
        if preds[x].decode('utf-8') != 'O':
            noun_words.append(words[x])
            
    lengths = [max(len(w), len(p)) for w, p in zip(noun_words, preds)]
    padded_words = [w + (l - len(w)) * ' ' for w, l in zip(noun_words, lengths)]
    padded_preds = [p.decode() + (l - len(p)) * ' ' for p, l in zip(preds, lengths)]
    print('words: {}'.format(' '.join(padded_words)))
    print('preds: {}'.format(' '.join(padded_preds)))


def predict_input_fn(line):
    # Words
    words = [w.encode() for w in line.strip().split()]
    nwords = len(words)

    # Wrapping in Tensors
    words = tf.constant([words], dtype=tf.string)
    nwords = tf.constant([nwords], dtype=tf.int32)

    return (words, nwords), None

with Path(PARAMS).open() as f:
    params = json.load(f)

    params['words'] = str(Path(DATADIR, 'vocab.words.txt'))
    params['chars'] = str(Path(DATADIR, 'vocab.chars.txt'))
    params['tags'] = str(Path(DATADIR, 'vocab.tags.txt'))
    params['glove'] = str(Path(DATADIR, 'glove.npz'))

    estimator = tf.estimator.Estimator(model_fn, MODELDIR, params=params)
    for i, query in enumerate(queries):
        predict_inpf = functools.partial(predict_input_fn, query)
        for pred in estimator.predict(predict_inpf):
            pretty_print(query, pred['tags'])
            break
        break
    

W0402 12:12:37.748011 15104 tf_logging.py:126] Input graph does not use tf.data.Dataset or contain a QueueRunner. That means predict yields forever. This is probably a mistake.


words: The  BBC   spoof the  â€œReal Housewivesâ€ TV   programmes, a     comedic Islamic State twist, Leftists Muslims the   sketch offensive
preds: I-NP I-ORG O     I-NP O       I-NP          I-NP I-ORG       I-ORG I-ORG   O       O     I-NP   I-NP     I-ORG   I-ORG I-ORG  O        
