In [None]:
from os import path, makedirs, rename, remove
from time import time
import argparse
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.utils import shuffle

In [None]:
# --- Locations on disk -------------------------------------------------------
DATA_DIRECTORY = 'data'  # Where the raw CSV files and classes.txt are kept.
MODEL_DIRECTORY = 'perceptron_mlp_model'  # Checkpoints and cached artefacts.
WORD_METADATA_FILENAME = 'word_metadata.tsv'
VOCAB_PROCESSOR_FILENAME = 'vocab_processor.pickle'  # Stored inside MODEL_DIRECTORY.
DATA_FILENAME = 'data.pickle'  # Pre-processed data cache, inside MODEL_DIRECTORY.

# --- Text encoding -----------------------------------------------------------
MAX_DOCUMENT_LENGTH = 10  # Length given to the vocabulary processor.
MAX_VOCABULARY_SIZE = 1000000
EMBEDDING_DIM = 25

# --- Training ----------------------------------------------------------------
NUM_EPOCHS = 2
BATCH_SIZE = 32
LEARNING_RATE = 0.04
CHECKPOINTS_PER_EPOCH = 5

# --- Reproducibility and logging ---------------------------------------------
TF_SEED = 4242
NP_SEED = 1234
VERBOSITY = 'info'

# --- Feature names -----------------------------------------------------------
WORDS_FEATURE = 'words'  # Name of the input words feature.
LENGTHS_FEATURE = 'lengths'  # Name of the document lengths feature (not used for BOW)

In [None]:
"""
Timing functions (MATLAB style)
"""
# Start times of the timers that are currently running (most recent last).
_tstart_stack = []


def tic():
    """Start a new timer by remembering the current wall-clock time."""
    started_at = time()
    _tstart_stack.append(started_at)


def toc(fmt="Elapsed: %.2f s"):
    """Stop the most recently started timer and print the elapsed seconds.

    Args:
        fmt: %-style format string that receives the elapsed time as a float.

    Raises:
        IndexError: if there is no running timer, i.e. no unmatched tic().
    """
    now = time()
    started_at = _tstart_stack.pop()
    elapsed = now - started_at
    print(fmt % elapsed)

In [None]:
def get_data(data_directory, classes_only=False):
    """Load the DBpedia files from data_directory, downloading them first if any is missing.

    If train.csv, test.csv and classes.txt are all present in data_directory they are read as they are and nothing
    is downloaded. Otherwise the data is fetched through the TensorFlow API and the three files are moved from the
    'dbpedia_csv' sub-directory into data_directory, replacing whichever of them already existed.

    Args:
        data_directory: directory that holds (or will hold) the three data files.
        classes_only: if True, only the classes frame is returned.

    Returns:
        The classes DataFrame when classes_only is True, otherwise the tuple (train_raw, test_raw, classes).
    """
    # The function call load_dataset in the TensorFlow API is supposed to provide this functionality. However, there are
    # currently issues: https://github.com/tensorflow/tensorflow/issues/14698

    file_names = ('train.csv', 'test.csv', 'classes.txt')
    targets = [path.join(data_directory, name) for name in file_names]
    # Recorded before any download so that only pre-existing files get removed below.
    already_present = [path.isfile(target) for target in targets]

    if not all(already_present):
        # Download the data if necessary, using the API.
        tf.contrib.learn.datasets.text_datasets.maybe_download_dbpedia(data_directory)
        download_dir = path.join(data_directory, 'dbpedia_csv')

        for name, target, existed in zip(file_names, targets, already_present):
            if existed:
                remove(target)
            rename(path.join(download_dir, name), target)

    train_filename, test_filename, classes_filename = targets
    classes = pd.read_csv(classes_filename, header=None, names=['class'])
    if classes_only:
        return classes

    train_raw = pd.read_csv(train_filename, header=None)
    test_raw = pd.read_csv(test_filename, header=None)
    # Column 2 holds the document text (see extract_data); tokenise it once just to report the longest document.
    tokenised = tf.contrib.learn.preprocessing.tokenizer(train_raw[2])
    longest_sent = max(len(tokens) for tokens in tokenised)
    print("The longest sentence in the training data has {} words.".format(longest_sent))

    return train_raw, test_raw, classes


def extract_data(train_raw, test_raw):
    """Split each raw frame into its documents (column 2) and zero-based class labels (column 0 minus 1).

    Returns:
        (x_train, y_train, x_test, y_test), where the x values are the document columns and the y values are
        numpy arrays of labels.
    """
    def split(frame):
        # Classes are enumerated from 1 in the files; shift them so that they start at 0.
        return frame[2], np.array(frame[0] - 1)

    x_train, y_train = split(train_raw)
    x_test, y_test = split(test_raw)
    print('Size of training set: {0}'.format(len(x_train)))
    print('Size of test set: {0}'.format(len(x_test)))
    return x_train, y_train, x_test, y_test

def process_vocabulary(train_sentences, test_sentences,
                       reuse=True, vocabulary_processor=None, extend=False, sequence_lengths=False):
    """Map words to integers, and then map sentences to integer sequences of length MAX_DOCUMENT_LENGTH, by truncating
       and padding as needed. This leads to an integer matrix of data which is what TensorFlow can work with. Whenever
       the processor is created or extended it is saved to VOCAB_PROCESSOR_FILENAME inside MODEL_DIRECTORY.

    Args:
       train_sentences: training documents, or None to skip them. They are also used to fit/extend the vocabulary.
       test_sentences: test documents, or None to skip them.
       reuse: if True the vocabulary_processor is loaded from disk if the file exists.
       vocabulary_processor: if not None, and it was not loaded from disk, the passed vocabulary_processor is used.
       extend: if True the vocabulary processor (loaded or passed) is extended with the words of train_sentences.
       sequence_lengths: Whether to list the length of each document.

    Returns:
       (train_bow, test_bow, train_lengths, test_lengths, vocabulary_processor, n_words). The *_bow entries are
       integer arrays (None if the corresponding sentences were None), the *_lengths entries are None unless
       sequence_lengths is True, and n_words is the size of the vocabulary.
    """

    vocabulary_processor_path = path.join(MODEL_DIRECTORY, VOCAB_PROCESSOR_FILENAME)
    # If vocabulary_processor gets created/altered save it.
    save_vocab_processor = False
    if reuse and path.isfile(vocabulary_processor_path):
        vocabulary_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(vocabulary_processor_path)
        needs_extension = extend
    elif vocabulary_processor is None:
        vocabulary_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
        vocabulary_processor.fit(train_sentences)
        save_vocab_processor = True
        needs_extension = False  # Freshly fitted on train_sentences, nothing left to add.
    else:
        needs_extension = extend

    if needs_extension:
        # Applies to a processor restored from disk as well as to one passed in. Previously a restored processor
        # was re-saved without ever being extended.
        vocabulary_processor.vocabulary_.freeze(False)
        vocabulary_processor.fit(train_sentences)
        save_vocab_processor = True

    if train_sentences is not None:
        train_bow = np.array(list(vocabulary_processor.transform(train_sentences)))
    else:
        train_bow = None
    if test_sentences is not None:
        test_bow = np.array(list(vocabulary_processor.transform(test_sentences)))
    else:
        test_bow = None
    n_words = len(vocabulary_processor.vocabulary_)
    print('Number of words in vocabulary: %d' % n_words)

    if save_vocab_processor:
        if not path.isdir(MODEL_DIRECTORY):
            makedirs(MODEL_DIRECTORY)
        vocabulary_processor.save(vocabulary_processor_path)

    if sequence_lengths:
        def calculate_lengths(arr):
            # Position just after the last non-zero id in each row: the row is scanned from the right for the first
            # non-zero entry and that offset is subtracted from the row width.
            # NOTE(review): a row consisting only of zeros yields the full row width rather than 0, because argmax
            # returns 0 when no entry is non-zero - confirm that this is what downstream models expect.
            return arr.shape[1] - (arr != 0)[:, ::-1].argmax(axis=1)
        train_lengths = calculate_lengths(train_bow) if train_bow is not None else None
        test_lengths = calculate_lengths(test_bow) if test_bow is not None else None
    else:
        train_lengths = test_lengths = None

    return train_bow, test_bow, train_lengths, test_lengths, vocabulary_processor, n_words
    


In [None]:
def preprocess_data(sequence_lengths=False):
    '''
    Load data, shuffle it, process the vocabulary and save to DATA_FILENAME, if not done already.
    Returns processed data. NOTE: If MAX_DOCUMENT_LENGTH changes from a previous run,
    then DATA_FILENAME should be deleted so that it can be properly recreated. The same holds when
    sequence_lengths changes: a cached file is returned as stored, whatever value is passed here.

    Args:
        sequence_lengths: whether document lengths are computed when the data has to be processed.

    Returns:
        (train_raw, x_train, y_train, x_test, y_test, train_lengths, test_lengths, classes)
    '''
    preprocessed_path = path.join(MODEL_DIRECTORY, DATA_FILENAME)
    if path.isfile(preprocessed_path):
        # NOTE: unpickling can execute arbitrary code. Only load a cache file that this notebook wrote itself.
        with open(preprocessed_path, 'rb') as f:
            train_raw, x_train, y_train, x_test, y_test, \
            train_lengths, test_lengths, classes = pickle.load(f)
        return train_raw, x_train, y_train, x_test, y_test, train_lengths, test_lengths, classes

    # Get the raw data, downloading if necessary.
    train_raw, test_raw, classes = get_data(DATA_DIRECTORY)

    # Seeding is necessary for reproducibility. The shuffle below draws from NumPy's global random state, so it is
    # seeded with the NumPy seed (this used to be TF_SEED, which left NP_SEED without any effect).
    np.random.seed(NP_SEED)

    # Shuffle data to make the distribution of classes roughly stratified for each mini-batch.
    # This is not necessary for full batch training, but is essential for mini-batch training.
    train_raw = shuffle(train_raw)
    test_raw = shuffle(test_raw)
    train_sentences, y_train, test_sentences, y_test = extract_data(train_raw, test_raw)
    # Encode the raw data as integer vectors.
    x_train, x_test, train_lengths, test_lengths, _, _ = process_vocabulary(
        train_sentences, test_sentences,
        reuse=True, sequence_lengths=sequence_lengths)

    # Save the processed data to avoid re-processing. The cache is best effort: failing to write it is not fatal.
    if not path.isdir(MODEL_DIRECTORY):
        makedirs(MODEL_DIRECTORY)
    saved = False
    try:
        with open(preprocessed_path, 'wb') as f:
            pickle.dump([train_raw, x_train, y_train, x_test, y_test,
                         train_lengths, test_lengths, classes], f)
        saved = True
    except (OverflowError, MemoryError) as error:
        # Can happen if MAX_DOCUMENT_LENGTH is large.
        print('Warning: the pre-processed data could not be cached ({}).'.format(type(error).__name__))
    finally:
        # Never leave a partially written cache behind, it would be picked up (and fail to load) on the next run.
        if not saved and path.isfile(preprocessed_path):
            remove(preprocessed_path)

    return train_raw, x_train, y_train, x_test, y_test, train_lengths, test_lengths, classes

In [None]:
"""
Modelling: Training, evaluation and prediction. Also metadata for TensorBoard.
"""


def input_fn(x, y=None, lengths=None, batch_size=None, num_epochs=None, shuffle=False):
    """Generic input function to be used as the input_fn arguments for Experiment or directly with Estimators.

    Args:
        x: the encoded documents, exposed to the model as the WORDS_FEATURE feature.
        y: the labels, or None.
        lengths: optional document lengths, exposed to the model as the LENGTHS_FEATURE feature.
        batch_size: number of examples per batch. Defaults to all of x (full batch) when x is given.
        num_epochs: passed on to numpy_input_fn.
        shuffle: passed on to numpy_input_fn.

    Returns:
        The input function created by tf.estimator.inputs.numpy_input_fn.
    """
    if batch_size is None and x is not None:
        batch_size = len(x)
    x_dict = {WORDS_FEATURE: x}
    if lengths is not None:
        # Keyed by the constant's value. The quoted name 'LENGTHS_FEATURE' used before is a different key, so a
        # model reading features[LENGTHS_FEATURE] would never find the lengths.
        x_dict[LENGTHS_FEATURE] = lengths
    return tf.estimator.inputs.numpy_input_fn(
        x_dict,
        y,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle)


def run_experiment(x_train, y_train, x_dev, y_dev, model_fn, schedule, output_dim, train_lengths=None, dev_lengths=None):
    """Create experiment object and run it.

    Args:
        x_train, y_train: encoded training documents and their labels.
        x_dev, y_dev: encoded evaluation documents and their labels.
        model_fn: model function handed to tf.estimator.Estimator.
        schedule: what to run, e.g. "train_and_evaluate", "evaluate", ...
        output_dim: passed to the model function as params.output_dim.
        train_lengths, dev_lengths: optional document lengths for the training/evaluation documents.
    """
    hparams = tf.contrib.training.HParams(
        n_words=MAX_VOCABULARY_SIZE,
        n_epochs=NUM_EPOCHS,
        seed=TF_SEED,
        batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        output_dim=output_dim,
        # Taken from the configuration instead of a hard-coded 50, so that EMBEDDING_DIM actually takes effect.
        # Checkpoints in MODEL_DIRECTORY that were trained with another dimension cannot be restored afterwards.
        embed_dim=EMBEDDING_DIM)

    is_training = schedule in ['train', 'train_and_evaluate']
    run_config = tf.contrib.learn.RunConfig()
    try:
        if is_training:
            # A whole number of steps, and at least one: true division would give a float, possibly below 1.
            checkpoint_steps = max(1, len(x_train) // (CHECKPOINTS_PER_EPOCH * BATCH_SIZE))
        else:
            checkpoint_steps = None
        log_step_count_steps = 100  # default value
    except TypeError:
        # Happens if batch_size is None
        checkpoint_steps = 1
        log_step_count_steps = 1
    run_config = run_config.replace(model_dir=MODEL_DIRECTORY,
                                    save_checkpoints_steps=checkpoint_steps,
                                    log_step_count_steps=log_step_count_steps,
                                    tf_random_seed=hparams.seed)

    def experiment_fn(run_config, hparams):
        # Builds the Experiment for learn_runner; the data is captured from the enclosing call.
        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            config=run_config,
            params=hparams)
        experiment = tf.contrib.learn.Experiment(
            estimator=estimator,
            train_input_fn=input_fn(x_train, y_train, train_lengths,
                                    batch_size=hparams.batch_size,
                                    num_epochs=hparams.n_epochs,
                                    shuffle=True),
            # No batch_size: input_fn then uses the whole evaluation set as a single batch.
            eval_input_fn=input_fn(x_dev, y_dev, dev_lengths,
                                   num_epochs=1),
            eval_delay_secs=0)
        return experiment

    if is_training:
        print('Training model for {} epochs...'.format(hparams.n_epochs))
    tf.contrib.learn.learn_runner.run(
        experiment_fn=experiment_fn,
        run_config=run_config,
        schedule=schedule,  # What to run, e.g. "train_and_evaluate", "evaluate", ...
        hparams=hparams)  # hyperparameters


In [None]:
def estimator_spec_for_softmax_classification(logits, labels, mode, params):
    """Build the EstimatorSpec of a softmax classifier for the given mode.

    Args:
        logits: the class scores; the predicted class is their argmax along axis 1.
        labels: the true classes (not used in PREDICT mode).
        mode: one of the tf.estimator.ModeKeys values.
        params: hyperparameters; only params.learning_rate is read here.

    Returns:
        A tf.estimator.EstimatorSpec carrying predictions (PREDICT), loss and train_op (TRAIN), or loss and the
        accuracy metric (any other mode, i.e. evaluation).
    """
    mode_keys = tf.estimator.ModeKeys
    predicted_class = tf.argmax(logits, 1)

    if mode == mode_keys.PREDICT:
        predictions = {
            'class': predicted_class,
            'prob': tf.nn.softmax(logits),
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    if mode != mode_keys.TRAIN:
        # Evaluation: report the accuracy next to the loss.
        accuracy = tf.metrics.accuracy(labels=labels, predictions=predicted_class)
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, eval_metric_ops={'accuracy': accuracy})

    # Training: minimise the loss with Adam, advancing the global step.
    with tf.name_scope('OptimizeLoss'):
        adam = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
        global_step = tf.train.get_global_step()
        train_op = adam.minimize(loss, global_step=global_step)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

def bag_of_words_MLP_model(features, labels, mode, params):
    """Model function: embedded word ids, a ReLU, then one dense layer producing the logits.

    Args:
        features: feature dictionary; the WORDS_FEATURE entry is read through an identity categorical column.
        labels: the true classes, passed on unchanged.
        mode: one of the tf.estimator.ModeKeys values, passed on unchanged.
        params: hyperparameters; n_words, embed_dim and output_dim are read here.

    Returns:
        The EstimatorSpec built by estimator_spec_for_softmax_classification.
    """
    with tf.variable_scope('MLP'):
        word_ids = tf.feature_column.categorical_column_with_identity(
            WORDS_FEATURE, num_buckets=params.n_words)
        embedded_words = tf.feature_column.embedding_column(
            word_ids, dimension=params.embed_dim)
        document_vector = tf.feature_column.input_layer(
            features, feature_columns=[embedded_words])
        hidden = tf.nn.relu(document_vector)
        # No activation: the softmax is applied by the spec builder.
        logits = tf.layers.dense(hidden, params.output_dim, activation=None)

    return estimator_spec_for_softmax_classification(logits, labels, mode, params)

In [None]:
# Train and evaluate the bag-of-words MLP, timing the whole run.
tic()
(train_raw, x_train, y_train,
 x_test, y_test, _, _, classes) = preprocess_data()  # Document lengths are not needed for this model.
output_dim = len(classes)  # One output per class.
run_experiment(
    x_train, y_train, x_test, y_test,
    model_fn=bag_of_words_MLP_model,
    schedule='train_and_evaluate',
    output_dim=output_dim)
toc()

### 