In [1]:
from os import path, makedirs, rename, remove
from time import time
import argparse
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.utils import shuffle

In [2]:
# Notebook-wide settings.
# NOTE(review): of these, only DATA_FILENAME is read by the functions visible in this part of the notebook; they
# take their other settings from a `flags` object. Presumably these constants are the defaults used when `flags`
# is built -- confirm against the cell that creates it.
DATA_DIRECTORY = 'data'
MAX_DOCUMENT_LENGTH = 10
MAX_VOCABULARY_SIZE = 1000000
EMBEDDING_DIM = 25
TF_SEED = 4242
NP_SEED = 1234
CHECKPOINTS_PER_EPOCH = 5
WORD_METADATA_FILENAME = 'word_metadata.tsv'
VOCAB_PROCESSOR_FILENAME = 'vocab_processor.pickle'
DATA_FILENAME = 'data.pickle'  # File under flags.model_dir where preprocess_data caches its pickled results.
VERBOSITY = 'info'
WORDS_FEATURE = 'words'  # Name of the input words feature.
LENGTHS_FEATURE = 'lengths'  # Name of the document lengths feature (not used for BOW)

In [5]:
"""
Timing functions (MATLAB style)
"""
# Start times of the timers that are currently running; the most recently started one is last.
_tstart_stack = []


def tic():
    """Start a timer by pushing the current time onto the stack of running timers."""
    started_at = time()
    _tstart_stack.append(started_at)


def toc(fmt="Elapsed: %.2f s"):
    """Stop the most recently started timer and print its elapsed seconds formatted with fmt.

    Raises IndexError when no timer is running, i.e. toc() was called without a matching tic().
    """
    now = time()
    started_at = _tstart_stack.pop()
    elapsed = now - started_at
    print(fmt % elapsed)

In [6]:
def get_data(data_directory, classes_only=False):
    """Load the DBpedia files from data_directory, downloading them first when any of them is missing.

    When train.csv, test.csv and classes.txt are all present in data_directory they are read as they are (no
    download). Otherwise the data set is fetched through the TensorFlow API and the files found in its
    'dbpedia_csv' sub-directory are moved into data_directory, replacing any copies that were already there.

    Returns the classes frame alone when classes_only is True, otherwise the tuple (train_raw, test_raw, classes).
    """
    # load_dataset in the TensorFlow API is meant to provide this functionality, but there are currently issues
    # with it: https://github.com/tensorflow/tensorflow/issues/14698
    basenames = ('train.csv', 'test.csv', 'classes.txt')
    targets = [path.join(data_directory, basename) for basename in basenames]
    train_filename, test_filename, classes_filename = targets
    already_there = [path.isfile(target) for target in targets]

    if not all(already_there):
        # At least one file is missing: download through the API, then move all three files into place.
        tf.contrib.learn.datasets.text_datasets.maybe_download_dbpedia(data_directory)
        for basename, target, present in zip(basenames, targets, already_there):
            if present:
                # Remove the existing copy first so that the rename below cannot collide with it.
                remove(target)
            rename(path.join(data_directory, 'dbpedia_csv', basename), target)

    classes = pd.read_csv(classes_filename, header=None, names=['class'])
    if classes_only:
        return classes

    train_raw = pd.read_csv(train_filename, header=None)
    test_raw = pd.read_csv(test_filename, header=None)
    tokenized = tf.contrib.learn.preprocessing.tokenizer(train_raw[2])
    longest_sent = max(len(tokens) for tokens in tokenized)
    print("The longest sentence in the training data has {} words.".format(longest_sent))

    return train_raw, test_raw, classes


def extract_data(train_raw, test_raw):
    """Split each raw frame into its documents (column 2) and its class labels (column 0).

    Prints the size of both sets and returns (x_train, y_train, x_test, y_test), where the labels are NumPy
    arrays shifted down by one so that enumeration starts at 0 instead of 1.
    """
    def split(raw):
        documents = raw[2]
        labels = raw[0] - 1  # Class ids in the data start at 1.
        return documents, labels

    x_train, y_train = split(train_raw)
    x_test, y_test = split(test_raw)

    print('Size of training set: {0}'.format(len(x_train)))
    print('Size of test set: {0}'.format(len(x_test)))

    return x_train, np.array(y_train), x_test, np.array(y_test)

def process_vocabulary(train_sentences, test_sentences, flags,
                       reuse=True, vocabulary_processor=None, extend=False, sequence_lengths=False):
    """Map words to integers, and then map sentences to integer sequences of length flags.max_doc_len, by truncating and
       padding as needed. This leads to an integer matrix of data which is what TensorFlow can work with. The processor
       is saved to disk, in a file determined by flags, whenever this call created or extended it.

    Args:
       train_sentences: documents used to fit or extend the vocabulary and to build train_bow. May be None when an
           existing processor is only applied to test_sentences.
       test_sentences: documents used to build test_bow, or None.
       flags: object providing model_dir, vocab_processor_file and max_doc_len.
       reuse: if True the vocabulary_processor is loaded from disk if the file exists.
       vocabulary_processor: if not None, and it was not loaded from disk, the passed vocabulary_processor is used.
       extend: if True the vocabulary processor (loaded or passed) is extended by fitting it on train_sentences.
       sequence_lengths: Whether to list the length of each document.

    Returns:
       The tuple (train_bow, test_bow, train_lengths, test_lengths, vocabulary_processor, n_words). The *_bow entries
       are None when the corresponding sentences are None. The *_lengths entries are None unless sequence_lengths is
       True; a document's length is the position of its last non-zero id, so a document without any non-zero id has
       length 0.
    """
    vocabulary_processor_path = path.join(flags.model_dir, flags.vocab_processor_file)

    if reuse and path.isfile(vocabulary_processor_path):
        vocabulary_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(vocabulary_processor_path)
        created = False
    elif vocabulary_processor is None:
        vocabulary_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(flags.max_doc_len)
        vocabulary_processor.fit(train_sentences)
        created = True
    else:
        created = False

    if extend and not created:
        # Applies to a processor loaded from disk as well as to a passed one. Previously a loaded processor was
        # re-saved on extend=True without ever being unfrozen and fitted, so nothing was actually added to it.
        vocabulary_processor.vocabulary_.freeze(False)
        vocabulary_processor.fit(train_sentences)

    # If vocabulary_processor got created/altered save it.
    save_vocab_processor = created or extend

    if train_sentences is not None:
        train_bow = np.array(list(vocabulary_processor.transform(train_sentences)))
    else:
        train_bow = None
    if test_sentences is not None:
        test_bow = np.array(list(vocabulary_processor.transform(test_sentences)))
    else:
        test_bow = None
    n_words = len(vocabulary_processor.vocabulary_)
    print('Number of words in vocabulary: %d' % n_words)

    if save_vocab_processor:
        if not path.isdir(flags.model_dir):
            makedirs(flags.model_dir)
        vocabulary_processor.save(vocabulary_processor_path)

    if sequence_lengths:
        def calculate_lengths(arr):
            if arr.size == 0:
                # No documents (or zero-width documents): there is no column axis to scan.
                return np.zeros(arr.shape[0], dtype=np.intp)
            has_token = arr != 0
            # argmax on the reversed row finds the last non-zero id, counted from the end of the row.
            lengths = arr.shape[1] - has_token[:, ::-1].argmax(axis=1)
            # argmax returns 0 for a row without any non-zero id, which would report the full width; such a row
            # holds no tokens, so its length is 0.
            lengths[~has_token.any(axis=1)] = 0
            return lengths
        train_lengths = calculate_lengths(train_bow) if train_bow is not None else None
        test_lengths = calculate_lengths(test_bow) if test_bow is not None else None
    else:
        train_lengths = test_lengths = None

    return train_bow, test_bow, train_lengths, test_lengths, vocabulary_processor, n_words
    


In [7]:
def preprocess_data(flags, sequence_lengths=False):
    '''
    Load data, shuffle it, process the vocabulary and save to DATA_FILENAME, if not done already.
    Returns processed data. NOTE: If the max_doc_len changes from a previous run,
    then DATA_FILENAME should be deleted so that it can be properly recreated.

    Args:
        flags: object providing model_dir (where the cache and the vocabulary processor live), data_dir (where the
            raw data lives), np_seed (seed for the shuffle) and whatever process_vocabulary reads from it.
        sequence_lengths: whether document lengths are needed. A cache file that was written without lengths is
            ignored and rebuilt when they are requested.

    Returns:
        The tuple (train_raw, x_train, y_train, x_test, y_test, train_lengths, test_lengths, classes).
    '''
    preprocessed_path = path.join(flags.model_dir, DATA_FILENAME)
    if path.isfile(preprocessed_path):
        # NOTE(security): unpickling can execute arbitrary code. Only point model_dir at cache files that were
        # written by this notebook.
        with open(preprocessed_path, 'rb') as f:
            train_raw, x_train, y_train, x_test, y_test, \
            train_lengths, test_lengths, classes = pickle.load(f)
        cache_has_lengths = train_lengths is not None and test_lengths is not None
        if cache_has_lengths or not sequence_lengths:
            return train_raw, x_train, y_train, x_test, y_test, train_lengths, test_lengths, classes
        # The cache was written by a run that did not ask for lengths; returning it would silently hand back
        # None lengths, so fall through and rebuild it.

    # Get the raw data, downloading if necessary.
    train_raw, test_raw, classes = get_data(flags.data_dir)

    # Seeding is necessary for reproducibility.
    np.random.seed(flags.np_seed)

    # Shuffle data to make the distribution of classes roughly stratified for each mini-batch.
    # This is not necessary for full batch training, but is essential for mini-batch training.
    train_raw = shuffle(train_raw)
    test_raw = shuffle(test_raw)
    train_sentences, y_train, test_sentences, y_test = extract_data(train_raw, test_raw)
    # Encode the raw data as integer vectors.
    x_train, x_test, train_lengths, test_lengths, _, _ = process_vocabulary(
        train_sentences, test_sentences, flags,
        reuse=True, sequence_lengths=sequence_lengths)

    # Save the processed data to avoid re-processing.
    if not path.isdir(flags.model_dir):
        makedirs(flags.model_dir)
    try:
        with open(preprocessed_path, 'wb') as f:
            pickle.dump([train_raw, x_train, y_train, x_test, y_test,
                         train_lengths, test_lengths, classes], f)
    except BaseException as error:
        # Never leave a truncated cache file behind: the next run would fail while loading it.
        if path.isfile(preprocessed_path):
            remove(preprocessed_path)
        # OverflowError/MemoryError can happen if max-doc-len is large. Caching is best effort, so those are
        # tolerated; anything else (including an interrupt) is passed on.
        if not isinstance(error, (OverflowError, MemoryError)):
            raise

    return train_raw, x_train, y_train, x_test, y_test, train_lengths, test_lengths, classes