#!/usr/bin/env python
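"""
This demo implements FastText[1] for sentence classification.

FastText is a simple model for text classification with performance often close
to state-of-the-art, and is useful as a solid baseline.

There are some important differences between this implementation and what
is described in the paper. Instead of Hogwild! SGD[2], we use the Adam optimizer
with mini-batches. Hierarchical softmax is also not supported; if you have
a large label space, consider utilizing the candidate sampling methods provided
by TensorFlow[3].

After 5 epochs, you should get test accuracy close to 90.9%.

[1] Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016).
    Bag of Tricks for Efficient Text Classification.
    http://arxiv.org/abs/1607.01759
[2] Recht, B., Re, C., Wright, S., & Niu, F. (2011).
    Hogwild: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent.
    In Advances in Neural Information Processing Systems 24 (pp. 693-701).
[3] TensorFlow candidate sampling ops (e.g. tf.nn.sampled_softmax_loss,
    tf.nn.nce_loss).
"""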
import array
import hashlib
import time
import numpy as np
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import *
# Hashed n-grams with 1 < n <= N_GRAM are included as features
# in addition to unigrams.
N_GRAM = 2

# Size of vocabulary; less frequent words will be treated as "unknown"
VOCAB_SIZE = 100000

# Number of buckets used for hashing n-grams
N_BUCKETS = 1000000

# Size of the embedding vectors
# NOTE(review): the assignment was missing from this file; 50 is a restored
# default -- confirm against the intended configuration.
EMBEDDING_SIZE = 50

# Number of epochs for which the model is trained
# (the module header quotes its accuracy figure "after 5 epochs")
N_EPOCH = 5

# Size of training mini-batches
# NOTE(review): the assignment was missing from this file; 32 is a restored
# default -- confirm against the intended configuration.
BATCH_SIZE = 32

# Path to which to save the trained model
MODEL_FILE_PATH = 'model.npz'
class FastTextClassifier(object):
    """Simple wrapper class for creating the graph of FastText classifier.

    Building an instance adds the following to the default TensorFlow graph:
    two placeholders, an averaged-embedding layer followed by a dense layer,
    an Adam training op, prediction tensors and an accuracy tensor.

    Attributes set by ``__init__``:
        inputs: int32 placeholder of shape [None, None] holding feature ids.
        labels: int32 placeholder of shape [None] holding class indices.
        network: the final TensorLayer layer; its ``outputs`` are the logits.
        train_op: op that performs one Adam step on the cross-entropy cost.
        prediction_probs: softmax of the logits.
        predictions: int32 argmax of the logits along axis 1.
        accuracy: mean of ``predictions == labels`` as float32.
    """

    def __init__(self, vocab_size, embedding_size, n_labels):
        """Build the classifier graph.

        Args:
            vocab_size: Number of rows of the embedding table.
            embedding_size: Dimensionality of each embedding vector.
            n_labels: Number of output classes (units of the dense layer).
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.n_labels = n_labels

        self.inputs = tf.placeholder(tf.int32, shape=[None, None], name='inputs')
        self.labels = tf.placeholder(tf.int32, shape=[None], name='labels')

        # Network structure
        network = AverageEmbeddingInputlayer(self.inputs, self.vocab_size, self.embedding_size)
        # Keep the final layer on the instance: save() and load() need it.
        self.network = DenseLayer(network, self.n_labels)
        logits = self.network.outputs

        # Training operation
        cost = tl.cost.cross_entropy(logits, self.labels, name='cost')
        self.train_op = tf.train.AdamOptimizer().minimize(cost)

        # Predictions
        self.prediction_probs = tf.nn.softmax(logits)
        # For TF < 1.2 (no `output_type` argument) use instead:
        #     tf.cast(tf.argmax(logits, axis=1), tf.int32)
        self.predictions = tf.argmax(logits, axis=1, output_type=tf.int32)

        # Evaluation
        are_predictions_correct = tf.equal(self.predictions, self.labels)
        self.accuracy = tf.reduce_mean(tf.cast(are_predictions_correct, tf.float32))

    def save(self, sess, filename):
        """Write all network parameters, evaluated in `sess`, to the npz file `filename`."""
        tl.files.save_npz(self.network.all_params, name=filename, sess=sess)

    def load(self, sess, filename):
        """Read parameters from the npz file `filename` and assign them to the network in `sess`."""
        tl.files.load_and_assign_npz(sess, name=filename, network=self.network)
def augment_with_ngrams(unigrams, unigram_vocab_size, n_buckets, n=2):
    """Augment unigram features with hashed n-gram features.

    Args:
        unigrams: Sequence of non-negative integer token ids. Lists and
            tuples are both accepted; the input is never modified.
        unigram_vocab_size: Offset added to every hashed id, so hashed
            features never collide with unigram ids below this value.
        n_buckets: Number of hash buckets the n-grams are folded into.
        n: Largest n-gram order to add. Orders 2..n are included; with
            n < 2 no hashed features are added.

    Returns:
        A new list containing the unigram ids followed by one hashed id in
        ``[unigram_vocab_size, unigram_vocab_size + n_buckets)`` per n-gram,
        ordered by n-gram order and then by position.
    """
    # Copy into a list so that `+` below is always list concatenation, even
    # when the caller passes a tuple or another sequence type.
    tokens = list(unigrams)

    def ngrams_of_order(order):
        # Zipping `order` progressively shifted views yields every window of
        # `order` consecutive tokens.
        return zip(*[tokens[start:] for start in range(order)])

    def hash_ngram(ngram):
        # NOTE(review): typecode 'L' is a C unsigned long, whose byte width
        # depends on the platform, so bucket ids may differ between
        # platforms -- confirm before sharing saved models across machines.
        packed = array.array('L', ngram).tobytes()
        digest = int(hashlib.sha256(packed).hexdigest(), 16)
        return unigram_vocab_size + digest % n_buckets

    hashed = [hash_ngram(ngram) for order in range(2, n + 1) for ngram in ngrams_of_order(order)]
    return tokens + hashed
def _as_object_array(sequences):
    """Pack variable-length sequences into a 1-D object array, one sequence per slot."""
    packed = np.empty(len(sequences), dtype=object)
    # Element-wise assignment stores each sequence as-is; letting NumPy infer
    # a shape from ragged nested lists raises ValueError on NumPy >= 1.24.
    for idx, seq in enumerate(sequences):
        packed[idx] = seq
    return packed


def load_and_preprocess_imdb_data(n_gram=None):
    """Load IMDb data and augment with hashed n-gram features.

    Args:
        n_gram: Largest n-gram order to hash into extra features. When None,
            the dataset is returned exactly as loaded.

    Returns:
        The tuple ``(X_train, y_train, X_test, y_test)``. When `n_gram` is
        given, `X_train` and `X_test` are 1-D object arrays whose elements
        are the augmented feature lists.
    """
    X_train, y_train, X_test, y_test = tl.files.load_imdb_dataset(nb_words=VOCAB_SIZE)

    if n_gram is not None:
        X_train = _as_object_array([augment_with_ngrams(x, VOCAB_SIZE, N_BUCKETS, n=n_gram) for x in X_train])
        X_test = _as_object_array([augment_with_ngrams(x, VOCAB_SIZE, N_BUCKETS, n=n_gram) for x in X_test])

    return X_train, y_train, X_test, y_test
def train_test_and_save_model():
    """Train the FastText classifier on IMDb, report test accuracy and save the model.

    Loads the n-gram-augmented dataset, trains for N_EPOCH epochs in shuffled
    mini-batches of BATCH_SIZE, prints the time taken by each epoch and the
    final test accuracy, then writes the parameters to MODEL_FILE_PATH.
    """
    X_train, y_train, X_test, y_test = load_and_preprocess_imdb_data(N_GRAM)
    classifier = FastTextClassifier(
        # Embedding ids cover the unigram vocabulary plus the hash buckets.
        vocab_size=VOCAB_SIZE + N_BUCKETS,
        embedding_size=EMBEDDING_SIZE,
        n_labels=2,  # IMDb sentiment is a binary task
    )

    with tf.Session() as sess:
        # Variables must be initialised before the first training step.
        sess.run(tf.global_variables_initializer())

        for epoch in range(N_EPOCH):
            start_time = time.time()
            print('Epoch %d/%d' % (epoch + 1, N_EPOCH))
            for X_batch, y_batch in tl.iterate.minibatches(X_train, y_train, batch_size=BATCH_SIZE, shuffle=True):
                sess.run(
                    classifier.train_op, feed_dict={
                        # Pad so every sequence in the batch has equal length.
                        classifier.inputs: tl.prepro.pad_sequences(X_batch),
                        classifier.labels: y_batch,
                    }
                )

            print(" took %.5fs" % (time.time() - start_time))

        test_accuracy = sess.run(
            classifier.accuracy, feed_dict={
                classifier.inputs: tl.prepro.pad_sequences(X_test),
                classifier.labels: y_test,
            }
        )
        print('Test accuracy: %.5f' % test_accuracy)

        classifier.save(sess, MODEL_FILE_PATH)
if __name__ == '__main__':