In [15]:
# Import libraries.
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.contrib import learn
from collections import defaultdict
import os
import time
import datetime
from IPython.display import clear_output

In [2]:
# Source CSV of Yelp reviews; only the review body and its star rating are
# loaded, the remaining columns are skipped by read_csv.
file_path = '/share/yelp/yelp_academic_dataset_review.csv'
review_columns = ['text','stars']
df = pd.read_csv(filepath_or_buffer=file_path, usecols=review_columns)

In [3]:
# function to clean the data
def clean_string(string):
    """Normalise a raw review into lower-case tokens separated by single spaces.

    Steps, in order:
      1. lower-case the text;
      2. replace anything matching the web-address pattern with a space;
      3. replace every character other than a-z, 0-9, '!', '?' and the
         apostrophe with a space;
      4. detach the contraction suffixes 's, 've, n't, 're, 'd and 'll from
         the preceding word;
      5. surround '!' and '?' with spaces so they become tokens of their own;
      6. collapse runs of whitespace and trim both ends.

    Args:
        string: the raw review text (a ``str``).

    Returns:
        The cleaned text, with no leading or trailing whitespace.
    """
    string = string.lower()
    # remove web addresses
    # NOTE(review): the scheme and "www." parts of this pattern are optional,
    # so it also removes non-URL text such as two words joined by a period
    # ("great.the"). Left unchanged here; tighten if that loss matters.
    string = re.sub(r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
                    " ", string)
    string = re.sub(r"[^a-z0-9!\?\']", " ", string)

    # Split contractions off their host word ("don't" -> "do n't").
    contraction_rules = (
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
    )
    for pattern, replacement in contraction_rules:
        string = re.sub(pattern, replacement, string)

    string = re.sub(r"!", " ! ", string)
    # The replacement must be a plain "?": a "\?" in the replacement template
    # is not a recognised escape, so re.sub would emit the backslash verbatim
    # and every question mark would become the token "\?".
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    # Trim so that splitting on spaces does not yield empty tokens at the ends.
    return string.strip()

In [4]:
# Run clean_string over every review and keep the result in a new column,
# leaving the raw text column untouched.
df['text_cleaned'] = df['text'].map(clean_string)

In [5]:
# Keep only reviews whose cleaned text is longer than 5 characters.
# This drops the handful of foreign-language reviews, whose non-English
# characters are removed by the cleaning step.
df = df.loc[df['text_cleaned'].str.len().gt(5)]

In [None]:
# push data to lists.
review_count = len(df.index)
# Split each review into word tokens on whitespace. split() with no separator
# discards empty strings, whereas split(' ') would emit an empty-string token
# for any leading or trailing space left in the cleaned text, and that empty
# token would then be counted as a vocabulary word.
reviews = df['text_cleaned'].str.split().tolist()
stars = df['stars'].tolist()

# remove the data frame to free up memory
del df

In [3]:
# Build the master vocabulary: token -> number of occurrences across all reviews.
vocab = defaultdict(int)

for token in (tok for tokens in reviews for tok in tokens):
    vocab[token] += 1

In [4]:
# Report the vocabulary size, in total and restricted to words seen more than
# 5 times, to judge whether a frequency cut-off is worthwhile.
total_words = len(vocab)
frequent_words = sum(1 for count in vocab.values() if count > 5)
print('Size of vocab: %d words' % total_words)
print('Size of vocab with occurance > 5: %d words' % frequent_words)

Size of vocab: 99056 words
Size of vocab with occurance > 5: 28685 words


In [5]:
# Remove low-frequency words from the vocabulary.
# The cutoff is 5 occurrences or fewer; might change to a percentage later.
words_to_remove = {word for word, count in vocab.items() if count <= 5}
for word in words_to_remove:
    del vocab[word]

# Map every remaining word to an integer id. The padding word '.' is listed
# first so that it takes id 0 and real words follow from 1.
# Built from a plain list rather than itertools.chain, which this notebook
# never imports (the original line raised NameError on a fresh kernel).
vocab_map = {word: index for index, word in enumerate(['.'] + list(vocab))}

# Convert each review to a list of integer ids; words that were pruned from
# the vocabulary are skipped.
reviews_mapped = [[vocab_map[word] for word in review if word in vocab_map] for review in reviews]

In [6]:
# Right-pad every review with the pad-word id so that all reviews share the
# length of the longest one, as the network needs fixed-size input.
max_review_size = max(map(len, reviews_mapped))

pad_id = vocab_map['.']
for position in range(len(reviews_mapped)):
    shortfall = max_review_size - len(reviews_mapped[position])
    if shortfall > 0:
        reviews_mapped[position] = reviews_mapped[position] + [pad_id] * shortfall

In [7]:
# One-hot ("wide") encoding of the star ratings: row i has a 1 in column
# stars[i] - 1 and zeros elsewhere. With one column per class the model can
# report a probability for each rating rather than only a predicted class.
stars_wide = np.zeros((len(stars),5))

# Each row yielded by iterating the 2-D array is a view, so the assignment
# writes straight into stars_wide.
for one_hot_row, star in zip(stars_wide, stars):
    one_hot_row[star-1] = 1

In [8]:
# Hold out the first third of the reviews as the test set; the remaining
# two thirds form the training set. Labels are sliced at the same boundary.
review_count = len(reviews_mapped)
test_size = int(review_count/3)

test_data = np.array(reviews_mapped[:test_size])
train_data = np.array(reviews_mapped[test_size:])
test_labels = stars_wide[:test_size]
train_labels = stars_wide[test_size:]

In [9]:
# Carve a dev set off the front of the training set; the rest becomes the
# mini-train set. The dev share is 1/20 (changed because of memory issues).
train_count = len(train_data)
dev_size = int(train_count/20)

dev_data, mini_train_data = train_data[:dev_size], train_data[dev_size:]
dev_labels, mini_train_labels = train_labels[:dev_size], train_labels[dev_size:]

In [11]:
class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.

    Baseline model based off of example from WildML (link below)
    http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/

    Constructing an instance builds the whole model; nothing is run here.

    Args:
        sequence_length: number of token ids per input row (second dimension
            of ``input_x``).
        num_classes: number of output classes (second dimension of ``input_y``).
        vocab_size: number of rows in the embedding matrix.
        embedding_size: length of each embedding vector.
        filter_sizes: iterable of convolution window heights, in tokens; one
            conv + max-pool branch is built per entry.
        num_filters: number of filters in each branch.
        l2_reg_lambda: weight applied to the L2 penalty on the output layer's
            W and b; the default 0.0 makes the penalty term contribute nothing.

    Attributes set on the instance:
        input_x, input_y, dropout_keep_prob: placeholders to feed.
        embedded_chars, embedded_chars_expanded: embedding lookup result and
            the same tensor with a trailing dimension added.
        h_pool, h_pool_flat, h_drop: concatenated pooled features, their
            flattened form, and the flattened form after dropout.
        scores: unnormalised class scores.
        predictions: argmax of ``scores`` along axis 1.
        loss: mean softmax cross-entropy plus ``l2_reg_lambda`` times the L2 term.
        accuracy: mean of the per-row prediction/label matches.
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        # input_x holds int32 token ids, input_y holds float32 label rows
        # (one column per class); the leading None leaves the batch size open.
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        # Pinned to the CPU; W starts uniformly random in [-1, 1).
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            # Append a trailing dimension of size 1, giving the 4-D input that
            # conv2d is called with below.
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                # Each filter spans filter_size tokens and the full embedding
                # width, with a single input channel.
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                # The pooling window covers sequence_length - filter_size + 1
                # positions along the token axis.
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        # NOTE(review): the axis is passed as the first argument here, which is
        # the pre-1.0 TensorFlow tf.concat signature -- confirm the installed
        # version before upgrading.
        self.h_pool = tf.concat(3, pooled_outputs)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            # Only the output layer's parameters enter the L2 penalty.
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        # NOTE(review): logits and labels are passed positionally, in that
        # order; later TensorFlow releases require keyword arguments here --
        # confirm against the installed version.
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        # A row counts as correct when the predicted class index equals the
        # argmax of its label row.
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

In [12]:
# yields the data in mini-batches so the model is fed one batch at a time
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.

    The whole dataset is first converted to a NumPy array; batches are then
    yielded as consecutive slices of that array, epoch after epoch.

    Args:
        data: sequence of examples; converted with ``np.array``.
        batch_size: maximum number of examples per batch (must be >= 1).
        num_epochs: number of passes over the data.
        shuffle: when True, the examples are re-permuted at the start of each
            epoch using ``np.random.permutation``.

    Yields:
        Array slices of at most ``batch_size`` examples. The last batch of an
        epoch may be shorter; an empty batch is never produced, and empty
        ``data`` yields nothing.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised on first
            iteration, since this is a generator).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

    data = np.array(data)
    data_size = len(data)
    # Ceiling division. The previous int(len/batch_size) + 1 produced an extra,
    # empty batch whenever data_size was an exact multiple of batch_size, which
    # breaks callers that unpack each batch with zip(*batch).
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [None]:
# Hyper-parameters for the training run.
# Defined here, ahead of graph construction, to make them easy to update.
dropout_keep_prob = 0.5
batch_size = 64
num_epochs = 200
evaluate_every = 100
checkpoint_every = 500
# NOTE(review): this processor is only constructed and saved in the code shown
# here; it is never fit on the reviews, which are mapped through vocab_map
# instead -- confirm the saved "vocab" file is actually wanted.
vocab_processor = learn.preprocessing.VocabularyProcessor(max_review_size)

# Build the model in a fresh graph and train it inside one session.
with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=max_review_size,
            num_classes=5,
            vocab_size=len(vocab_map),
            embedding_size=128,
            filter_sizes=[3,4,5],
            num_filters=3)

        # Define Training procedure
        # global_step is handed to apply_gradients and read back after every
        # training step to schedule evaluation and checkpointing.
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        
        # Keep track of gradient values and sparsity (optional)
        # Variables with no gradient are skipped.
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.merge_summary(grad_summaries)


        # Output directory for models and summaries
        # Each run writes to ./runs/<unix timestamp>.
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.scalar_summary("loss", cnn.loss)
        acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)

        # Train Summaries
        # Loss and accuracy plus the gradient summaries.
        train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

        # Dev summaries
        # Loss and accuracy only.
        dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables())

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.initialize_all_variables())

        def train_step(x_batch, y_batch):
            """
            Run one optimisation step on a batch.

            Feeds the batch with the configured dropout keep probability,
            runs train_op, prints the step, loss and accuracy, and writes the
            training summaries for that step.
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluate the model on one batch without training.

            Dropout is disabled (keep probability 1.0) and train_op is not
            run. Prints the step, loss and accuracy; the summaries are written
            only when a writer is supplied.
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        # Currently using the mini-train set for training and the dev set for evaluation.
        batches = batch_iter(list(zip(mini_train_data, mini_train_labels)), batch_size, num_epochs)
        # NOTE(review): this generator is never consumed; dev_batches is
        # rebuilt inside the loop below before each evaluation pass.
        dev_batches = batch_iter(list(zip(dev_data, dev_labels)), 200, 1)
        
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            # Every evaluate_every steps, run one pass over the dev set in
            # batches of 200.
            if current_step % evaluate_every == 0:
                clear_output()
                print("\nEvaluation:")
                dev_batches = batch_iter(list(zip(dev_data, dev_labels)), 200, 1)
                for dev_batch in dev_batches:
                    dev_x_batch, dev_y_batch = zip(*dev_batch)
                    dev_step(dev_x_batch, dev_y_batch, writer=dev_summary_writer)
                # NOTE(review): this clears the cell output right after the dev
                # pass, so the printed evaluation lines are wiped; the dev
                # metrics are still written through dev_summary_writer.
                clear_output()
            # Every checkpoint_every steps, save the model variables.
            if current_step % checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))