In [1]:
# Import libraries.
import re
import numpy as np
import pandas as pd
import csv
import tensorflow as tf
from tensorflow.contrib import learn
from collections import defaultdict
import os
import time
import datetime
from datetime import timedelta
from IPython.display import clear_output
from itertools import chain

In [None]:
file_path = 'data/yelp_academic_dataset_review.csv'

In [3]:
#function to clean the data
def clean_string(string):
    string = string.lower()
    # remove web addresses
    string = re.sub(r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", 
                    " ", string)
    string = re.sub(r"[^a-z0-9!\?\']", " ", string)
    string = re.sub(r"'s", " 's", string)
    string = re.sub(r"s' ", " 's", string)
    string = re.sub(r"'ve", " 've", string)
    string = re.sub(r"n't", " n't", string)
    string = re.sub(r"'re", " 're", string)
    string = re.sub(r"'d", " 'd", string)
    string = re.sub(r"'ll", " 'll", string)
    string = re.sub(r"!", " ! ", string)
    #string = re.sub(r"?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

In [4]:
# clean up the text to only use ascii characters
def clean_file(file_path, print_every=100000, write_out=True):
    """
    Read through the CSV file and generate a dictionary of words
    with counts and [optional] create a new cleaned file.
    
    Returns word_count dictionary, and cleaned file path
    """
    word_count = defaultdict(int)
    start = time.time()
    T0 = time.time()
    if write_out:
        p, e = os.path.splitext(file_path)
        out_path = p + '_clean' + e
        #fp_out = open(out_path, 'w', encoding='utf-8')
        fp_out = open(out_path, 'w')
        fp_out.write('review,stars\n')

    #with open(file_path, 'r', encoding='utf-8') as fp:
    with open(file_path, 'r') as fp:
        reader = csv.reader(fp)
        header = next(reader)
        text_index = header.index('text')
        star_index = header.index('stars')

        for i,line in enumerate(reader):
            review_text = clean_string(line[text_index])
            stars = line[star_index]
            if write_out and len(review_text)>0:
                fp_out.write(review_text+','+stars+'\n')
            review = review_text.split(' ')
            for word in review:
                word_count[word] += 1
            if i%print_every == 0:
                if write_out:
                    fp_out.flush()
                print('{} reviews completed, dictionary size: {}, cycle time: {:.1f} seconds.'.format(
                    i, len(word_count), time.time()-T0))
                T0 = time.time()
    print('{} reviews completed.  Dictionary size: {}'.format(i, len(word_count)))
    print('Total time: {}'.format(str(timedelta(seconds=round(time.time()-start)))))
    if write_out:
        fp_out.close()
    return word_count, out_path


In [5]:
def get_wordcount(cleaned_file_path):
    """
    Generates a dictionary of {word, count} from a file that has already
    been cleaned.
    """
    longest_review_length = 0
    word_count = defaultdict(int)
    with open(cleaned_file_path, 'r') as fp:
        #header = next(fp)
        for rev_count, line in enumerate(fp):
            review_text, stars = line.split(',')
            review = review_text.split(' ')
            if longest_review_length < len(review):
                longest_review_length = len(review)
            for word in review:
                word_count[word] += 1
            if rev_count%100000 == 0:
                print('Words in {} reviews counted.'.format(rev_count))
    return word_count, rev_count, longest_review_length

In [6]:
def create_vocab(word_count, remove_lowest_fraction=0.01):
    """
    Creates a vocabulary by removing the bottom `remove_lowest_fraction` 
    of the words passed in via `word_count`.
    : inputs : 
    word_count: a diction of {word, int} pairs
    remove_lowest_fraction: the loweset occuring fraction out of 1.0 to
        drop from the vocab

    : output :
    vocab : a dictionary of {word, index} to to use for mapping words to ints
    """
    total_words = sum(list(word_count.values()))
    threshold = 0
    sum_below_threshold = 0
    while float(sum_below_threshold) / total_words < remove_lowest_fraction:
        threshold += 1
        sum_below_threshold = sum([v for v in word_count.values() if v<=threshold])

    print('Word frequency threshold = {} at {:.3f}% percent of total words.'.format(threshold,
        100*sum_below_threshold/total_words))
    print('Vocab size at 100% = {}, vocab size at {:.3f}% = {}'.format(
        len(word_count), 
        (100-100.0*sum_below_threshold/total_words),
        len([w for w,v in word_count.items() if v > threshold])))

    ## create a list of all word above that threshold
    ## create a dictionary that maps all valid words to a number
    vocab = sorted(word_count, key=word_count.get, reverse=True)
    # include '.' which is not in word_count so that 0 is open to use a filler
    vocab = dict((v,i) for i,v in enumerate(chain('.', vocab)) 
                if word_count[v]>threshold)
    return vocab

In [7]:
def remove_nonvocab_words(cleaned_file_path, vocab):
    """
    Reads through cleaned file and remove words not in `vocab` and 
    removes reviews with zero remaining words.

    Writes a new file to ``CLEAN_FILENAME``+``_vocab_only``.csv

    Returns a tuple the number of words in the longest review, the
    number of reviews, and cleaned and vocab-only file path.
    """
    p, e = os.path.splitext(file_path)
    out_path = p + '_vocab_only' + e
    longest_review_length = 0
    number_reviews = 0
    #with open(cleaned_file_path,'r') as fp, open(out_path, 'w', encoding='utf-8') as fp_out:
    with open(cleaned_file_path,'r') as fp, open(out_path, 'w') as fp_out:
        fp_out.write(next(fp))
        for i, line in enumerate(fp):
            review_text, stars = line.split(',')
            review = [w for w in review_text.split(' ') if w in vocab]
            if longest_review_length < len(review):
                longest_review_length = len(review)
            if len(review) > 0:
                fp_out.write(' '.join(review)+','+stars)
                number_reviews += 1
            if i%100000 == 0:
                print('{} reviews cleaned and removed non-vocab words.'.format(i))
    return longest_review_length, number_reviews, out_path

In [8]:
def text_to_arrays(file_path, vocab, max_review_length, number_reviews):
    """
    Warning: this will need about 5 GB of RAM.

    : inputs :
    file_path: [string] the path to cleaned csv with non-vocab words already 
               removed
    vocab: [dictionary] mapping of {word, int} from words to integers
    max_review_length: [int] the number of words in the longest review 
    number_reviews: [int] total number of reviews

    : outputs :
    r_arr: [np.array] a 2D integer array representation of the reviews. The
           space for extra words are padded with zeros
    s_arr: [np.array] a 2D integer array of the star ratings.
    """
    r_arr = np.zeros((number_reviews, max_review_length), dtype=np.int16)
    s_arr = np.zeros((number_reviews, 5), dtype=np.int8)
    with open(file_path, 'r') as fp:
        header = next(fp)
        for i, line in enumerate(fp):
            review, stars = line.split(',')
            rev = [vocab[w] for w in review.split(' ')] 
            stars = int(stars)
            r_arr[i,:len(rev)] = rev
            s_arr[i,stars-1] = 1
    return r_arr, s_arr

In [9]:
word_count, cleaned_file_path = clean_file(file_path)

0 reviews completed, dictionary size: 62, cycle time: 0.0 seconds.
100000 reviews completed, dictionary size: 80002, cycle time: 21.4 seconds.
200000 reviews completed, dictionary size: 109027, cycle time: 20.2 seconds.
300000 reviews completed, dictionary size: 135672, cycle time: 21.0 seconds.
400000 reviews completed, dictionary size: 158039, cycle time: 21.0 seconds.
500000 reviews completed, dictionary size: 181024, cycle time: 22.3 seconds.
600000 reviews completed, dictionary size: 201110, cycle time: 23.5 seconds.
700000 reviews completed, dictionary size: 219979, cycle time: 24.0 seconds.
800000 reviews completed, dictionary size: 236165, cycle time: 22.5 seconds.
900000 reviews completed, dictionary size: 250889, cycle time: 20.8 seconds.
1000000 reviews completed, dictionary size: 264761, cycle time: 21.7 seconds.
1100000 reviews completed, dictionary size: 278684, cycle time: 22.5 seconds.
1200000 reviews completed, dictionary size: 306638, cycle time: 21.4 seconds.
1300000

In [10]:
vocab = create_vocab(word_count)

Word frequency threshold = 137 at 1.000% percent of total words.
Vocab size at 100% = 484991, vocab size at -100.059% = 25551


In [11]:
longest_rev_length, number_reviews, vocab_only_fp = remove_nonvocab_words(cleaned_file_path, vocab)

0 reviews cleaned and removed non-vocab words.
100000 reviews cleaned and removed non-vocab words.
200000 reviews cleaned and removed non-vocab words.
300000 reviews cleaned and removed non-vocab words.
400000 reviews cleaned and removed non-vocab words.
500000 reviews cleaned and removed non-vocab words.
600000 reviews cleaned and removed non-vocab words.
700000 reviews cleaned and removed non-vocab words.
800000 reviews cleaned and removed non-vocab words.
900000 reviews cleaned and removed non-vocab words.
1000000 reviews cleaned and removed non-vocab words.
1100000 reviews cleaned and removed non-vocab words.
1200000 reviews cleaned and removed non-vocab words.
1300000 reviews cleaned and removed non-vocab words.
1400000 reviews cleaned and removed non-vocab words.
1500000 reviews cleaned and removed non-vocab words.
1600000 reviews cleaned and removed non-vocab words.
1700000 reviews cleaned and removed non-vocab words.
1800000 reviews cleaned and removed non-vocab words.
1900000 

In [25]:
print longest_rev_length, number_reviews, vocab_only_fp

1094 2684822 data/yelp_academic_dataset_review_vocab_only.csv


In [16]:
# When the cleaned file is ready
#vocab, number_reviews,longest_rev_length = get_wordcount(vocab_only_fp)

Words in 0 reviews counted.
Words in 100000 reviews counted.
Words in 200000 reviews counted.
Words in 300000 reviews counted.
Words in 400000 reviews counted.
Words in 500000 reviews counted.
Words in 600000 reviews counted.
Words in 700000 reviews counted.
Words in 800000 reviews counted.
Words in 900000 reviews counted.
Words in 1000000 reviews counted.
Words in 1100000 reviews counted.
Words in 1200000 reviews counted.
Words in 1300000 reviews counted.
Words in 1400000 reviews counted.
Words in 1500000 reviews counted.
Words in 1600000 reviews counted.
Words in 1700000 reviews counted.
Words in 1800000 reviews counted.
Words in 1900000 reviews counted.
Words in 2000000 reviews counted.
Words in 2100000 reviews counted.
Words in 2200000 reviews counted.
Words in 2300000 reviews counted.
Words in 2400000 reviews counted.
Words in 2500000 reviews counted.
Words in 2600000 reviews counted.


In [12]:
reviews_mapped, stars_wide = text_to_arrays(vocab_only_fp, vocab, longest_rev_length, number_reviews)

In [14]:
# Save the pre-processed dataset
np.save('cleaned_vocab.npy', vocab)
np.save('reviews_mapped.npy', reviews_mapped)
np.save('stars_wide.npy', stars_wide)

In [2]:
# Load the pre-processed dataset
vocab = np.load('cleaned_vocab.npy').item()
reviews_mapped = np.load('reviews_mapped.npy')
stars_wide = np.load('stars_wide.npy')
longest_rev_length = 1094
number_reviews = 2684822

In [3]:
# Create training dataset and test dataset
review_count = len(reviews_mapped)
#test_data = reviews_mapped[:int(review_count/3)]
#test_labels = stars_wide[:int(review_count/3)]
train_data = reviews_mapped[int(review_count/3):]
train_labels = stars_wide[int(review_count/3):]
review_mapped = None
stars_wide = None

In [4]:
# Split training dataset to dev dataset and mini-train dataset
# the dev % is changed for memory issues
train_count = len(train_data)
dev_data = train_data[:int(train_count/20)]
dev_labels = train_labels[:int(train_count/20)]
mini_train_data = train_data[int(train_count/20):]
mini_train_labels = train_labels[int(train_count/20):]
train_data = None
train_labels = None

print 'The mini training dataset contains %d samples.' %(len(mini_train_labels))
print 'The dev dataset contains %d samples.' %(len(dev_labels)) 

print mini_train_data[0], mini_train_labels[0]

The mini training dataset contains 1700388 samples.
The dev dataset contains 89494 samples.
[ 791   13 2367 ...,    0    0    0] [0 0 0 0 1]


In [5]:
class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    
    Baseline model based off of example from WildML (link below)
    http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(3, pooled_outputs)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # CalculateMean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

In [6]:
# creates batches of the data so everything is not stored in memory at once
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data)/batch_size) + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield data[start_index:end_index]

In [None]:
#sets values for the default graph
#defined here to enable easier updating/modifications
dropout_keep_prob = 0.5
batch_size = 1000  #64, 8192
num_epochs = 1 #200
evaluate_every = 10
checkpoint_every = 200
vocab_processor = learn.preprocessing.VocabularyProcessor(longest_rev_length)

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=longest_rev_length,
            num_classes=5,
            vocab_size=len(vocab)+1,
            embedding_size=128,
            filter_sizes=[3,4,5],
            num_filters=3)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        
        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.merge_summary(grad_summaries)


        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.scalar_summary("loss", cnn.loss)
        acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables())

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.initialize_all_variables())

        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        #curretnly using the mini set and dev set
        batches = batch_iter(list(zip(mini_train_data, mini_train_labels)), batch_size, num_epochs)
        #dev_batches = batch_iter(list(zip(dev_data, dev_labels)), 200, 1)
        
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % evaluate_every == 0:
                clear_output()
                print("\nEvaluation:")
                dev_batches = batch_iter(list(zip(dev_data, dev_labels)), batch_size, 1)
                for dev_batch in dev_batches:
                    dev_x_batch, dev_y_batch = zip(*dev_batch)
                    dev_step(dev_x_batch, dev_y_batch, writer=dev_summary_writer)
                #dev_step(dev_data, dev_labels, writer=dev_summary_writer)
                clear_output()
            if current_step % checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))

2016-12-13T00:40:23.482611: step 501, loss 1.3798, acc 0.399
2016-12-13T00:40:58.576990: step 502, loss 1.26706, acc 0.479
2016-12-13T00:41:33.895657: step 503, loss 1.24166, acc 0.503
2016-12-13T00:42:09.242689: step 504, loss 1.37156, acc 0.404
2016-12-13T00:42:44.878295: step 505, loss 1.23941, acc 0.508


KeyboardInterrupt: 

In [26]:
checkpoint_dir="runs/1481516781/checkpoints/"
checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
batch_size = 1000
graph = tf.Graph()

In [30]:
with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)


        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        # input_y = graph.get_operation_by_name("input_y").outputs[0]
              
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]
        

        # Generate batches for one epoch
        test_batches = batch_iter(list(test_data), batch_size, 1)

        # Collect the predictions here
        all_predictions = []

        for x_test_batch in [0,1,2]:
        #for x_test_batch in test_batches:

            batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
            all_predictions = np.concatenate([all_predictions, batch_predictions])


correct_predictions = float(sum(all_predictions == test_labels))
print("Total number of test examples: {}".format(len(test_labels)))
print("Accuracy: {:g}".format(correct_predictions/float(len(test_labels))))

# Save the evaluation to a csv
#predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions))
predictions_human_readable = np.column_stack((test_labels, all_predictions))
out_path = os.path.join(checkpoint_dir, "..", "prediction.csv")
print("Saving evaluation to {0}".format(out_path))
with open(out_path, 'w') as f:
    csv.writer(f).writerows(predictions_human_readable)
    

KeyError: "The name 'input_x' refers to an Operation not in the graph."