In [None]:
# Import libraries.
import re
import numpy as np
import pandas as pd
import csv
import tensorflow as tf
from tensorflow.contrib import learn
from collections import defaultdict
import os
import time
import datetime
from datetime import timedelta
from IPython.display import clear_output
from itertools import chain
import pickle

In [None]:
# Normalise one raw review string into lowercase, space-separated tokens.
def clean_string(string):
    """
    Lowercase `string`, strip web addresses and unsupported characters, and
    split common English contractions / punctuation into separate tokens.

    : inputs :
    string: [str] raw review text

    : output :
    [str] cleaned text containing only ``a-z``, ``0-9``, ``!``, ``?``, ``'``
          and single spaces, with no leading or trailing whitespace
    """
    string = string.lower()
    # remove web addresses
    string = re.sub(r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
                    " ", string)
    # everything that is not a letter, digit, '!', '?' or apostrophe becomes a space
    string = re.sub(r"[^a-z0-9!\?\']", " ", string)
    # split contractions / possessives off the word they are attached to
    string = re.sub(r"'s", " 's", string)
    # plural possessive ("dogs' "): keep the trailing space so the next word
    # is not glued onto the "'s" token
    string = re.sub(r"s' ", " 's ", string)
    string = re.sub(r"'ve", " 've", string)
    string = re.sub(r"n't", " n't", string)
    string = re.sub(r"'re", " 're", string)
    string = re.sub(r"'d", " 'd", string)
    string = re.sub(r"'ll", " 'll", string)
    # make '!' and '?' stand-alone tokens; '?' is a regex quantifier and must
    # be escaped, otherwise re raises "nothing to repeat"
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\?", " ? ", string)
    # collapse the runs of whitespace introduced above
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

In [None]:
# Clean every review in a raw CSV export and optionally write a cleaned copy.
def clean_file(file_path, print_every=20000, write_out=True):
    """
    Read through the CSV file, clean each review with `clean_string`,
    generate a dictionary of words with counts and [optional] create a
    new cleaned file next to the input (``<name>_clean<ext>``).

    : inputs :
    file_path: [string] CSV file whose header contains `text` and `stars`
    print_every: [int] print a progress line every this many rows
    write_out: [bool] when True, write the cleaned ``review,stars`` file

    : output :
    (word_count, out_path): word_count is a defaultdict of {word, count};
        out_path is the cleaned file path, or None when `write_out` is False.

    Reviews that are empty after cleaning are neither written nor counted.
    """
    word_count = defaultdict(int)
    start = time.time()
    T0 = time.time()
    # Defined up front so the return statement works when write_out is False.
    out_path = None
    fp_out = None
    reviews_read = 0
    if write_out:
        p, e = os.path.splitext(file_path)
        out_path = p + '_clean' + e
        fp_out = open(out_path, 'w', encoding='utf-8')

    try:
        if fp_out is not None:
            fp_out.write('review,stars\n')

        # newline='' lets the csv module handle newlines embedded in quoted fields
        with open(file_path, 'r', encoding='utf-8', newline='') as fp:
            reader = csv.reader(fp)
            header = next(reader)
            text_index = header.index('text')
            star_index = header.index('stars')

            for i, line in enumerate(reader):
                reviews_read = i + 1
                review_text = clean_string(line[text_index])
                stars = line[star_index]
                if review_text:
                    # cleaned text contains no commas, so plain concatenation
                    # yields a valid two-column line
                    if fp_out is not None:
                        fp_out.write(review_text + ',' + stars + '\n')
                    for word in review_text.split(' '):
                        word_count[word] += 1
                if i % print_every == 0:
                    if fp_out is not None:
                        fp_out.flush()
                    print('{} reviews completed, dictionary size: {}, cycle time: {:.1f} seconds.'.format(
                        i, len(word_count), time.time()-T0))
                    T0 = time.time()
    finally:
        # close the output even if a malformed row raises part-way through
        if fp_out is not None:
            fp_out.close()

    print('{} reviews completed.  Dictionary size: {}'.format(reviews_read, len(word_count)))
    print('Total time: {}'.format(str(timedelta(seconds=round(time.time()-start)))))
    return word_count, out_path


In [None]:
def get_wordcount(cleaned_file_path):
    """
    Tally how often every word occurs in an already-cleaned review file.

    The file is expected to have one header line followed by lines of the
    form ``<space separated words>,<stars>``.  Returns a defaultdict of
    {word, count}.

    NOTE(review): a later cell defines another `get_wordcount` that returns
    a 3-tuple; once that cell has run, this definition is shadowed.
    """
    tally = defaultdict(int)
    with open(cleaned_file_path, 'r', encoding='utf-8') as source:
        rows = iter(source)
        # discard the header line
        next(rows)
        seen = 0
        for row in rows:
            # exactly two comma-separated fields are expected per line
            text, _rating = row.split(',')
            for token in text.split(' '):
                tally[token] = tally[token] + 1
            if not seen % 100000:
                print('Words in {} reviews counted.'.format(seen))
            seen += 1
    return tally

In [None]:
def create_vocab(word_count, remove_lowest_fraction=0.01):
    """
    Creates a vocabulary by removing the least frequent words passed in via
    `word_count` until at least `remove_lowest_fraction` of all word
    occurrences has been dropped.

    Whole frequency classes are dropped at once (every word occurring
    `threshold` times or fewer), so the fraction actually removed can be
    larger than requested.

    : inputs :
    word_count: a dictionary of {word, int} pairs
    remove_lowest_fraction: the lowest occurring fraction out of 1.0 to
        drop from the vocab

    : output :
    vocab : a dictionary of {word, index} to use for mapping words to ints.
        Index 0 is '.' (free to use as filler/padding), index 1 is '_RARE_',
        and the kept words follow in order of decreasing count.

    : raises :
    ValueError if `remove_lowest_fraction` is outside [0.0, 1.0]
    """
    if not 0.0 <= remove_lowest_fraction <= 1.0:
        # values above 1.0 can never be reached and used to loop forever
        raise ValueError('`remove_lowest_fraction` must be between 0.0 and 1.0')

    # '.' is not a possible cleaned word, so index 0 is open to use as filler
    vocab = {'.': 0, '_RARE_': 1}
    total_words = sum(word_count.values())
    if total_words == 0:
        # nothing to rank; also avoids dividing by zero below
        return vocab

    # occurrences contributed by each distinct count value, so the threshold
    # search is a single pass instead of re-summing every count per step
    mass_by_count = {}
    for v in word_count.values():
        mass_by_count[v] = mass_by_count.get(v, 0) + v

    threshold = 0
    sum_below_threshold = 0
    for count in sorted(mass_by_count):
        if float(sum_below_threshold) / total_words >= remove_lowest_fraction:
            break
        threshold = count
        sum_below_threshold += mass_by_count[count]

    print('Word frequency threshold = {} at {:.3f}% percent of total words.'.format(threshold,
        100*sum_below_threshold/total_words))
    print('Vocab size at 100% = {}, vocab size at {:.3f}% = {}'.format(
        len(word_count),
        100*(1-sum_below_threshold/total_words),
        sum(1 for v in word_count.values() if v > threshold)))

    ## map every word above the threshold to the next free index
    for w in sorted(word_count, key=word_count.get, reverse=True):
        if w in vocab:
            # never let input data overwrite a reserved token index
            continue
        if word_count[w] > threshold:
            vocab[w] = len(vocab)
    return vocab

In [None]:
def remove_nonvocab_words(cleaned_file_path, vocab):
    """
    Reads through the cleaned file, replaces every word that is not in
    `vocab` with the token ``_RARE_``, and drops reviews that contain no
    words at all.

    Writes a new file to ``CLEAN_FILENAME``+``_vocab_only``.csv

    : inputs :
    cleaned_file_path: [string] file with a header line followed by
        ``<space separated words>,<stars>`` lines
    vocab: [container] supports ``word in vocab``

    : output :
    A tuple of the number of words in the longest review, the number of
    reviews written, and the cleaned and vocab-only file path.
    """
    p, e = os.path.splitext(cleaned_file_path)
    out_path = p + '_vocab_only' + e
    longest_review_length = 0
    number_reviews = 0
    lines_read = 0
    # read with the same encoding the cleaned file was written with
    with open(cleaned_file_path, 'r', encoding='utf-8') as fp, \
            open(out_path, 'w', encoding='utf-8') as fp_out:
        header = next(fp, None)
        if header is not None:
            # move header to new file
            fp_out.write(header if header.endswith('\n') else header + '\n')
            for lines_read, line in enumerate(fp, 1):
                if not line.strip():
                    continue
                # the star rating is always the last comma-separated field
                review_text, stars = line.rstrip('\r\n').rsplit(',', 1)
                words = review_text.split()
                if words:
                    review = [w if w in vocab else '_RARE_' for w in words]
                    if longest_review_length < len(review):
                        longest_review_length = len(review)
                    fp_out.write(' '.join(review) + ',' + stars + '\n')
                    number_reviews += 1
                if lines_read % 250000 == 0:
                    print('{} reviews cleaned and removed non-vocab words.'.format(lines_read))
    print('{} reviews cleaned and removed non-vocab words.'.format(lines_read))
    return longest_review_length, number_reviews, out_path

In [None]:
def get_wordcount(cleaned_file_path):
    """
    Generates a dictionary of {word, count} from a file that has already
    been cleaned, together with the size statistics needed downstream.

    NOTE: this redefines (and therefore replaces) the single-return
    `get_wordcount` from the earlier cell.

    : inputs :
    cleaned_file_path: [string] file with a header line followed by
        ``<space separated words>,<stars>`` lines

    : output :
    (word_count, rev_count, longest_review_length)
    word_count: [dict] {word, count}
    rev_count: [int] number of review lines read (0 for a header-only file)
    longest_review_length: [int] number of words in the longest review
    """
    longest_review_length = 0
    word_count = defaultdict(int)
    # stays 0 when the file holds no reviews, so the return below is always valid
    rev_count = 0
    with open(cleaned_file_path, 'r', encoding='utf-8') as fp:
        # skip the header line
        next(fp, None)
        # count from 1 so rev_count is the number of reviews, not the last index
        for rev_count, line in enumerate(fp, 1):
            # the star rating is always the last comma-separated field
            review_text, stars = line.rstrip('\r\n').rsplit(',', 1)
            review = review_text.split()
            if longest_review_length < len(review):
                longest_review_length = len(review)
            for word in review:
                word_count[word] += 1
            if rev_count % 200000 == 0:
                print('Words in {} reviews counted.'.format(rev_count))
    print('Words in {} reviews counted.'.format(rev_count))
    return dict(word_count), rev_count, longest_review_length

In [None]:
def text_to_arrays(file_path, vocab, max_review_length, number_reviews):
    """
    Load a whole vocab-only review file into two dense integer arrays.

    Warning: the arrays are held fully in memory; the original author
    estimated about 5 GB of RAM for the full data set.

    : inputs :
    file_path: [string] the path to cleaned csv with non-vocab words already 
               removed
    vocab: [dictionary] mapping of {word, int} from words to integers
    max_review_length: [int] the number of words in the longest review 
    number_reviews: [int] total number of reviews

    : outputs :
    r_arr: [np.array] a 2D integer array representation of the reviews. The
           space for extra words are padded with zeros
    s_arr: [np.array] a 2D integer array of the star ratings (one-hot,
           5 columns).

    : raises :
    KeyError if a word is missing from `vocab`;
    ValueError if a star rating is outside 1..5
    """
    # int16 keeps memory low but silently wraps for indices above 32767,
    # so widen the dtype when the vocabulary needs it
    largest_index = max(vocab.values()) if vocab else 0
    if largest_index <= np.iinfo(np.int16).max:
        index_dtype = np.int16
    else:
        index_dtype = np.int32
    r_arr = np.zeros((number_reviews, max_review_length), dtype=index_dtype)
    s_arr = np.zeros((number_reviews, 5), dtype=np.int8)
    with open(file_path, 'r', encoding='utf-8') as fp:
        # skip the header line
        next(fp, None)
        for i, line in enumerate(fp):
            # the star rating is always the last comma-separated field
            review, stars = line.rstrip('\r\n').rsplit(',', 1)
            rev = [vocab[w] for w in review.split(' ')]
            stars = int(stars)
            if not 1 <= stars <= 5:
                # stars == 0 would otherwise index column -1 without an error
                raise ValueError('Star rating must be 1-5, got {} on data line {}'.format(stars, i))
            r_arr[i, :len(rev)] = rev
            s_arr[i, stars-1] = 1
    return r_arr, s_arr

In [None]:
# Count words in the cleaned review file.  With the tuple-returning
# `get_wordcount` defined above this also yields the review counter and the
# length (in words) of the longest review.
wc, num_rev, seq_len = get_wordcount('/share/yelp/yelp_academic_dataset_review_clean.csv')

In [None]:
# Build the {word: index} vocabulary from the counts, using the default
# `remove_lowest_fraction` of `create_vocab`.
vocab = create_vocab(wc)

In [None]:
# Persist (vocab, seq_len, num_rev) so the training cells can reload them
# with pickle.load instead of recounting the whole file.
with open('/share/yelp/params_voc_seq_num.pickle', 'wb') as fp:
    pickle.dump((vocab, seq_len, num_rev), fp)

In [None]:
# Write the `_vocab_only` copy of the cleaned file.  The returned tuple is
# (longest review length, number of reviews written, output path).
# NOTE(review): these values are not used by the later cells shown here, which
# reload seq_len / num_rev from the pickle written above.
seq, num, p = remove_nonvocab_words('/share/yelp/yelp_academic_dataset_review_clean.csv', vocab)

In [None]:
class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    
    Baseline model based off of example from WildML (link below)
    http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/

    Constructor arguments:
      sequence_length: width of `input_x` (number of word ids per example)
      num_classes:     width of `input_y` and of the output scores
      vocab_size:      number of rows in the embedding matrix
      embedding_size:  number of columns in the embedding matrix
      filter_sizes:    iterable of filter heights; one conv + max-pool branch
                       is built per entry
      num_filters:     number of filters in every branch
      l2_reg_lambda:   weight of the L2 penalty on the output layer's W and b

    Attributes created on the instance: input_x, input_y, dropout_keep_prob
    (placeholders); embedded_chars, embedded_chars_expanded, h_pool,
    h_pool_flat, h_drop, scores, predictions, loss, accuracy (tensors).

    NOTE(review): the positional argument order of `tf.concat` and
    `tf.nn.softmax_cross_entropy_with_logits` used below looks like the
    pre-1.0 TensorFlow API - confirm against the installed version.
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        # input_x holds integer word ids, input_y the float label rows
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        # pinned to the CPU; W is initialised uniformly in [-1, 1]
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            # append a trailing axis of size 1 so conv2d sees a channel dimension
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                # each filter spans `filter_size` positions and the full embedding width
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                # the window height equals the number of valid filter positions,
                # so one value per filter remains
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        # concatenate the branches along axis 3 (axis passed first here)
        self.h_pool = tf.concat(3, pooled_outputs)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            # NOTE(review): this W is created with tf.get_variable under the fixed
            # name "W"; presumably building two TextCNN objects in the same graph
            # would clash on that name - confirm before reusing a graph.
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            # only the output layer's parameters contribute to the L2 penalty
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss (plus the weighted L2 penalty)
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        # fraction of rows whose arg-max score matches the arg-max of input_y
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

In [None]:
class FileBatcher(object):
    """
    An object to create batches of data from a file. 

    The file is expected to start with one header line followed by lines of
    the form ``<space separated words>,<stars>``.  Before calling
    `batch_iter`, the vocabulary and size parameters must be provided with
    either `load_params` (known values) or `get_params` (scan the file).
    """
    # Class-level defaults; instances overwrite these in __init__,
    # load_params or get_params.
    vocab = {}
    sequence_length = None
    num_items = None
    split_line = None
    # width of the one-hot star encoding appended to every parsed line
    num_classes = 5
    
    def __init__(self, file_path, split_ratio):
        """
        Stores the path of the file to read and the split ratio.  The split
        ratio is the fraction of lines (from the top of the file) that make
        up the `first` split of `batch_iter`; the remainder is the `last`
        split.
        """                
        self.file_path = file_path
        self.split_ratio = split_ratio
        # per-instance dict so instances never share the class-level default
        self.vocab = {}
    
    def load_params(self, vocab, sequence_length, num_items):
        """
        Input the vocabulary, maximum sequence length, and total number
        of items in the file.
        """
        assert isinstance(vocab, dict), '`vocab` must be a dictionary.'
        self.vocab = vocab
        self.sequence_length = sequence_length
        self.num_items = num_items
        self.split_line = int(num_items*self.split_ratio)
        
    def get_params(self, verbose=True):
        """
        Reads the file at `file_path` and sets the .vocab,
        .sequence_length, .num_items and .split_line parameters.

        The header line is skipped, so it is neither counted as an item nor
        added to the vocabulary.  Index 0 of the vocabulary is reserved for
        the padding token ' '; words follow in order of decreasing count.
        """
        longest = 0
        item_count = 0
        word_freq = {}
        with open(self.file_path, encoding='utf-8') as fp:
            # skip the header, exactly as batch_iter does
            next(fp, None)
            for item_count, line in enumerate(fp, 1):
                rev_txt, _star = line.strip().rsplit(',', 1)
                rev = rev_txt.split()
                for w in rev:
                    word_freq[w] = word_freq.get(w, 0) + 1
                longest = len(rev) if len(rev) > longest else longest
                if verbose:
                    if item_count % 100000 == 0:
                        print('Total lines processed: {:,}, vocab size: {}'
                               .format(item_count, len(word_freq)))
        ordered_words = sorted(word_freq, key=word_freq.get, reverse=True)
        self.vocab = dict((w, i) for i, w in enumerate([' '] + ordered_words))
        self.sequence_length = longest
        self.num_items = item_count
        self.split_line = int(item_count*self.split_ratio)
    
    def _line_parser(self, line):
        """
        Uses .vocab to convert one ``words,stars`` line to a flat list of
        integers: the word ids padded with zeros (or truncated) to
        .sequence_length, followed by the one-hot star rating of width
        .num_classes.

        Raises KeyError for a word missing from .vocab and ValueError for a
        star rating outside 1..num_classes.
        """
        # split the line; the rating is always the last comma-separated field
        rev_txt, stars = line.strip().rsplit(',', 1)
        # convert stars to wide format
        stars = int(stars)
        if not 1 <= stars <= self.num_classes:
            raise ValueError('Star rating must be between 1 and {}, got {}'
                             .format(self.num_classes, stars))
        stars_wide = [0]*self.num_classes
        stars_wide[stars-1] = 1
        # convert words to ints; truncate so every row has the same width
        rev = [self.vocab[w] for w in rev_txt.split()][:self.sequence_length]
        # pad the review
        rev_wide = rev + [0]*(self.sequence_length-len(rev))
        # return combined
        return rev_wide + stars_wide

    # creates batches of the data so everything is not stored in memory at once
    def batch_iter(self, batch_size, num_epochs, batch_pool_size=16000,
                   shuffle=True, split='none'):
        """
        Generates a batch iterator for a dataset from a file_path

        :inputs:
        batch_size - int, number of items per batch
        num_epochs - int, number of epochs to iterate over the data
        batch_pool_size - int, number of lines of data to use for pooling 
                          when creating the iterator
        shuffle - bool, whether to shuffle each pool before batching
                  (shuffling is within a pool, not across the whole file)
        split - string {`first`|`last`|`none`}, 
        ::-- `first` uses up to the first `.split_ratio` fraction of the file lines.  
        ::-- `last` uses the lines beyond the first`.split_ratio`
        ::-- `none` uses the entire file

        :yields:
        (x, y) tuples of 2D integer arrays: x holds the padded word ids,
        y the one-hot star ratings.  If the file holds fewer lines than
        `.num_items` promises, the epoch simply ends early.
        """
        assert isinstance(split, str), 'Paramter `split` must be a string.'
        if self.num_items is None or self.sequence_length is None:
            raise ValueError('Call `load_params` or `get_params` before `batch_iter`.')
        split = split.lower()
        lines_to_skip = 0
        if split == 'none':
            iter_size = self.num_items
        elif split == 'first':
            iter_size = self.split_line
        elif split == 'last':
            iter_size = self.num_items - self.split_line
            lines_to_skip = self.split_line
        else:
            raise ValueError('Value of `split` must be in {"first","last","none"}')
            
        # get the number of batches and pool size of the leftovers
        number_pools, last_pool_size = divmod(iter_size, batch_pool_size)
        # combine the leftovers if it is less than 1/4 of the pool size
        if number_pools > 0 and last_pool_size < batch_pool_size/4:
            last_pool_size += batch_pool_size
        # add a batch pool for the left overs
        else:
            number_pools += 1

        for epoch in range(num_epochs):
            with open(self.file_path, encoding='utf-8') as fp:
                # remove header line
                next(fp, None)
                # move the file pointer to the correct starting location;
                # zip stops on the exhausted range without consuming an
                # extra line from fp
                for _ in zip(range(lines_to_skip), fp):
                    pass
                # pull in the next pool of batches
                for pool_index in range(1, number_pools+1):
                    p_size = batch_pool_size if pool_index < number_pools else last_pool_size
                    rows = [self._line_parser(line)
                            for _, line in zip(range(p_size), fp)]
                    if not rows:
                        # nothing (left) to read for this epoch
                        break
                    batch_pool = np.array(rows, dtype=int)
                    # the file may be shorter than num_items promised
                    p_size = len(rows)

                    # Shuffle the rows of this pool
                    if shuffle:
                        shuffle_indices = np.random.permutation(np.arange(p_size))
                        batch_pool = batch_pool[shuffle_indices]
                    
                    # set the number of iterations for each pool size
                    number_batches = int(np.ceil(p_size/float(batch_size)))
                    for batch_num in range(number_batches):
                        start_index = batch_num * batch_size
                        end_index = min((batch_num + 1) * batch_size, p_size)
                        
                        # last num_classes columns are the labels
                        yield (batch_pool[start_index:end_index, :-self.num_classes],
                               batch_pool[start_index:end_index, -self.num_classes:])

In [None]:
# Training cell: builds a TextCNN in a fresh graph and trains it on batches
# streamed from the vocab-only review file, evaluating and checkpointing at
# fixed step intervals.
#
# Hyper-parameters for the run,
# defined here to enable easier updating/modifications
dropout_keep_prob = 0.5      # keep probability fed to the model while training
batch_size = 64
num_epochs = 100
evaluate_every = 2000        # run the dev evaluation every this many steps
checkpoint_every = 20000     # save a checkpoint every this many steps
dev_fraction = 0.001         # split ratio handed to FileBatcher

# create batching object
# (vocab, seq_len, num_lines) were pickled by an earlier cell
with open('/share/yelp/params_voc_seq_num.pickle', 'rb') as fp:
    vocab, seq_len, num_lines = pickle.load(fp)
file_path = '/share/yelp/yelp_academic_dataset_review_clean_vocab_only.csv'
fb = FileBatcher(file_path, dev_fraction)
fb.load_params(vocab, seq_len, num_lines)
# training reads the `last` split; the dev evaluation below reads the `first` split
train_batches = fb.batch_iter(batch_size, num_epochs, split='last')

with tf.Graph().as_default():
    # NOTE(review): the session is never explicitly closed in this cell.
    sess = tf.Session()
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=seq_len,
            num_classes=5,
            vocab_size=len(vocab),
            embedding_size=32,
            filter_sizes=[3,4,5],
            num_filters=128)

        # Define Training procedure
        # global_step is incremented by apply_gradients on every training step
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        
        # Keep track of gradient values and sparsity
        grad_summaries = []
        for g, v in grads_and_vars:
            # variables without a gradient are skipped
            if g is not None:
                grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.merge_summary(grad_summaries)


        # Output directory for models and summaries
        # one directory per run: ./runs/<unix timestamp>
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.scalar_summary("loss", cnn.loss)
        acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

        # Dev summaries
        # loss and accuracy only; no gradient summaries for evaluation
        dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables())

        # Initialize all variables
        sess.run(tf.initialize_all_variables())

        def train_step(x_batch, y_batch):
            """
            A single training step: runs train_op on one batch with the
            training keep probability, prints the step's loss and accuracy,
            and records the training summaries.
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates the model on one dev batch with dropout disabled
            (keep probability 1.0) and without running train_op.  Prints
            loss and accuracy and, when `writer` is given, records the dev
            summaries.
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)
        
        # Training loop. For each batch...
        for x_batch, y_batch in train_batches:
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            # clear the notebook output periodically so the per-step log stays short
            if current_step%50==0:
                clear_output()
            if (current_step > 0) and (current_step % evaluate_every == 0):
                # a fresh, unshuffled single pass over the `first` split
                dev_batches = fb.batch_iter(batch_size, num_epochs=1, shuffle=False, split='first')
                clear_output()
                print("\nEvaluation:")
                # NOTE(review): every dev batch writes its own summary at the
                # same global step; metrics are not averaged over the dev set.
                for d_i, (x_batch, y_batch) in enumerate(dev_batches):
                    dev_step(x_batch, y_batch, writer=dev_summary_writer)
                    if d_i%20==0:
                        clear_output()
                        print("\nEvaluation - cycle: {}".format(d_i))
                clear_output()
            if (current_step > 0) and (current_step % checkpoint_every == 0):
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))

2016-12-13T18:30:31.252325: step 43951, loss 0.696623, acc 0.75
2016-12-13T18:30:32.797298: step 43952, loss 0.780235, acc 0.65625
2016-12-13T18:30:34.364364: step 43953, loss 0.775759, acc 0.65625
2016-12-13T18:30:36.051557: step 43954, loss 0.486717, acc 0.828125
2016-12-13T18:30:37.567994: step 43955, loss 0.77579, acc 0.671875
2016-12-13T18:30:39.195823: step 43956, loss 0.792433, acc 0.640625
2016-12-13T18:30:40.825692: step 43957, loss 0.747814, acc 0.671875
2016-12-13T18:30:42.479582: step 43958, loss 0.740036, acc 0.75
2016-12-13T18:30:44.066846: step 43959, loss 0.670348, acc 0.71875


In [None]:
# Evaluation cell: rebuilds the same graph as the training cell (same model
# hyper-parameters and variable set) and creates a Saver over its variables.
# NOTE(review): the lines shown here do not restore a checkpoint or run any
# evaluation; presumably that follows - confirm.
#
# Hyper-parameters, repeated from the training cell
# defined here to enable easier updating/modifications
dropout_keep_prob = 0.5
batch_size = 64
num_epochs = 100
evaluate_every = 2000
checkpoint_every = 20000
dev_fraction = 0.001

# read in params again
with open('/share/yelp/params_voc_seq_num.pickle', 'rb') as fp:
    vocab, seq_len, num_lines = pickle.load(fp)

# load up a very similar graph for evaluations
with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=seq_len,
            num_classes=5,
            vocab_size=len(vocab),
            embedding_size=32,
            filter_sizes=[3,4,5],
            num_filters=128)

        # Define Training procedure
        # NOTE(review): train_op and the gradient summaries are built but not
        # used in the visible part of this cell.
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        
        # Keep track of gradient values and sparsity
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.merge_summary(grad_summaries)

        # Summaries for loss and accuracy
        loss_summary = tf.scalar_summary("loss", cnn.loss)
        acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)

        # Dev summaries
        dev_summary_op = tf.merge_summary([loss_summary, acc_summary])

        # Saver over every variable in this graph (model, global_step, optimizer slots)
        saver = tf.train.Saver(tf.all_variables())
