In [1]:
# Import libraries.
import re
import numpy as np
import pandas as pd
import csv
import tensorflow as tf
from tensorflow.contrib import learn
from collections import defaultdict
import os
import time
import datetime
from datetime import timedelta
from IPython.display import clear_output
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Load the data
csv_file = 'data/yelp_academic_dataset_review.csv'
review_data = pd.read_csv(csv_file)

In [12]:
review_count = len(review_data.index)
print 'The dataset contains %d reviews.' % review_count
print 'The dataset columns:'
print review_data.columns

The dataset contains 2685066 reviews.
The dataset columns:
Index([u'user_id', u'review_id', u'text', u'votes.cool', u'business_id',
       u'votes.funny', u'stars', u'date', u'type', u'votes.useful'],
      dtype='object')


In [13]:
# Create training dataset and test dataset

reviews = review_data['text'].tolist()
stars = review_data['stars'].tolist()
review_count = len(stars)
test_text, test_labels = reviews[:review_count/3], stars[:review_count/3]
train_text, train_labels = reviews[review_count/3:review_count*29/30], stars[review_count/3:review_count*29/30]
dev_text, dev_labels = reviews[:review_count/30], stars[:review_count/30]
print 'The training dataset contains %d review.' %(len(train_text))
print 'The test dataset contains %d review.' %(len(test_text))
print 'The dev dataset contains %d review.' %(len(dev_text))

print train_text[1], train_labels[1]

# Clear memory
review_data = None
reviews = None
stars = None

The training dataset contains 1700541 review.
The test dataset contains 895022 review.
The dev dataset contains 89502 review.
It's pretty simple. You take the test online, you pass, then you show up here and give them $15 and they take your pic and give you your TAM card. 
The online class is about 30-40 minutes. I guess it depends on how fast your read and your reading comprehension. You can also show up here and take the class as well. I recommend doing it online first. 3


In [32]:
def text_process(rawtext):

    # Remove special characters. Change to lower case.
    text = [re.sub(r"[^a-z0-9!\?\']", " ", s.lower()) for s in rawtext] 

    text = [re.sub(r"'s", " 's", s) for s in text]
    text = [re.sub(r"'ve", " 've", s) for s in text]
    text = [re.sub(r"n't", " n't", s) for s in text]
    text = [re.sub(r"'re", " 're", s) for s in text]
    text = [re.sub(r"!", " ! ", s) for s in text]
    text = [re.sub(r"\?", " \? ", s) for s in text]
    #text = [re.sub(r"\s{2,}", " ", s) for s in text]
    # Stemming the words
    #text = [re.sub(r'ing\b|tion\b|ly\b|ed\b|ious\b|ies\b|ive\b|es\b|s\b|ment\b|ingly\b|tions\b|ful\b|fully\b','', s) for s in text]        

    return text


In [33]:
train_text = text_process(train_text)
dev_text = text_process(dev_text)
#test_text = text_process(test_text)

print train_text[1]

it 's pretty simple  you take the test online  you pass  then you show up here and give them  15 and they take your pic and give you your tam card   the online class is about 30 40 minutes  i guess it depends on how fast your read and your reading comprehension  you can also show up here and take the class as well  i recommend doing it online first 


In [14]:
def label_process(labels):
  label_arr = np.zeros((len(labels), 5), dtype=np.int8)
  for i, star in enumerate(labels):
    label_arr[i, int(star)-1] = 1
  return label_arr

In [15]:
train_labels = label_process(train_labels)
dev_labels = label_process(dev_labels)
print train_labels[1], dev_labels[1]

[0 0 1 0 0] [0 0 0 0 1]


In [16]:
np.save('train_labels.npy', train_labels)
np.save('dev_labels.npy', dev_labels)

In [38]:
CV = CountVectorizer(stop_words = 'english', min_df = 0.00005)
CV.fit_transform(train_text)

<1700541x23918 sparse matrix of type '<type 'numpy.int64'>'
	with 71604346 stored elements in Compressed Sparse Row format>

In [40]:
words = CV.vocabulary_.keys()
words = [i.encode("utf-8") for i in words]

index_vocab = dict(enumerate(words))
vocab_index = {v:k for k,v in index_vocab.iteritems()}

np.save('index_vocab.npy', index_vocab)
np.save('vocab_index.npy', vocab_index)

CV = None
words = None

In [50]:
train_data = [[vocab_index.get(w) for w in r.split(' ')] for r in train_text]
train_data = [[w for w in r if w != None] for r in train_data]

dev_data = [[vocab_index.get(w) for w in r.split(' ')] for r in dev_text]
dev_data = [[w for w in r if w != None] for r in dev_data]

In [58]:
max_review_length = max(len(max(train_data, key=len)), len(max(dev_data, key=len)))
print max_review_length

620


In [89]:
def data_expand(data, max_review_length):
  review_count = len(data)
  data_arr = np.zeros((len(data), max_review_length), dtype=np.int16)
  for i in range(len(data)):
    data_arr[i, :len(data[i])] = data[i]
  return data_arr

In [94]:
train_data = data_expand(train_data, max_review_length)
dev_data = data_expand(dev_data, max_review_length)

[19053 10615 18210  2486 14942 22867  8613 11531  1731  2486  5247 16989
  3027 13008  7484  3943 10163 12691 16118  2836  5247  9821 14217  2486
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [95]:
np.save('train_data.npy', train_data)
np.save('dev_data.npy', dev_data)

In [2]:
vocab_index = np.load('vocab_index.npy').item()
dev_data = np.load('dev_data.npy')
train_data = np.load('train_data.npy')
dev_labels = np.load('dev_labels.npy')
train_labels = np.load('train_labels.npy')
max_review_length = 620

In [3]:
class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    
    Baseline model based off of example from WildML (link below)
    http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(3, pooled_outputs)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # CalculateMean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

In [4]:
# creates batches of the data so everything is not stored in memory at once
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data)/batch_size) + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield data[start_index:end_index]

In [6]:
#sets values for the default graph
#defined here to enable easier updating/modifications
dropout_keep_prob = 0.5
batch_size = 500  #64, 8192
num_epochs = 1 #200
evaluate_every = 400 #200
checkpoint_every = 400 #200
vocab_processor = learn.preprocessing.VocabularyProcessor(max_review_length)

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=max_review_length,
            num_classes=5,
            vocab_size=len(vocab_index)+1,
            embedding_size=128,
            filter_sizes=[3,4,5],
            num_filters=128)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        
        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.merge_summary(grad_summaries)


        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.scalar_summary("loss", cnn.loss)
        acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables())

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.initialize_all_variables())

        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        #curretnly using the mini set and dev set
        batches = batch_iter(list(zip(train_data, train_labels)), batch_size, num_epochs)
        #dev_batches = batch_iter(list(zip(dev_data, dev_labels)), 200, 1)
        
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % evaluate_every == 0:
                clear_output()
                print("\nEvaluation:")
                dev_batches = batch_iter(list(zip(dev_data, dev_labels)), batch_size, 1)
                for dev_batch in dev_batches:
                    dev_x_batch, dev_y_batch = zip(*dev_batch)
                    dev_step(dev_x_batch, dev_y_batch, writer=dev_summary_writer)
                #dev_step(dev_data, dev_labels, writer=dev_summary_writer)
                clear_output()
            if current_step % checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))

Saved model checkpoint to /home/ubuntu/266-Yelp/runs/1481616593/checkpoints/model-2400

2016-12-14T05:12:12.374554: step 2401, loss 0.980585, acc 0.526
2016-12-14T05:12:45.827062: step 2402, loss 0.96847, acc 0.566
2016-12-14T05:13:18.633000: step 2403, loss 0.988054, acc 0.56
2016-12-14T05:13:51.550991: step 2404, loss 0.913415, acc 0.554
2016-12-14T05:14:25.798584: step 2405, loss 0.892236, acc 0.594
2016-12-14T05:14:59.845407: step 2406, loss 0.886458, acc 0.594
2016-12-14T05:15:32.920905: step 2407, loss 0.919317, acc 0.596
2016-12-14T05:16:06.573031: step 2408, loss 0.779488, acc 0.72
2016-12-14T05:16:40.934638: step 2409, loss 0.789995, acc 0.702
2016-12-14T05:17:14.934242: step 2410, loss 0.915428, acc 0.588
2016-12-14T05:17:44.359897: step 2411, loss 0.876896, acc 0.604
2016-12-14T05:18:16.072451: step 2412, loss 1.01668, acc 0.572
2016-12-14T05:18:45.434911: step 2413, loss 0.762405, acc 0.692
2016-12-14T05:19:20.014371: step 2414, loss 0.908844, acc 0.628
2016-12-14T05:20:00.

KeyboardInterrupt: 

In [None]:
test_text = text_process(test_text)
test_labels = label_process(test_labels)

test_data = [[vocab_index.get(w) for w in r.split(' ')] for r in test_text]
test_data = [[w for w in r if w != None] for r in test_data]
test_data = data_expand(test_data, max_review_length)


In [None]:
checkpoint_dir="runs/1481516781/checkpoints/"
checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
batch_size = 1000
graph = tf.Graph()