## Read Yelp reviews and test on a small chunk

In [13]:
import tensorflow as tf
import numpy as np
import pandas as pd
import time
import yelp_util
import os
from itertools import chain

In [2]:
# Load the pickled Yelp review table from the local data directory.
yelp_review = pd.read_pickle('data/yelp_academic_dataset_review.pickle')

In [3]:
# Take the review text at row positions 10000-10999 (1,000 reviews) as a small sample.
review_list = list(yelp_review.text.iloc[10000:11000])

In [21]:
# Build the vocabulary dictionary and the tokenized sentences from the sample reviews.
worddict, tksents = yelp_util.create_vocab(review_list)

Building dictionary..
(134946, ' total words ', 10769, ' unique words')


In [22]:
# Convert the tokenized sentences to word ids, then chain the per-sentence
# id sequences together into one flat array.
review_words_stream = np.array(
    list(chain.from_iterable(yelp_util.word2id(tksents, worddict)))
)

In [36]:
# Start from the default hyperparameters and override the ones used in this run.
args = yelp_util.InputParameter()

# Data-dependent setting: one slot per word id, where ids run from 0 to max().
args.vocab_size = review_words_stream.max() + 1

# Batching.
args.batch_size = 30
args.seq_length = 50

# Network size.
args.rnn_size = 200
args.num_layers = 2

In [30]:
# Clear TensorFlow's default graph (run this before rebuilding the model).
# NOTE(review): `tf.ops` does not appear to be a documented public module; the
# documented call is `tf.reset_default_graph()` - confirm which one the
# installed TensorFlow version provides.
tf.ops.reset_default_graph()

In [31]:
# Instantiate the review model with the hyperparameters set in `args`.
model = yelp_util.ReviewModel(args)

In [33]:
# review_words_stream is the flat array of vocabulary word ids (from
# yelp_util.word2id); the streamer is configured with the batch size and
# sequence length from `args`.
sequence_streamer = yelp_util.SeqStream(review_words_stream, args.batch_size, args.seq_length)

In [34]:
# Boilerplate: create the variable-initialization op, open a session,
# and run the initializer in it.
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)

In [None]:
# Training loop: args.num_epochs passes over the batches served by
# sequence_streamer, logging progress every 500 global steps.
for e in xrange(args.num_epochs):
    # Decay the learning rate per epoch: learning_rate * decay_rate ** epoch.
    decayed_lr = args.learning_rate * (args.decay_rate ** e)
    sess.run(tf.assign(model.lr, decayed_lr))

    # Restart the batch stream and take the model's initial state.
    sequence_streamer.reset_batch_pointer()
    state = model.initial_state.eval(session=sess)

    for b in xrange(sequence_streamer.num_batches):
        start = time.time()
        x, y = sequence_streamer.next_batch()
        # Feed the final state of the previous batch in as this batch's initial state.
        feed = {
            model.input_data: x,
            model.targets: y,
            model.initial_state: state,
        }
        fetches = [model.cost, model.final_state, model.train_op]
        train_loss, state, _ = sess.run(fetches, feed)
        end = time.time()

        # Global step counted across all epochs.
        step = e * sequence_streamer.num_batches + b
        if step % 500 == 0:
            total_steps = args.num_epochs * sequence_streamer.num_batches
            message = "{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
            print(message.format(step, total_steps, e, train_loss, end - start))

## Save TensorFlow model

In [31]:
# Save every variable in the graph to a checkpoint file.
saver = tf.train.Saver(tf.all_variables())
# Write the checkpoint into the same relative `data` directory the reviews were
# loaded from, rather than an absolute path inside one user's home directory,
# so the notebook also runs on other machines when started from the repo root.
checkpoint_path = os.path.join('data', 'model.ckpt')
saver.save(sess, checkpoint_path)