In [1]:
import tensorflow as tf
import numpy as np
import utils 

In [2]:
# Load the raw review corpus and its sentiment labels through the project helper.
# `labels` is later split on '\n', so it is presumably one newline-delimited
# string — TODO confirm against utils.load_text.
reviews, labels = utils.load_text()

In [3]:
# Clean the raw reviews and collect `words`, which feeds the lookup-table builder.
# NOTE: `reviews` is rebound to the processed result, so re-running this cell
# would preprocess already-processed text; restart from the loading cell instead.
reviews, words = utils.preprocess(reviews)
# Sanity check: show the first processed review.
print(reviews[0])

Remove punctuation......
bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   


In [4]:
# Build the word -> id and id -> word tables. Ids start at 1 rather than 0;
# id 0 is reserved for padding when the feature matrix is built.
vocab_to_int, int_to_vocab = utils.create_lookup_tabels(words)
# Sanity check: the words behind ids 1 through 5.
print([int_to_vocab[word_id] for word_id in range(1, 6)])

['the', 'and', 'a', 'of', 'to']


In [5]:
# Turn the newline-delimited label text into a 0/1 integer array
# (1 for 'positive', 0 for anything else, including blank lines).
labels = np.array([1 if tag == 'positive' else 0 for tag in labels.split('\n')])
# Show the first five labels, then the array shape, one per line.
print(labels[:5], labels.shape, sep='\n')

[1 0 1 0 1]
(25001,)


In [6]:
# Encode every review as the list of vocabulary ids of its whitespace-separated
# words. A word missing from vocab_to_int raises KeyError, as in a plain lookup.
reviews_ints = [
    [vocab_to_int[word] for word in review.split()]
    for review in reviews
]

In [7]:
# Snapshot the unfiltered encodings so reviews and labels are filtered against
# the same reference list and stay aligned by position.
reviews_ints_ = list(reviews_ints)
print(len(reviews_ints_))

# Keep only the labels whose review encoded to at least one id ...
labels = np.array(
    [labels[idx] for idx, encoded in enumerate(reviews_ints_) if encoded]
)
# ... and drop the matching zero-length reviews themselves.
reviews_ints = [encoded for encoded in reviews_ints_ if encoded]

# Both counts must agree after filtering.
print(len(reviews_ints), len(labels), sep='\n')

25001
25000
25000


In [8]:
# Fixed sequence length fed to the network.
seq_len = 200

# One row per review, pre-filled with the padding id 0.
features = np.zeros((len(reviews_ints), seq_len), dtype=np.int32)

# Left-pad short reviews and truncate long ones to their first seq_len ids.
# Rows yielded by iterating `features` are views, so writes land in the matrix.
# Relies on zero-length reviews having been removed beforehand: with 0 kept
# ids the slice `[-0:]` would address the whole row.
for row, encoded in zip(features, reviews_ints):
    kept = min(seq_len, len(encoded))
    row[-kept:] = encoded[:kept]

# Sanity check: first 100 positions of the first review.
print(features[0, :100])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
 21909   308     6     3  1051   207     8  2143    32     1   171    57
    15    49    81  5793    44   382   110   140    15  5224    60   154
     9     1  4985  5902   475    71     5   260    12 21909   308    13
  1980     6    74  2406]


In [9]:
# Partition the padded features and their labels into train / validation / test
# sets using the fractions 0.8 / 0.1 / 0.1 (the helper prints the resulting shapes).
# NOTE(review): whether split_data shuffles before splitting is not visible here — confirm.
train_x, train_y, valid_x, valid_y, test_x, test_y = utils.split_data(features, labels, 0.8, 0.1, 0.1)

Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200) 
All set: 		(25000, 200)


In [10]:
# --- Model architecture ---
lstm_size, lstm_layers = 256, 1   # units per LSTM layer, number of stacked layers
embed_size = 300                  # width of each word embedding vector

# --- Optimisation ---
batch_size, learning_rate = 500, 0.001   # reviews per step, Adam learning rate

In [11]:
# Vocabulary ids start at 1 and id 0 is the padding token, hence the extra row.
n_words = 1 + len(vocab_to_int)

# Graph inputs. The batch dimension is fixed to batch_size, so every fed batch
# must contain exactly that many reviews.
inputs_ = tf.placeholder(tf.int32, [batch_size, seq_len], name="inputs")
labels_ = tf.placeholder(tf.float32, [batch_size, 1], name="labels")
# Dropout keep probability, fed per run.
keep_prob = tf.placeholder(tf.float32, name="keep_prob")
# Counter that the optimizer increments once per training step.
step = tf.Variable(0, name="global_step", dtype=tf.int32)

In [12]:
# Trainable embedding matrix: one embed_size-wide row per vocabulary id
# (including the padding id 0), initialised uniformly in [-1, 1).
embedding = tf.Variable(
    tf.random_uniform([n_words, embed_size], minval=-1, maxval=1),
    name="embedding",
)
# Replace every word id in the input batch with its embedding row.
embed = tf.nn.embedding_lookup(params=embedding, ids=inputs_)

In [13]:
def build_cell(lstm_size, keep_prob):
    """Create one LSTM layer whose outputs pass through dropout.

    Args:
        lstm_size: number of units in the BasicLSTMCell.
        keep_prob: output keep probability handed to the DropoutWrapper.

    Returns:
        The DropoutWrapper wrapping the freshly created BasicLSTMCell.
    """
    return tf.contrib.rnn.DropoutWrapper(
        tf.contrib.rnn.BasicLSTMCell(lstm_size),
        output_keep_prob=keep_prob,
    )
# Stack lstm_layers dropout-wrapped LSTM layers into a single multi-layer cell.
cell = tf.contrib.rnn.MultiRNNCell(
    [build_cell(lstm_size, keep_prob) for _ in range(lstm_layers)]
)
# All-zero starting state for a full batch; it can also be overridden via feed_dict.
initial_state = cell.zero_state(batch_size, dtype=tf.float32)

# Unroll the cell over the embedded sequence; `outputs` holds every time step,
# `final_state` the state after the last one.
outputs, final_state = tf.nn.dynamic_rnn(
    cell, inputs=embed, initial_state=initial_state
)

In [14]:
# RNN output shape: [batch, time, lstm_size].
print(outputs.shape.as_list())

# Classify from the output of the last time step only, with a single linear unit.
logits = tf.contrib.layers.fully_connected(
    inputs=outputs[:, -1], num_outputs=1, activation_fn=None
)
print(logits.shape.as_list())
predictions = tf.sigmoid(logits, name="predictions")

# Mean binary cross-entropy, computed on the raw logits for numerical stability.
cost = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels_),
    name="cost",
)
tf.summary.scalar('cost', cost)

# Adam step; also increments the global `step` counter.
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
    cost, global_step=step, name="opt"
)

# A prediction counts as correct when the rounded probability equals the label.
correct_pred = tf.equal(tf.round(predictions), labels_)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="accuracy")

# Single op bundling every summary registered so far.
summary = tf.summary.merge_all()

[500, 200, 256]
[500, 1]


In [15]:
# Number of full passes over the training set.
epochs = 10

with tf.Session() as sess:
    # `saver` stays bound after this cell and is reused by the test cell to restore.
    saver = tf.train.Saver()
    writer = tf.summary.FileWriter('data/sentiment_graph', sess.graph)
    try:
        sess.run(tf.global_variables_initializer())
        for e in range(epochs):
            # Every epoch starts from the all-zero LSTM state.
            state = sess.run(initial_state)
            # NOTE(review): the final state of one batch is fed as the initial
            # state of the next, although consecutive batches appear to hold
            # unrelated reviews — confirm this carry-over is intended.
            for x, y in utils.get_batches(train_x, train_y, batch_size):
                feed = {inputs_: x,
                        # Add a trailing axis to match the [batch_size, 1] placeholder.
                        labels_: y[:, None],
                        keep_prob: 0.5,
                        initial_state: state}
                loss, state, summaries, global_step, _ = sess.run(
                    [cost, final_state, summary, step, optimizer], feed_dict=feed)
                writer.add_summary(summaries, global_step=global_step)

                if global_step % 5 == 0:
                    print("Epoch: {}/{}".format(e+1, epochs),
                          "Iteration: {}".format(global_step),
                          "Train loss: {:.3f}".format(loss))

                if global_step % 25 == 0:
                    saver.save(sess, "data/sentiment_checkpoints/sentiment.ckpt", global_step=step)
                    val_acc = []
                    # Reuse the zero-state tensors already in the graph. Calling
                    # cell.zero_state() here would add new ops to the graph on
                    # every validation pass.
                    val_state = sess.run(initial_state)
                    for val_x, val_y in utils.get_batches(valid_x, valid_y, batch_size):
                        feed = {inputs_: val_x,
                                labels_: val_y[:, None],
                                keep_prob: 1,  # no dropout while evaluating
                                initial_state: val_state}
                        batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                        val_acc.append(batch_acc)
                    print("Val acc: {:.3f}".format(np.mean(val_acc)))
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.flush()
        writer.close()

Epoch: 1/10 Iteration: 5 Train loss: 0.672
Epoch: 1/10 Iteration: 10 Train loss: 0.683
Epoch: 1/10 Iteration: 15 Train loss: 0.615
Epoch: 1/10 Iteration: 20 Train loss: 0.627
Epoch: 1/10 Iteration: 25 Train loss: 0.653
Val acc: 0.610
Epoch: 1/10 Iteration: 30 Train loss: 0.651
Epoch: 1/10 Iteration: 35 Train loss: 0.639
Epoch: 1/10 Iteration: 40 Train loss: 0.597
Epoch: 2/10 Iteration: 45 Train loss: 0.641
Epoch: 2/10 Iteration: 50 Train loss: 0.600
Val acc: 0.655
Epoch: 2/10 Iteration: 55 Train loss: 0.580
Epoch: 2/10 Iteration: 60 Train loss: 0.578
Epoch: 2/10 Iteration: 65 Train loss: 0.553
Epoch: 2/10 Iteration: 70 Train loss: 0.573
Epoch: 2/10 Iteration: 75 Train loss: 0.539
Val acc: 0.743
Epoch: 2/10 Iteration: 80 Train loss: 0.529
Epoch: 3/10 Iteration: 85 Train loss: 0.344
Epoch: 3/10 Iteration: 90 Train loss: 0.462
Epoch: 3/10 Iteration: 95 Train loss: 0.380
Epoch: 3/10 Iteration: 100 Train loss: 0.452
Val acc: 0.736
Epoch: 3/10 Iteration: 105 Train loss: 0.439
Epoch: 3/10 Ite

In [18]:
# Per-batch accuracies on the held-out test set.
test_acc = []
with tf.Session() as sess:
    # NOTE(review): the checkpoint step is hard-coded. tf.train.Saver keeps only
    # the most recent checkpoints by default, so this file must still exist
    # after training — adjust the step if epochs or the save interval change.
    saver.restore(sess, 'data/sentiment_checkpoints/sentiment.ckpt-300')
    # Reuse the zero-state tensors already in the graph instead of calling
    # cell.zero_state() again, which would add new ops each time this cell runs.
    test_state = sess.run(initial_state)
    for x, y in utils.get_batches(test_x, test_y, batch_size):
        feed = {inputs_: x,
                # Add a trailing axis to match the [batch_size, 1] placeholder.
                labels_: y[:, None],
                keep_prob: 1,  # no dropout while evaluating
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from data/sentiment_checkpoints/sentiment.ckpt-300
Test accuracy: 0.822
