In [1]:
import os

import numpy as np
import tensorflow as tf

In [2]:
# Load the raw review text and the matching sentiment labels as two strings.
with open('../sentiment-network/reviews.txt', 'r') as reviews_file, \
        open('../sentiment-network/labels.txt', 'r') as labels_file:
    reviews = reviews_file.read()
    labels = labels_file.read()

In [3]:
# Preview the first 2000 characters of the raw review text.
reviews[:2000]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \nstory of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is tu

In [7]:
# Preview the first 200 characters of the raw labels text.
labels[:200]

'positive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npo'

In [8]:
from string import punctuation

# Strip punctuation in a single translate() pass rather than testing every
# character of the corpus in a Python-level loop; the resulting text is identical.
all_text = reviews.translate(str.maketrans('', '', punctuation))

# Each line of the file is one review.
# NOTE: this rebinds `reviews` from one string to a list of strings. Re-running
# the cell without reloading the raw text now raises AttributeError (lists have
# no translate()) instead of silently producing a corrupted review list.
reviews = all_text.split('\n')

# Re-join the reviews with spaces so that newlines no longer separate words,
# then split on whitespace to obtain the flat list of all words.
all_text = ' '.join(reviews)
words = all_text.split()

In [9]:
# Preview the punctuation-free text; reviews are now separated by spaces instead of newlines.
all_text[:2000]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t    story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  violent m

In [10]:
# Preview the first 20 entries of the flat word list.
words[:20]

['bromwell',
 'high',
 'is',
 'a',
 'cartoon',
 'comedy',
 'it',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 'such']

In [11]:
# Build the vocabulary: every distinct word, most frequent first, mapped to an
# integer id. Ids start at 1, which leaves 0 unused by any word.
from collections import Counter
counts = Counter(words)
vocab = [word for word, _ in counts.most_common()]
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))
print(len(vocab_to_int))

74072


In [13]:
# Sanity-check the vocabulary ordering and the ids assigned to two sample words.
print("type of vocab", type(vocab))
print("first word:", vocab[0])
print("last word:", vocab[-1])
for probe in ("the", "sensate"):
    print(vocab_to_int[probe])

type of vocab <class 'list'>
first word: the
last word: sensate
1
74072


In [14]:
# Encode every review as the list of integer ids of its words. The outer list
# lines up one-to-one with `reviews`.
reviews_ints = [
    [vocab_to_int[word] for word in review.split()]
    for review in reviews
]

print(len(reviews_ints))

25001


In [15]:
# Split the raw labels text into one label string per line.
# NOTE: this rebinds `labels` from a str to a list, so the cell cannot be re-run
# without first reloading the raw labels text.
labels = labels.split('\n')
print(labels[:20])

['positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative']


In [16]:
# Encode the sentiment labels numerically: 1 for 'positive', 0 for anything else.
labels = np.array([int(label == 'positive') for label in labels])
print(len(labels))

25001


In [17]:
from collections import Counter

# Histogram of review lengths: each key is a length in words, each value is the
# number of reviews with that length.
review_lens = Counter(map(len, reviews_ints))
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [18]:
# Collect the positions of all reviews that contain at least one word.
non_zero_idx = [idx for idx, review in enumerate(reviews_ints) if review]
len(non_zero_idx)

25000

In [19]:
# Inspect the last encoded review.
reviews_ints[-1]

[]

In [20]:
# Keep only the reviews, and their labels, at the positions listed in non_zero_idx.
kept_reviews = []
kept_labels = []
for idx in non_zero_idx:
    kept_reviews.append(reviews_ints[idx])
    kept_labels.append(labels[idx])
reviews_ints = kept_reviews
labels = np.array(kept_labels)

In [25]:
# Show the number of remaining reviews, the lengths of the first three, and the label shape.
print(len(reviews_ints))
for position in range(3):
    print(len(reviews_ints[position]))
print(labels.shape)

25000
140
114
447
(25000,)


In [26]:
# The integer-encoded reviews are used as features as they are; this is an alias,
# not a copy, and no padding or truncation is applied here.
features = reviews_ints

In [27]:
# The first 80% of the data is used for training.
split_frac = 0.8
split = int(len(features) * split_frac)
train_x, train_y = features[:split], labels[:split]
holdout_x, holdout_y = features[split:], labels[split:]

# The remaining data is halved into a validation set and a test set.
split = int(len(holdout_x) * 0.5)
val_x, val_y = holdout_x[:split], holdout_y[:split]
test_x, test_y = holdout_x[split:], holdout_y[split:]

In [None]:
# Report how many examples landed in each split. Features and labels are listed
# separately so every number carries the right caption (the label counts were
# previously printed under a second "Train set" caption or with no caption).
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(len(train_x)),
      "\nValidation set: \t{}".format(len(val_x)),
      "\nTest set: \t\t{}".format(len(test_x)))

print("\t\t\tLabel Shapes:")
print("Train set: \t\t{}".format(len(train_y)),
      "\nValidation set: \t{}".format(len(val_y)),
      "\nTest set: \t\t{}".format(len(test_y)))

# Every split must pair each review with exactly one label.
assert (len(train_x), len(val_x), len(test_x)) == (len(train_y), len(val_y), len(test_y)), \
    "features and labels are misaligned"

## Build the graph

Here, we'll build the graph. First up, defining the hyperparameters.

* `lstm_size`: Number of units in the hidden layers in the LSTM cells. Larger values usually give better performance. Common values are 128, 256, 512, etc.
* `lstm_layers`: Number of LSTM layers in the network. I'd start with 1, then add more if I'm underfitting.
* `batch_size`: The number of reviews to feed the network in one training pass. Typically this should be set as high as you can go without running out of memory.
* `learning_rate`: The learning rate passed to the optimizer.

In [58]:
# Hyperparameters of the network.
# Units in the hidden layer of each LSTM cell.
lstm_size = 256 
# Number of stacked LSTM layers.
lstm_layers = 1
# batch_size = 500
# One review per training step: the training loop reshapes each review to
# (1, -1) and feeds it on its own, without padding.
batch_size = 1
# Learning rate handed to the Adam optimizer.
learning_rate = 0.001

In [59]:
# Size of the embedding table: word ids start at 1 and id 0 is reserved for
# padding, hence one more row than there are words.
n_words = 1 + len(vocab_to_int)

# The model is built in its own graph object rather than the global default graph.
graph = tf.Graph()
with graph.as_default():
    # Word ids of the reviews, shaped (batch_size, number_of_steps).
    inputs_ = tf.placeholder(dtype=tf.int32, shape=[batch_size, None], name="inputs")
    # Target labels; the second dimension is left open.
    labels_ = tf.placeholder(dtype=tf.int32, shape=[batch_size, None], name="labels")
    # Keep probability used by the dropout wrapper around the LSTM cells.
    keep_prob = tf.placeholder(dtype=tf.float32, name="keep_prob")

In [60]:
# Length of each embedding vector (number of units in the embedding layer).
embed_dim = 300

with graph.as_default():
    # Lookup table of shape [n_words, embed_dim]: row i is the embedding vector of
    # word id i. It starts out with uniform random values between -1 and 1.
    initial_embedding = tf.random_uniform(shape=(n_words, embed_dim), minval=-1, maxval=1)
    embedding = tf.Variable(initial_embedding, name="embedding")

    # embedding_lookup replaces every id in inputs_ by the matching row of the
    # table and returns the results as one dense tensor of shape
    # shape(ids) + shape(params)[1:].
    # With inputs_ of shape (batch_size, step_num) and a table of shape
    # [n_words, embed_dim], the result has shape (batch_size, step_num, embed_dim).
    embed = tf.nn.embedding_lookup(params=embedding, ids=inputs_)

In [61]:
with graph.as_default():

    def build_cell(lstm_size, keep_prob):
        """Create one LSTM layer with dropout applied to its outputs.

        Args:
            lstm_size: number of units in the LSTM cell.
            keep_prob: keep probability for the dropout on the cell outputs.

        Returns:
            The BasicLSTMCell wrapped in a DropoutWrapper.
        """
        return tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.BasicLSTMCell(lstm_size),
            output_keep_prob=keep_prob,
        )

    # One freshly built cell per layer, stacked into a single multi-layer cell.
    layer_cells = [build_cell(lstm_size, keep_prob) for _ in range(lstm_layers)]
    cell = tf.contrib.rnn.MultiRNNCell(layer_cells)

    # All-zero starting state; the cell needs the batch size to size its
    # internal state.
    initial_state = cell.zero_state(batch_size, tf.float32)

In [62]:
with graph.as_default():
    
    # dynamic_rnn takes an argument named 'inputs' that should have the shape
    # [batch_size, max_time, ...], where max_time is the number of steps.
    # The first two dimensions must match across all the inputs,
    # but otherwise the ranks and other shape components may differ.
    # Here the inputs are the embedded reviews and the run starts from initial_state.
    
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)

In [63]:
with graph.as_default():
    # Only the RNN output of the last step is passed to the classifier, a fully
    # connected layer with a single sigmoid unit.
    last_output = outputs[:, -1]
    predictions = tf.contrib.layers.fully_connected(
        inputs=last_output, num_outputs=1, activation_fn=tf.sigmoid)

    # Mean squared error between the targets and the sigmoid outputs,
    # minimized with Adam.
    cost = tf.losses.mean_squared_error(labels=labels_, predictions=predictions)

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

In [64]:
with graph.as_default():
    # Round the sigmoid output to an integer class and compare it with the target;
    # accuracy is the fraction of matches.
    predicted_class = tf.cast(tf.round(predictions), tf.int32)
    correct_pred = tf.equal(predicted_class, labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [74]:
epochs = 4

with graph.as_default():
    saver = tf.train.Saver()

# Make sure the checkpoint directory exists before training starts, so that a
# missing directory cannot cost the whole run when the model is saved at the end.
os.makedirs("checkpoints", exist_ok=True)

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # Every epoch starts from the all-zero LSTM state.
        state = sess.run(initial_state)

        for ii in range(len(train_x)):

            # train_x[ii] is the list of word ids of one review and train_y[ii] is
            # a scalar label. Both are converted to numpy arrays and reshaped to
            # 2-D arrays whose first dimension has size 1, i.e. a batch holding a
            # single review and a label of shape [batch_size, 1].
            x = np.asarray(train_x[ii]).reshape(1, -1)
            y = np.asarray(train_y[ii]).reshape(1, -1)
            feed = {inputs_: x,
                    labels_: y,
                    keep_prob: 0.5,
                    initial_state: state}
            # The final state of this step is fed back in as the initial state of
            # the next step.
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            if iteration%2000==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%10000==0:
                val_acc = []
                # Evaluate the existing zero-state op instead of calling
                # cell.zero_state() again, which would add new nodes to the graph
                # on every validation pass.
                val_state = sess.run(initial_state)
                for jj in range(len(val_y)):
                    x = np.asarray(val_x[jj]).reshape(1, -1)
                    y = np.asarray(val_y[jj]).reshape(1, -1)
                    # Dropout is switched off (keep_prob of 1) for evaluation.
                    feed = {inputs_: x,
                            labels_: y,
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/4 Iteration: 2000 Train loss: 0.028
Epoch: 0/4 Iteration: 4000 Train loss: 0.379
Epoch: 0/4 Iteration: 6000 Train loss: 0.011
Epoch: 0/4 Iteration: 8000 Train loss: 0.021
Epoch: 0/4 Iteration: 10000 Train loss: 0.014
Val acc: 0.792
Epoch: 0/4 Iteration: 12000 Train loss: 0.106
Epoch: 0/4 Iteration: 14000 Train loss: 0.393
Epoch: 0/4 Iteration: 16000 Train loss: 0.047
Epoch: 0/4 Iteration: 18000 Train loss: 0.022
Epoch: 0/4 Iteration: 20000 Train loss: 0.030
Val acc: 0.817
Epoch: 1/4 Iteration: 22000 Train loss: 0.005
Epoch: 1/4 Iteration: 24000 Train loss: 0.007
Epoch: 1/4 Iteration: 26000 Train loss: 0.000
Epoch: 1/4 Iteration: 28000 Train loss: 0.001
Epoch: 1/4 Iteration: 30000 Train loss: 0.001
Val acc: 0.821
Epoch: 1/4 Iteration: 32000 Train loss: 0.054
Epoch: 1/4 Iteration: 34000 Train loss: 0.000
Epoch: 1/4 Iteration: 36000 Train loss: 0.008
Epoch: 1/4 Iteration: 38000 Train loss: 0.002
Epoch: 1/4 Iteration: 40000 Train loss: 0.000
Val acc: 0.820
Epoch: 2/4 Iteration: 42

In [77]:
# Evaluate the most recently saved checkpoint on the held-out test set.
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    # Evaluate the graph's existing zero-state op rather than building a new one
    # with cell.zero_state(), matching how the training loop obtains its state.
    test_state = sess.run(initial_state)
    for zz in range(len(test_x)):
        # One review per step, reshaped to a batch of size 1.
        x = np.asarray(test_x[zz]).reshape(1, -1)
        y = np.asarray(test_y[zz]).reshape(1, -1)
        # Dropout is switched off (keep_prob of 1) for evaluation.
        feed = {inputs_: x,
                labels_: y,
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints/sentiment.ckpt
Test accuracy: 0.814
