In [1]:
import numpy as np
import tensorflow as tf

### Read Files

In [2]:
with open('../sentiment-network/reviews.txt','r') as f:
    reviews = f.read()
with open('../sentiment-network/labels.txt', 'r') as f:
    labels = f.read()

### Preprocessing

#### Remove Punctuations and Newline

In [3]:
from string import punctuation
all_text = ''.join([c for c in reviews if c not in punctuation])
reviews = all_text.split('\n')
all_text = ''.join(reviews)
words = all_text.split()

In [4]:
reviews[0]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   '

In [5]:
words[:20]

['bromwell',
 'high',
 'is',
 'a',
 'cartoon',
 'comedy',
 'it',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 'such']

#### Build Vocab to Integer and Integer to Vocab Dictionaries

In [6]:
from collections import Counter
#Create a dictionary that maps vocab words to integers
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)

In [7]:
# the most frequent 5 words
vocab[0:5]

['the', 'and', 'a', 'of', 'to']

In [8]:
vocab_to_int = {word: i for i, word in enumerate(vocab,start=1)}
int_to_vocab = {vocab_to_int[word]: word for word in words}

In [9]:
int_to_vocab[22011]

'souped'

In [10]:
vocab_to_int['bromwell']

21837

#### Convert Reviews into Integers use Dictionary

In [11]:
reviews_ints = []
for review in reviews:
    reviews_ints.append([vocab_to_int[word] for word in review.split()])

#### Convert Labels into One-hot

In [12]:
labels[0:100]

'positive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nn'

In [13]:
# Convert labels to 1s and 0s for 'positive' and 'negative'
labels = labels.split('\n')
labels_onehot = []
for label in labels:
    if (label == "positive"):
        labels_onehot.append(1)
    else:
        labels_onehot.append(0)
labels = labels_onehot
labels[0:10]

[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

#### Make Sure All Reviews Are the Same Length, So RNN can be Used

In [57]:
review_lens = Counter([len(x) for x in reviews_ints])
print("Minimum reviews length: {}".format(min(review_lens)))
print("Maximum review length: {}".format(max(review_lens)))

Minimum reviews length: 0
Maximum review length: 2514


#### Try to remove the zero-length review 

In [14]:
for i in range(len(reviews_ints)):
    if (len(reviews_ints[i]) == 0):
        print(i)
        del reviews_ints[i]
        del labels[i]

25000


#### Break Very Long Reviews into Several Shorter Reviews

In [15]:
for i in range(len(reviews_ints)):
    while (len(reviews_ints[i]) > 200):
        reviews_ints.append(reviews_ints[i][0:200])
        reviews_ints[i] = reviews_ints[i][200:]
        labels.append(labels[i])

#### Pad Short Reviews with Space, so Every Review has length 200

In [16]:
for i in range(len(reviews_ints)):
    if (len(reviews_ints[i]) < 200):
        reviews_ints[i] = reviews_ints[i] + [0] * (200 - len(reviews_ints[i]))

#### Test Every Review has length 200

In [67]:
review_lens = Counter([len(x) for x in reviews_ints])
print("Minimum reviews length: {}".format(min(review_lens)))
print("Maximum review length: {}".format(max(review_lens)))

Minimum reviews length: 200
Maximum review length: 200


#### Convert Everything to Numpy Array

In [17]:
features = np.array(reviews_ints)
labels = np.array(labels)

#### Try to Free Some Variables

In [18]:
dir()

['Counter',
 'In',
 'Out',
 '_',
 '_10',
 '_12',
 '_13',
 '_4',
 '_5',
 '_7',
 '_9',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_i',
 '_i1',
 '_i10',
 '_i11',
 '_i12',
 '_i13',
 '_i14',
 '_i15',
 '_i16',
 '_i17',
 '_i18',
 '_i2',
 '_i3',
 '_i4',
 '_i5',
 '_i6',
 '_i7',
 '_i8',
 '_i9',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 'all_text',
 'counts',
 'exit',
 'f',
 'features',
 'get_ipython',
 'i',
 'int_to_vocab',
 'label',
 'labels',
 'labels_onehot',
 'np',
 'punctuation',
 'quit',
 'review',
 'reviews',
 'reviews_ints',
 'tf',
 'vocab',
 'vocab_to_int',
 'words']

### Divide Data Set into Train, Validation, Test

In [22]:
split_frac = 0.8
split_upto = int(split_frac * len(labels))

train_x, val_x = features[:split_upto], features[split_upto:]
train_y, val_y = labels[:split_upto], labels[split_upto:]

split_upto = int((1-split_frac)/2.0 * len(labels))
val_x, test_x = val_x[:split_upto], val_x[split_upto:]
val_y, test_y = val_y[:split_upto], val_y[split_upto:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(33136, 200) 
Validation set: 	(4141, 200) 
Test set: 		(4143, 200)


### Preprocessing Done, Build the Model

In [72]:
lstm_size = 256
lstm_layers = 1
batch_size = 100
learning_rate = 0.01

In [73]:
n_words = len(vocab_to_int) + 1 # add 1 because we use '0' as padding, and '0' is not in vocab_to_int

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, (None, 200))
    labels_ = tf.placeholder(tf.int32, (None))
    keep_prob = tf.placeholder(tf.float32)

In [74]:
with tf.Session(graph=graph) as sess:
    print(sess.run(inputs_, {inputs_: features[0].reshape(-1,200)}))
    print(sess.run(labels_, {labels_: labels[0]}))

[[21837   308     6     3  1050   207     8  2139    32     1   171    57
     15    49    81  5824    44   382   110   140    15  5216    60   154
      9     1  4976  5875   475    71     5   260    12 21837   308    13
   1981     6    74  2399     5   614    73     6  5216     1 24959     5
   1990 10208     1  5809  1504    36    51    66   204   145    67  1201
   5216 20494     1 38811     4     1   221   883    31  3005    71     4
      1  5791    10   686     2    67  1504    54    10   216     1   384
      9    62     3  1406  3690   783     5  3501   180     1   382    10
   1213 13621    32   308     3   349   341  2926    10   143   127     5
   7693    30     4   129  5216  1406  2336     5 21837   308    10   528
     12   109  1448     4    60   543   102    12 21837   308     6   227
   4150    48     3  2219    12     8   215    23     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0 

In [75]:
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 200

with graph.as_default():
    embedding = tf.Variable(tf.truncated_normal((n_words, embed_size), stddev = 0.05))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [76]:
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(embed, {inputs_: features[0:batch_size].reshape(-1,200)}).shape)

(100, 200, 200)


In [77]:
def build_layer(lstm_out_size, keep_prob):
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_out_size)
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob = keep_prob)
    return drop
    
with graph.as_default():
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell(
        [build_layer(lstm_size, keep_prob) for _ in range(lstm_layers)])
    
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

In [78]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
                                             initial_state=initial_state)

In [79]:
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    feed_dict = {inputs_: features[0:batch_size].reshape(-1,200), keep_prob: 0.7}
    print(sess.run(outputs, feed_dict = feed_dict).shape)
    print(sess.run(outputs, feed_dict = feed_dict)[:,-1].shape)

(100, 200, 256)
(100, 256)


In [80]:
with graph.as_default():
    '''logits = tf.layers.dense(outputs[:,-1], 1)
    predictions = tf.nn.sigmoid(logits)
    cost_per = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=tf.cast( tf.reshape(labels_, (batch_size,1)), tf.float32))
    cost = tf.reduce_mean(cost_per)'''
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)

In [81]:
with graph.as_default():
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))

In [82]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [83]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [84]:
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y,
                    keep_prob: 0.7,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y,
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.2507499158382416
Epoch: 0/10 Iteration: 10 Train loss: 0.25012221932411194
Epoch: 0/10 Iteration: 15 Train loss: 0.26045557856559753
Epoch: 0/10 Iteration: 20 Train loss: 0.2594509422779083
Epoch: 0/10 Iteration: 25 Train loss: 0.25753548741340637
Val acc: 0.499
Epoch: 0/10 Iteration: 30 Train loss: 0.2540750503540039
Epoch: 0/10 Iteration: 35 Train loss: 0.25142931938171387
Epoch: 0/10 Iteration: 40 Train loss: 0.2521900534629822
Epoch: 0/10 Iteration: 45 Train loss: 0.25275781750679016
Epoch: 0/10 Iteration: 50 Train loss: 0.2575530409812927
Val acc: 0.501
Epoch: 0/10 Iteration: 55 Train loss: 0.26363039016723633
Epoch: 0/10 Iteration: 60 Train loss: 0.25687193870544434
Epoch: 0/10 Iteration: 65 Train loss: 0.2533154785633087
Epoch: 0/10 Iteration: 70 Train loss: 0.253427118062973
Epoch: 0/10 Iteration: 75 Train loss: 0.2531026601791382
Val acc: 0.500
Epoch: 0/10 Iteration: 80 Train loss: 0.2523159384727478
Epoch: 0/10 Iteration: 85 Train loss: 

KeyboardInterrupt: 