In [1]:
# Import the required packages
import pyprind
from string import punctuation
import re
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Read the Data file
# Here we use the file(movie_data.csv) generated at the 
# beginning of sentimentAnalysis.ipynb

df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [3]:
# Preprocessing the data: Separate all the words & count each word's
# occurrence

# To find unique words in this large data set, using "sets" is not very 
# efficient. Hence, we'll use Counter from collections package.
from collections import Counter

counts = Counter()

# Progress Bar to keep us updated
pbar = pyprind.ProgBar(len(df['review']),
                       title='Counting word occurrences')

for i,review in enumerate(df['review']):
    text = ''.join([c if c not in punctuation else ' '+c+' '
                    for c in review]).lower()
    
    df.loc[i, 'review'] = text
    pbar.update()
    counts.update(text.split())

Counting word occurrences
0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:08:29


In [4]:
# Create a mapping
# Mapping each unique word to a integer
word_counts = sorted(counts, key=counts.get, reverse=True)
word_counts[:5]

['the', '.', ',', 'and', 'a']

In [5]:
word_to_int = {word: i for i, word in enumerate(word_counts, 1)}
mapped_reviews = []

pbar = pyprind.ProgBar(len(df['review']), title='Map reviews to ints')

for review in df['review']:
    mapped_reviews.append([word_to_int[word] for word in review.split()])
    
    pbar.update()

Map reviews to ints
0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:03


In [6]:
mapped_reviews[:3]

[[15,
  5646,
  3,
  1,
  2160,
  3977,
  26959,
  30,
  4824,
  1575,
  29,
  1133,
  7,
  1,
  316,
  19,
  720,
  1612,
  6,
  6653,
  753,
  3,
  14781,
  3,
  8680,
  2,
  33,
  1,
  14782,
  328,
  3,
  3612,
  6,
  2105,
  3,
  67,
  22,
  1947,
  15,
  1,
  8755,
  6,
  54,
  339,
  4,
  54,
  602,
  4712,
  15343,
  2,
  1732,
  19,
  121,
  170,
  323,
  3,
  1,
  565,
  953,
  25748,
  30,
  1464,
  19622,
  29,
  3,
  47,
  10,
  5,
  1121,
  1099,
  1362,
  18,
  58,
  3040,
  15,
  6002,
  25,
  44343,
  15,
  716,
  2,
  1544,
  2,
  5290,
  3192,
  4,
  1670,
  7,
  21518,
  3,
  1103,
  7,
  3921,
  1,
  434,
  26,
  37,
  1956,
  1801,
  2333,
  30,
  3679,
  3294,
  29,
  26,
  1,
  1285,
  6,
  507,
  5,
  286,
  2,
  1,
  5463,
  10558,
  4,
  92,
  34,
  2568,
  106,
  3,
  27,
  26,
  1,
  1457,
  6,
  1,
  5056,
  1362,
  1320,
  8833,
  30,
  623,
  12704,
  29,
  18,
  22,
  15,
  2762,
  6,
  1,
  3488,
  15,
  1,
  1358,
  8,
  21,
  3,
  44,
  1873,
  1,
  

In [7]:
# Defining same length sequences

# Sequence Length is a Hyper-Parameter
sequence_length = 200

# Initializing sequences to zeros of dimension: (#mapped_reviews x sequence_length)
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)

for i, row in enumerate(mapped_reviews):
    review_arr = np.array(row)
    
    # Idea to fill each row from right to left
    sequences[i, -len(row):] = review_arr[-sequence_length:]
    

In [8]:
# Splitting the data-set into train & test
# We simply split it into halves, as the data-set is already shuffled

# Train Set
X_train = sequences[:25000, :]
y_train = df.loc[:25000, 'sentiment'].values

# Test Set
X_test = sequences[25000:, :]
y_test = df.loc[25000:, 'sentiment'].values

In [9]:
np.random.seed(123)

# Function to generate batches
# Using Generators is the most optimal way for splitting the data-set into batches
def create_batch_generator(x, y=None, batch_size=64):
    n_batches = len(x) // batch_size
    x = x[:n_batches*batch_size]
    
    if y is not None:
        y = y[:n_batches*batch_size]
        
    for i in range(0, len(x), batch_size):
        if y is not None:
            yield(x[i:i+batch_size], y[i:i+batch_size])
        else:
            yield(x[i:i+batch_size])
        

In [10]:
# RNN Class to perform sentiment analysis
class SentiRNN(object):
    
    def __init__(self, n_words, seq_len=200, lstm_size=256, num_layers=1,
                 batch_size=64, learning_rate=0.0001, embed_size=200):
        
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size
        
        self.g = tf.Graph()
        
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()
            
            
    def build(self):
        
        # Placeholders for input data
        tf_x = tf.placeholder(tf.int32, shape=(self.batch_size, self.seq_len), name='tf_x')
        tf_y = tf.placeholder(tf.float32, shape=(self.batch_size), name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32, name='tf_keepprob')
        
        # Embedding is a feature-learning technique that we utilize to learn the salient 
        # features to represent the words in our data-set. This transformation is necessary
        # as RNN's require their inputs to be vectors of continous values
        
        # We'll use tf.nn.embedding_lookup that maps each integer that corresponds
        # to a unique word, to a row of the trainable matrix
        
        # Embedding layer to transform the input
        embedding = tf.Variable(
                    tf.random_uniform((self.n_words, self.embed_size), minval=-1, maxval=1),
                                      name='embedding')
        
        embed_x = tf.nn.embedding_lookup(embedding, tf_x, name='embed_x')
        
        # Creating the RNN cell, applying the dropout i.e. Stacking RNN cells
        # with dropout
        
        cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(
                                              tf.contrib.rnn.BasicLSTMCell(self.lstm_size),
                                              output_keep_prob=tf_keepprob)
                                              for i in range(self.num_layers)])
        
        
        # Initial State of RNN
        self.initial_state = cells.zero_state(self.batch_size, tf.float32)
        
        print(' <<< Initial State >>>', self.initial_state)
        
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(cells, embed_x,
                                                           initial_state=self.initial_state)
        
        # lstm_outputs shape:
        # [batch_size, max_time, cells.output_size]
        print('\n <<< lstm_output >>>', lstm_outputs)
        print('\n <<< final state >>>', self.final_state)
        
        logits = tf.layers.dense(inputs=lstm_outputs[:, -1],
                                 units=1, activation=None, name='logits')
        
        logits = tf.squeeze(logits, name='logits_squeezed')
        print('\n <<< logits >>> ', logits)
        
        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        
        predictions = {
                        'probabilities': y_proba,
                        'labels': tf.cast(tf.round(y_proba), tf.int32,
                                          name='labels')
        }
        
        print('\n <<< predictions >>>', predictions)
        
        # Cost Function
        cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y, 
                                                                     logits=logits,
                                                                     name='cost'))
        
        # Optimizer
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')
    
    # Training Method
    def train(self, X_train, y_train, num_epochs):
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            
            for epoch in range(num_epochs):
                state = sess.run(self.initial_state)
                
                for batch_x, batch_y in create_batch_generator(
                                        X_train, y_train, self.batch_size):
                    
                    feed = {'tf_x:0': batch_x,
                            'tf_y:0': batch_y,
                            'tf_keepprob:0': 0.5,
                            self.initial_state: state}
                    
                    loss, _, state = sess.run(['cost:0', 'train_op',
                                              self.final_state],
                                              feed_dict=feed)
                    
                    if iteration % 5 == 0:
                        print("Epoch: %d/%d Iteration: %d"
                              "| Train Loss: %.5f" % (epoch+1, num_epochs, iteration, loss))
                        
                if((epoch+1)%10 == 0):
                    # Saving our model
                    self.saver.save(sess, "model/sentiment-%d.ckpt" % epoch)
            
    
    # Prediction Method
    def predict(self, X_data, return_proba=False):
        preds = []
        
        with tf.Session(graph=self.g) as sess:
            # Restoring our model
            self.saver.restore(sess, tf.train.latest_checkpoint('./model/'))
            
            test_state = sess.run(self.initial_state)
            
            for i, batch_x in enumerate(
                        create_batch_generator(X_data, None, batch_size=self.batch_size)):
                
                feed = {'tf_x:0': batch_x,
                        'tf_keepprob:0' : 1.0,
                        self.initial_state : test_state}
                
                if(return_proba):
                    pred, test_state = sess.run(['probabilities:0', self.final_state], 
                                                feed_dict=feed)
                    
                else:
                    pred, test_state = sess.run(['labels:0', self.final_state ],
                                                feed_dict=feed)
                    
                preds.append(pred)    
        
        return(np.concatenate(preds))

In [11]:
n_words = max(list(word_to_int.values())) + 1

# Because of small size of our data-set, we set num_layers=1

# Single layer RNN may generalise better to unseen data, as it 
# is less likely to overfit
rnn = SentiRNN(n_words=n_words,
               seq_len=sequence_length,
               embed_size=256,
               lstm_size=128,
               num_layers=1,
               batch_size=100,
               learning_rate=0.001
              )

 <<< Initial State >>> (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)

 <<< lstm_output >>> Tensor("rnn/transpose_1:0", shape=(100, 200, 128), dtype=float32)

 <<< final state >>> (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(100, 128) dtype=float32>),)

 <<< logits >>>  Tensor("logits_squeezed:0", shape=(100,), dtype=float32)

 <<< predictions >>> {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>, 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}


In [12]:
# Training our RNN

rnn.train(X_train, y_train, num_epochs=15)

In [13]:
preds = rnn.predict(X_test)

y_true = y_test[:len(preds)]

# Test Accuracy
print('Test Acc.: %.3f' % (np.sum(preds == y_true) / len(y_true)))

INFO:tensorflow:Restoring parameters from ./model/sentiment-9.ckpt
Test Acc.: 0.840


In [14]:
# Alternatively, we can use this for calculating accuracy
proba = rnn.predict(X_test, return_proba= True)

y_true = y_test[:len(preds)]

# Test Accuracy
print('Test Acc.: %.3f' % (np.sum(preds == y_true) / len(y_true)))

INFO:tensorflow:Restoring parameters from ./model/sentiment-9.ckpt
Test Acc.: 0.840
