Heavily inspired by: https://towardsdatascience.com/sentiment-analysis-using-rnns-lstm-60871fa6aeba

0. Have Tensorflow version 1:

In [1]:
!pip install tensorflow==1.14

Collecting tensorflow==1.14
[?25l  Downloading https://files.pythonhosted.org/packages/de/f0/96fb2e0412ae9692dbf400e5b04432885f677ad6241c088ccc5fe7724d69/tensorflow-1.14.0-cp36-cp36m-manylinux1_x86_64.whl (109.2MB)
[K     |████████████████████████████████| 109.2MB 74kB/s 
Collecting tensorboard<1.15.0,>=1.14.0
[?25l  Downloading https://files.pythonhosted.org/packages/91/2d/2ed263449a078cd9c8a9ba50ebd50123adf1f8cfbea1492f9084169b89d9/tensorboard-1.14.0-py3-none-any.whl (3.1MB)
[K     |████████████████████████████████| 3.2MB 45.0MB/s 
Collecting tensorflow-estimator<1.15.0rc0,>=1.14.0rc0
[?25l  Downloading https://files.pythonhosted.org/packages/3c/d5/21860a5b11caf0678fbc8319341b0ae21a07156911132e0e71bffed0510d/tensorflow_estimator-1.14.0-py2.py3-none-any.whl (488kB)
[K     |████████████████████████████████| 491kB 38.2MB/s 
Installing collected packages: tensorboard, tensorflow-estimator, tensorflow
  Found existing installation: tensorboard 2.2.1
    Uninstalling tensorboard-2.2.

1. Clone cleaned Indonesian tweets:

In [2]:
!git clone https://github.com/ridife/dataset-idsa.git

Cloning into 'dataset-idsa'...
remote: Enumerating objects: 6, done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 6[K
Unpacking objects: 100% (6/6), done.


2. Import packages, load data, and lightly process:

In [3]:
import numpy as np
import tensorflow as tf
from string import punctuation
from collections import Counter

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [0]:
tweets = []
sentiments = []
with open('/content/dataset-idsa/Indonesian Sentiment Twitter Dataset Labeled.csv', 'r', encoding='utf-8') as inf, open('poli_data_format.csv', 'r', encoding='utf-8') as inf2:
  for f in [inf, inf2]:
    is_header = True
    for tweet in f.readlines():
      if is_header: # skip first line
        is_header = False
        continue
      if tweet.startswith('0'):
        continue # ignore neutral tweets
      elif tweet.startswith('-1'):
        sentiments.append(tweet[:2])
        tweet = tweet[3:-1]
      else:
        sentiments.append(tweet[:1])
        tweet = tweet[2:-1]
      tweet = ''.join([char for char in tweet if char not in punctuation])
      tweets.append(tweet)

In [0]:
labels = np.array([1 if sentiment == '1' else 0 for sentiment in sentiments])

3. Get vocabulary of tweets, mapping words to integers, so that we may convert tweets into integers to be passed onto the network:

In [137]:
plain_text = ' '.join([tweet for tweet in tweets])
words = plain_text.split()

counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}

tweets_ints = []
for tweet in tweets:
  tweets_ints.append([vocab_to_int[word] for word in tweet.split()])

tweets_lens = Counter([len(x) for x in tweets_ints])
tweets_ints = [tweet[:max(tweets_lens)] for tweet in tweets_ints]
print("Maximum tweet length: {}".format(max(tweets_lens)))

Maximum tweet length: 28


4. Make array with padding of zeros, with each array the size of the longest tweet:

In [138]:
seq_len = 19
features = np.zeros((len(tweets_ints), seq_len), dtype=int)
for i, row in enumerate(tweets_ints):
  features[i, -len(row):] = np.array(row)[:seq_len]
features[:3,:seq_len]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         643, 1013,   15,    3,    1, 1793,   17, 5840],
       [   0,    0,    0,    4, 3686,  103,  243,   11,  690,   20,    5,
           1, 1794,   25,   53, 1552,   60, 5841,   49],
       [ 236,  487,    7,  399, 2157, 1111, 2687, 2688, 2689, 2690, 1217,
        1014, 2691,   11,  743, 2158,  148,  146,    2]])

In [139]:
print("Number of features: \t{}".format(len(features)),
      "\nSample of feature: \t{}".format(features[0]))

Number of features: 	5705 
Sample of feature: 	[   0    0    0    0    0    0    0    0    0    0    0  643 1013   15
    3    1 1793   17 5840]


6. Create the training/validation/testing sets:

In [150]:
split_frac = 0.9

split_index = int(split_frac * len(features))

train_x, val_x = features[:split_index], features[split_index:] 
train_y, val_y = labels[:split_index], labels[split_index:]

split_frac = 0.5
split_index = int(split_frac * len(val_x))

val_x, test_x = val_x[:split_index], val_x[split_index:]
val_y, test_y = val_y[:split_index], val_y[split_index:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(5134, 19) 
Validation set: 	(285, 19) 
Test set: 		(286, 19)


7. Define the constants for the training process:

In [0]:
lstm_size = 256       # number of units in hidden layers in the LSTM cells
lstm_layers = 2       # number of LSTM layers
batch_size = 64       # number of tweets to give network in one training batch
learning_rate = 0.01

In [0]:
n_words = len(vocab_to_int) + 1 # add 1 for 0 added to vocab

graph = tf.Graph()
# add nodes
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

8. Add embedding layer (letting network learn the weights):

In [0]:
embed_size = 300     # number of units in embedding layer

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

9. Set up the basics of the LSTM cells and RNN:

In [0]:
with graph.as_default():
    with tf.name_scope("RNN_layers"):
        def lstm_cell():
            lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
            return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob) # add dropout to the cell

        # create the multiple LSTM layer stack, for deep learning
        cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])

        # assign initial state of all zeros
        initial_state = cell.zero_state(batch_size, tf.float32)

10. RNN forward pass from initial state; returns output from each time step and hidden layer's final state:

In [155]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
                                             initial_state=initial_state)



11. Use the final output as the prediction for the sentiment value:

In [156]:
with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)



12. Create nodes to validate the accuracy of the batch and obtain confusion matrix of individual batches:

In [0]:
with graph.as_default():
    correct_prediction = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    #correct_prediction = tf.cast(correct_pred, dtype=tf.bool)
    false_prediction = tf.logical_not(correct_prediction)

    is_label_one = tf.cast(labels_, dtype=tf.bool) #tf.ones([1], tf.int32)
    is_label_zero = tf.logical_not(is_label_one)

    true_positives = tf.reduce_sum(tf.to_int32(tf.logical_and(correct_prediction,is_label_one)))
    false_positives = tf.reduce_sum(tf.to_int32(tf.logical_and(false_prediction, is_label_zero)))
    true_negatives = tf.reduce_sum(tf.to_int32(tf.logical_and(correct_prediction, is_label_zero)))
    false_negatives = tf.reduce_sum(tf.to_int32(tf.logical_and(false_prediction, is_label_one)))

    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

13. Train the model in batches:

In [0]:
def get_batches(x, y, batch_size=64):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for i in range(0, len(x), batch_size):
        yield x[i:i+batch_size], y[i:i+batch_size]

In [159]:
epochs = 5

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(1, epochs+1):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
            saver.save(sess, "checkpoints/sentiment.ckpt")
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 1/5 Iteration: 5 Train loss: 0.341
Epoch: 1/5 Iteration: 10 Train loss: 0.262
Epoch: 1/5 Iteration: 15 Train loss: 0.248
Epoch: 1/5 Iteration: 20 Train loss: 0.334
Epoch: 1/5 Iteration: 25 Train loss: 0.239
Val acc: 0.480
Epoch: 1/5 Iteration: 30 Train loss: 0.279
Epoch: 1/5 Iteration: 35 Train loss: 0.264
Epoch: 1/5 Iteration: 40 Train loss: 0.236
Epoch: 1/5 Iteration: 45 Train loss: 0.252
Epoch: 1/5 Iteration: 50 Train loss: 0.255
Val acc: 0.469
Epoch: 1/5 Iteration: 55 Train loss: 0.246
Epoch: 1/5 Iteration: 60 Train loss: 0.277
Epoch: 1/5 Iteration: 65 Train loss: 0.276
Epoch: 1/5 Iteration: 70 Train loss: 0.240
Epoch: 1/5 Iteration: 75 Train loss: 0.239
Val acc: 0.477
Epoch: 1/5 Iteration: 80 Train loss: 0.245
Epoch: 2/5 Iteration: 85 Train loss: 0.250
Epoch: 2/5 Iteration: 90 Train loss: 0.252
Epoch: 2/5 Iteration: 95 Train loss: 0.223
Epoch: 2/5 Iteration: 100 Train loss: 0.260
Val acc: 0.535
Epoch: 2/5 Iteration: 105 Train loss: 0.225
Epoch: 2/5 Iteration: 110 Train loss

14. Test the model, listing all predictions and then obtaining the average:

In [160]:
test_acc = []
test_prec = []
test_rec = []

with tf.Session(graph=graph) as sess:
    saver.restore(sess, "checkpoints/sentiment.ckpt")
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, batch_prec, batch_rec = sess.run([accuracy, precision, recall], feed_dict=feed)

        test_acc.append(batch_acc)
        test_prec.append(batch_prec)
        test_rec.append(batch_rec)

INFO:tensorflow:Restoring parameters from checkpoints/sentiment.ckpt


In [161]:
prec = np.mean(test_prec)
rec = np.mean(test_rec)
acc = np.mean(test_acc)
print("Test precision: {:.3f}".format(prec))
print("Test recall: {:.3f}".format(rec))
print("Test accuracy: {:.3f}".format(acc))
print("Test F1 score: {:.3f}".format(2*((prec*rec) / (prec+rec))))

Test precision: 0.739
Test recall: 0.746
Test accuracy: 0.656
Test F1 score: 0.743
