<a href="https://colab.research.google.com/github/yassine-fetoui/NLP/blob/main/Sentiment_Analysis_RNN_Graph_V1/Sentiment_Analysis_RNN_Graph_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from collections import Counter

In [None]:
# Read the raw review text and the matching sentiment labels from Google
# Drive; each file is loaded whole into a single string.
with open('/content/drive/MyDrive/reviews.txt', 'r') as reviews_file:
    reviews = reviews_file.read()

with open('/content/drive/MyDrive/labels.txt', 'r') as labels_file:
    labels = labels_file.read()

In [None]:
# Sanity check: f.read() should have returned the whole file as one str.
type(reviews)

In [None]:
# Count newline characters in the first 2000 characters; '\n' is the
# separator the reviews are split on further down.
print(reviews[:2000].count('\n'))

In [None]:
# Eyeball the first 2000 characters of the raw text.
print(reviews[:2000])

In [None]:
from string import punctuation

# Delete every punctuation character in one pass, then cut the text into
# individual reviews (one per line).
all_text = reviews.translate(str.maketrans('', '', punctuation))
reviews = all_text.split('\n')

# Re-join the reviews with spaces so the corpus can be tokenised on
# whitespace; `words` holds the distinct tokens only.
all_text = ' '.join(reviews)
words = set(all_text.split())

In [None]:
# Print every distinct word, one per line. `words` is a set, so the order
# is arbitrary. NOTE(review): presumably very long output on the full corpus.
for i in words:
  print(i)


In [None]:
# Count every occurrence of every word in the corpus. `words` is a set of
# distinct tokens, so Counter(words) would give each word a count of 1 and
# make the frequency sort below meaningless.
counts = Counter(all_text.split())

# Vocabulary ordered from most to least frequent word.
vocab = sorted(counts, key=counts.get, reverse=True)

# Map each word to an integer id. Ids start at 1 so that 0 stays free for
# the zero padding used in the feature matrix.
vocab_to_int = {word: idx for idx, word in enumerate(vocab, 1)}

In [None]:
# Spot-check the integer id assigned to one word (KeyError if it is not in
# the vocabulary).
print(vocab_to_int['rockwell'])

In [None]:
# Counter.get requires a key; calling it with no argument raises TypeError.
# Look up how often a single word was counted (None if it never occurs).
print(counts.get('bromwell'))

# Note on `key=counts.get` in the vocabulary cell: it is the key function
# handed to sorted(). For each word, sorted() calls counts.get(word) and
# orders the words by the returned count (their frequency).

In [None]:
# Show the first 20 reviews after punctuation removal and line splitting.
reviews[:20]

In [None]:
# Encode every review as the list of integer ids of its
# whitespace-separated words.
reviews_ints = [
    [vocab_to_int[token] for token in text.split()]
    for text in reviews
]


In [None]:

# Integer id of the first word of the first review.
vocab_to_int[reviews[0].split()[0]]

In [None]:
# The first word of the first review, as text.
reviews[0].split()[0]

In [None]:
# Look the same kind of id up directly by word (KeyError if the word is
# not in the vocabulary).
vocab_to_int["bromwell"]

In [None]:
# The first review as a list of integer word ids.
reviews_ints[0]

## **Encoding the Labels**

In [None]:
# One label per line; 'positive' becomes 1 and every other value
# (including an empty trailing line) becomes 0.
labels_split = labels.split('\n')
encoded_labels = (np.array(labels_split) == 'positive').astype(int)

In [None]:
# Display the 0/1 label array.
encoded_labels

In [None]:
# Histogram of review lengths: key = number of tokens, value = how many
# reviews have that length.
review_len = Counter(map(len, reviews_ints))
print("Zero-length reviews: {}".format(review_len[0]))
# max() over a Counter iterates its keys, i.e. the lengths themselves.
print("Maximum review length: {}".format(max(review_len)))

In [None]:
# Drop zero-length reviews *and* the labels at the same positions, so that
# row i of the features still corresponds to label i afterwards.
keep_idx = [idx for idx, review_ints in enumerate(reviews_ints)
            if len(review_ints) > 0]

# Only filter the labels while they still line up one-to-one with the
# reviews. If the lengths differ the labels are left untouched; once both
# have been filtered, re-running this cell is a no-op.
if len(encoded_labels) == len(reviews_ints):
    encoded_labels = encoded_labels[keep_idx]

reviews_ints = [reviews_ints[idx] for idx in keep_idx]

In [None]:
# Fixed number of tokens kept per review.
seq_len = 200
# One row per review, pre-filled with 0 so unfilled positions act as padding.
features = np.zeros((len(reviews_ints), seq_len), dtype=int)

In [None]:
# Expect (number of reviews, seq_len).
features.shape

In [None]:
# Number of tokens in the third review before any truncation.
len(reviews_ints[2])

In [None]:
# Same review after clipping to at most seq_len tokens.
len(reviews_ints[2][:seq_len])

In [None]:
# Copy each encoded review into its row of `features`. Short reviews are
# right-aligned (leaving zeros on the left); long ones keep only their
# first seq_len tokens.
for row_idx, tokens in enumerate(reviews_ints):
    clipped = np.array(tokens)[:seq_len]
    # A negative start larger than the row width is clamped by NumPy to the
    # whole row, which matches the clipped length for long reviews.
    features[row_idx, -len(tokens):] = clipped

In [None]:
# Every row has exactly seq_len columns.
len(features[0,:] )

# **Training, Validation, Test**

In [None]:
# Split data into training, validation, and test sets (features and labels,
# x and y).
split_frac = 0.8
split_idx = int(len(features) * split_frac)
train_x, val_x = features[:split_idx], features[split_idx:]
# Slice the encoded 0/1 label array. `labels` is the raw text of the labels
# file, so slicing it would yield characters rather than labels.
train_y, val_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

# Halve the held-out part into validation and test sets.
test_idx = int(len(val_x) * 0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

In [None]:
# Number of units in each LSTM cell.
lstm_size = 256
# Number of stacked LSTM layers.
lstm_layers = 1
# Reviews per batch; the RNN's zero initial state is built for this size.
batch_size = 500
# Learning rate passed to the Adam optimizer.
learning_rate = 0.001

In [None]:
# Number of rows in the embedding table. Word ids run from 1 to len(vocab)
# (vocab_to_int enumerates from 1) and 0 is the padding value in `features`,
# so one extra row is needed or the highest id falls outside the table.
n_words = len(vocab) + 1

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    # Batch of encoded reviews; both dimensions are left unspecified.
    inputs_ = tf.compat.v1.placeholder(tf.int32, [None, None], name='inputs')
    # Matching 0/1 labels, fed as a column (the training loop passes y[:, None]).
    labels_ = tf.compat.v1.placeholder(tf.int32, [None, None], name='labels')
    # Dropout keep probability, fed per run (0.5 for training, 1 for validation).
    keep_prob = tf.compat.v1.placeholder(tf.float32, name='keep_prob')

In [None]:
# The placeholder was declared with [None, None], so both dimensions are unknown.
inputs_.shape

In [None]:
# Dimensionality of each word's embedding vector.
embed_size = 300

with graph.as_default():
    # Trainable lookup table with one row per word id, initialised uniformly
    # in [-1, 1).
    embedding = tf.Variable(
        tf.random.uniform(shape=(n_words, embed_size), minval=-1, maxval=1)
    )
    # Replace every integer id in the input batch by its embedding row.
    embed = tf.nn.embedding_lookup(params=embedding, ids=inputs_)

In [None]:
with graph.as_default():
    # Build one independent LSTM + dropout cell per layer. Repeating a single
    # cell object with `[drop] * lstm_layers` would hand the same instance to
    # every layer, which only works while lstm_layers == 1.
    layer_cells = []
    for _ in range(lstm_layers):
        # Your basic LSTM cell
        lstm = tf.compat.v1.nn.rnn_cell.BasicLSTMCell(lstm_size)

        # Add dropout to the cell's outputs
        drop = tf.compat.v1.nn.rnn_cell.DropoutWrapper(
            lstm, output_keep_prob=keep_prob)
        layer_cells.append(drop)

    # Stack up multiple LSTM layers, for deep learning
    cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(layer_cells)

    # Getting an initial state of all zeros, sized for a full batch
    initial_state = cell.zero_state(batch_size, tf.float32)

In [None]:
with graph.as_default():
    # Unroll the stacked cell over the embedded sequence, starting from the
    # zero state; yields the per-step outputs and the state after the last step.
    outputs, final_state = tf.compat.v1.nn.dynamic_rnn(
        cell=cell,
        inputs=embed,
        initial_state=initial_state,
    )

In [None]:
with graph.as_default():
    # One sigmoid unit on the last time step's output: a single value in
    # (0, 1) per review.
    predictions = tf.keras.layers.Dense(1, activation='sigmoid')(outputs[:, -1])

    # Use the TF1-style loss, which reduces to a single scalar for the batch.
    # Under TF2, `tf.losses.mean_squared_error` is the Keras function, which
    # returns one value per sample and breaks the "{:.3f}" loss logging in the
    # training loop.
    cost = tf.compat.v1.losses.mean_squared_error(labels_, predictions)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    # `embedding` is created with the default trainable=True, so it is already
    # part of trainable_variables(); listing it a second time would put it in
    # var_list twice.
    train_step = optimizer.minimize(
        cost, var_list=tf.compat.v1.trainable_variables())



In [None]:
import os

epochs = 10


def get_batches(x, y, batch_size=100):
    """Yield successive (x, y) mini-batches of exactly ``batch_size`` rows.

    Trailing rows that do not fill a whole batch are dropped, because the
    graph's initial state is built for a fixed batch size.
    """
    n_batches = len(x) // batch_size
    x, y = x[:n_batches * batch_size], y[:n_batches * batch_size]
    for start in range(0, len(x), batch_size):
        yield x[start:start + batch_size], y[start:start + batch_size]


with graph.as_default():
    # Fraction of reviews whose rounded prediction equals the label; used for
    # the validation report in the loop below.
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    saver = tf.compat.v1.train.Saver()

with tf.compat.v1.Session(graph=graph) as sess:
    # The initializer lives under tf.compat.v1 in TensorFlow 2.
    sess.run(tf.compat.v1.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # Start every epoch from the all-zero LSTM state.
        state = sess.run(initial_state)

        for x, y in get_batches(train_x, train_y, batch_size):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            # Run the training op returned by minimize(); the optimizer object
            # itself is not something a session can execute.
            loss, state, _ = sess.run([cost, final_state, train_step],
                                      feed_dict=feed)

            if iteration % 5 == 0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration % 25 == 0:
                val_acc = []
                # Reuse the existing zero-state tensors instead of calling
                # cell.zero_state() again, which would add new ops to the
                # graph at every validation pass.
                val_state = sess.run(initial_state)
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state],
                                                    feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration += 1

    # Saver.save does not create the target directory itself.
    os.makedirs("checkpoints", exist_ok=True)
    saver.save(sess, "checkpoints/sentiment.ckpt")