<a href="https://colab.research.google.com/github/waqqasansari/Natural_Language_Processing/blob/master/SentimentAnsalysis_Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U trax

In [None]:
import string
import re
import os
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import trax
import os 
import random as rnd
import trax.fastmath.numpy as np
from trax import layers as tl

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 


In [None]:
def load_tweets():
  """Read the labelled tweets from NLTK's ``twitter_samples`` corpus.

  Returns:
    A pair ``(positive, negative)``: the results of
    ``twitter_samples.strings`` for 'positive_tweets.json' and
    'negative_tweets.json', in that order.
  """
  positive, negative = (
      twitter_samples.strings(file_name)
      for file_name in ('positive_tweets.json', 'negative_tweets.json'))
  return positive, negative

In [None]:
def process_tweet(tweet):
  """Clean, tokenize and stem a single tweet.

  Args:
    tweet: the raw tweet text.

  Returns:
    A list of stemmed tokens, with English stopwords and tokens found in
    ``string.punctuation`` removed.
  """
  # Applied in order: stock tickers ($GE), a leading "RT" retweet marker,
  # hyperlinks, and the '#' character (the hashtag word itself is kept).
  for pattern in (r'\$\w*', r'^RT[\s]+', r'https?:\/\/.*[\r\n]*', r'#'):
    tweet = re.sub(pattern, '', tweet)

  # Lower-case, drop @handles and shorten long character repetitions.
  tokens = TweetTokenizer(preserve_case=False,
                          strip_handles=True,
                          reduce_len=True).tokenize(tweet)

  # `in string.punctuation` is a substring test against the punctuation string.
  return [
      stemmer.stem(token)
      for token in tokens
      if not (token in stopwords_english or token in string.punctuation)
  ]

In [None]:
# Load both classes of tweets and report how many of each there are.
all_positive_tweets, all_negative_tweets = load_tweets()
print(f"The number of positive tweets: {len(all_positive_tweets)}")
print(f"The number of negative tweets: {len(all_negative_tweets)}")

# The first 4000 tweets of each class are for training; the rest validate.
train_pos, val_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, val_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Positives are placed before negatives so the label vectors below line up.
train_x = train_pos + train_neg
val_x = val_pos + val_neg

# Labels: a 1 per positive tweet followed by a 0 per negative tweet.
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
val_y = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))

print(f"length of train_x {len(train_x)}")
print(f"length of val_x {len(val_x)}")

The number of positive tweets: 5000
The number of negative tweets: 5000
length of train_x 8000
length of val_x 2000


In [None]:
# Three special tokens get the first ids; '__PAD__' must stay 0 because the
# batch generator pads with literal zeros.
Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}

# Assign each new stemmed training token the next free id, in first-seen order.
for tweet in train_x:
  processed_tweet = process_tweet(tweet)
  for word in processed_tweet:
    # len(Vocab) is evaluated before any insertion, so ids stay consecutive;
    # setdefault leaves words that are already present untouched.
    Vocab.setdefault(word, len(Vocab))

print("Total words in vocab are",len(Vocab))
display(Vocab)

Total words in vocab are 9092


{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'followfriday': 3,
 'top': 4,
 'engag': 5,
 'member': 6,
 'commun': 7,
 'week': 8,
 ':)': 9,
 'hey': 10,
 'jame': 11,
 'odd': 12,
 ':/': 13,
 'pleas': 14,
 'call': 15,
 'contact': 16,
 'centr': 17,
 '02392441234': 18,
 'abl': 19,
 'assist': 20,
 'mani': 21,
 'thank': 22,
 'listen': 23,
 'last': 24,
 'night': 25,
 'bleed': 26,
 'amaz': 27,
 'track': 28,
 'scotland': 29,
 'congrat': 30,
 'yeaaah': 31,
 'yipppi': 32,
 'accnt': 33,
 'verifi': 34,
 'rqst': 35,
 'succeed': 36,
 'got': 37,
 'blue': 38,
 'tick': 39,
 'mark': 40,
 'fb': 41,
 'profil': 42,
 '15': 43,
 'day': 44,
 'one': 45,
 'irresist': 46,
 'flipkartfashionfriday': 47,
 'like': 48,
 'keep': 49,
 'love': 50,
 'custom': 51,
 'wait': 52,
 'long': 53,
 'hope': 54,
 'enjoy': 55,
 'happi': 56,
 'friday': 57,
 'lwwf': 58,
 'second': 59,
 'thought': 60,
 '’': 61,
 'enough': 62,
 'time': 63,
 'dd': 64,
 'new': 65,
 'short': 66,
 'enter': 67,
 'system': 68,
 'sheep': 69,
 'must': 70,
 'buy':

In [None]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
  """Convert a raw tweet into a list of integer word ids.

  The tweet is first run through ``process_tweet``; each resulting token is
  then looked up in ``vocab_dict``.

  Args:
    tweet: the raw tweet text.
    vocab_dict: mapping from token to integer id. Must contain ``unk_token``.
    unk_token: key whose id stands in for tokens missing from ``vocab_dict``.
    verbose: if True, print the processed tokens and the unknown-token id.

  Returns:
    A list with one id per processed token, in token order.

  Raises:
    KeyError: if ``unk_token`` is not a key of ``vocab_dict``.
  """
  word_l = process_tweet(tweet)

  if verbose:
    print("List of words from the processed tweet:")
    # Fixed: this previously printed the undefined name `world_l`, so any
    # call with verbose=True raised NameError.
    print(word_l)

  unk_ID = vocab_dict[unk_token]

  if verbose:
    print(f"The unique integer ID for the unk_token is {unk_ID}")

  # Out-of-vocabulary tokens fall back to the unknown-token id.
  return [vocab_dict.get(word, unk_ID) for word in word_l]

In [None]:
# Show the first validation tweet next to its encoding under the training
# vocabulary; tokens that are not keys of Vocab are encoded as the '__UNK__' id.
print("Actual tweet is\n", val_pos[0])
print("\nTensor of tweet:\n", tweet_to_tensor(val_pos[0], vocab_dict=Vocab))

Actual tweet is
 Bro:U wan cut hair anot,ur hair long Liao bo
Me:since ord liao,take it easy lor treat as save $ leave it longer :)
Bro:LOL Sibei xialan

Tensor of tweet:
 [1065, 136, 479, 2351, 745, 8146, 1123, 745, 53, 2, 2672, 791, 2, 2, 349, 601, 2, 3489, 1017, 597, 4559, 9, 1065, 157, 2, 2]


In [None]:
def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
  """Yield balanced, zero-padded batches of encoded tweets.

  Every batch holds ``batch_size // 2`` tweets from ``data_pos`` followed by
  the same number from ``data_neg``. Each tweet is encoded with
  ``tweet_to_tensor`` and right-padded with 0 up to the length of the longest
  tweet in the batch.

  Args:
    data_pos: indexable collection of positive tweets.
    data_neg: indexable collection of negative tweets.
    batch_size: number of tweets per batch; must be even.
    loop: if True, wrap around to the start of a class (reshuffling it when
      ``shuffle`` is set) whenever that class runs out, so the generator never
      terminates. If False, stop as soon as either class cannot fill its half
      of a batch; the partially built batch is discarded.
    vocab_dict: token -> id mapping handed to ``tweet_to_tensor``.
    shuffle: if True, visit each class in an order shuffled with ``rnd``.

  Yields:
    ``(inputs, targets, example_weights)``: ``inputs`` is the array of padded
    id lists, ``targets`` holds 1 for the positive half and 0 for the negative
    half, and ``example_weights`` is an array of ones shaped like ``targets``.
  """
  # make sure the batch size is an even number
  assert batch_size % 2 == 0

  # Number of positive examples in each batch is half of the batch size
  # same with number of negative examples in each batch
  n_to_take = batch_size // 2

  # Cursors into the (possibly shuffled) index lists of each class.
  pos_index = 0
  neg_index = 0

  len_data_pos = len(data_pos)
  len_data_neg = len(data_neg)

  # Tweets are reached through these index lists so that shuffling never
  # reorders the caller's data.
  pos_index_lines = list(range(len_data_pos))
  neg_index_lines = list(range(len_data_neg))

  if shuffle:
    rnd.shuffle(pos_index_lines)
    rnd.shuffle(neg_index_lines)

  stop = False

  while not stop:

    batch = []

    # First half of the batch: positive tweets.
    for i in range(n_to_take):
      if pos_index >= len_data_pos:

        if not loop:
          stop = True
          break

        # Wrap around and, if requested, draw a fresh order for the next pass.
        pos_index = 0

        if shuffle:
          rnd.shuffle(pos_index_lines)

      tweet = data_pos[pos_index_lines[pos_index]]
      tensor = tweet_to_tensor(tweet, vocab_dict)
      batch.append(tensor)
      pos_index = pos_index + 1

    # The positive data ran out in non-looping mode: nothing more to yield,
    # so skip encoding negatives for a batch that would be thrown away.
    if stop:
      break

    # Second half of the batch: negative tweets.
    for i in range(n_to_take):
      if neg_index >= len_data_neg:
        if not loop:
          stop = True
          break

        neg_index = 0
        if shuffle:
          rnd.shuffle(neg_index_lines)

      tweet = data_neg[neg_index_lines[neg_index]]
      tensor = tweet_to_tensor(tweet, vocab_dict)
      batch.append(tensor)
      neg_index = neg_index + 1

    if stop:
      break

    # Fixed: the cursors are already advanced once per tweet inside the loops
    # above. They used to be bumped by n_to_take again here, which skipped
    # every second half-batch and left half of the data unused on each pass.

    max_len = max([len(t) for t in batch])

    tensor_pad_l = []

    for tensor in batch:
      # Right-pad with 0 so every row has the same length.
      n_pad = max_len - len(tensor)
      pad_l = [0] * n_pad
      tensor_pad = tensor + pad_l
      tensor_pad_l.append(tensor_pad)

    inputs = np.array(tensor_pad_l)
    target_pos = [1] * (n_to_take)
    target_neg = [0] * (n_to_take)
    target_l = target_pos + target_neg
    targets = np.array(target_l)
    # Every example carries the same weight.
    example_weights = np.ones_like(targets)

    yield inputs, targets, example_weights

In [None]:
# Seed Python's random module, which data_generator uses to shuffle tweet order.
rnd.seed(30) 

def train_generator(batch_size, shuffle = False):
    """Return a never-ending batch generator over the training tweets.

    Delegates to ``data_generator`` with ``train_pos``/``train_neg``, the
    global ``Vocab`` and ``loop=True``.
    """
    return data_generator(
        data_pos=train_pos,
        data_neg=train_neg,
        batch_size=batch_size,
        loop=True,
        vocab_dict=Vocab,
        shuffle=shuffle)


def val_generator(batch_size, shuffle = False):
    """Return a never-ending batch generator over the validation tweets.

    Delegates to ``data_generator`` with ``val_pos``/``val_neg``, the global
    ``Vocab`` and ``loop=True``.
    """
    return data_generator(
        data_pos=val_pos,
        data_neg=val_neg,
        batch_size=batch_size,
        loop=True,
        vocab_dict=Vocab,
        shuffle=shuffle)


def test_generator(batch_size, shuffle = False):
    """Return a finite batch generator over the validation tweets.

    Same data as ``val_generator`` but with ``loop=False``, so iteration ends
    once the data can no longer fill a batch.
    """
    return data_generator(
        data_pos=val_pos,
        data_neg=val_neg,
        batch_size=batch_size,
        loop=False,
        vocab_dict=Vocab,
        shuffle=shuffle)

# Get a batch from the train_generator and inspect.
# next() pulls only the first batch: with batch_size=4 that is two positive
# tweets followed by two negative ones, padded to a common length.
inputs, targets, example_weights = next(train_generator(4, shuffle=True))

print(f'Inputs: {inputs}')
print(f'Targets: {targets}')
print(f'Example Weights: {example_weights}')

Inputs: [[2005 4451 3201    9    0    0    0    0    0    0    0]
 [4954  567 2000 1454 5174 3499  141 3499  130  459    9]
 [3761  109  136  583 2930 3969    0    0    0    0    0]
 [ 250 3761    0    0    0    0    0    0    0    0    0]]
Targets: [1 1 0 0]
Example Weights: [1 1 1 1]


In [None]:
class Layer:
  """Minimal base class for the hand-written layers in this notebook.

  Subclasses override ``forward`` and, when they own parameters,
  ``init_weights_and_state``. Calling an instance runs ``forward``.
  """

  def __init__(self):
    # No parameters until a subclass creates them in init_weights_and_state.
    self.weights = None

  def forward(self, x):
    """Compute the layer's output for ``x``; the base class has none."""
    raise NotImplementedError

  def init_weights_and_state(self, input_signature, random_key):
    """Hook for creating parameters; does nothing in the base class."""

  def init(self, input_signature, random_key):
    """Run the initialization hook and return the resulting ``weights``."""
    self.init_weights_and_state(input_signature, random_key)
    return self.weights

  def __call__(self, x):
    """Make the layer callable: ``layer(x)`` is ``layer.forward(x)``."""
    return self.forward(x)

In [None]:
class ReLu(Layer):
  """Rectified linear unit: replaces negative entries with zero."""

  def forward(self, x):
    """Return the element-wise maximum of ``x`` and 0."""
    return np.maximum(x, 0)

In [None]:
from trax import fastmath
np = fastmath.numpy
random = fastmath.random

In [None]:
class Dense(Layer):
  """Bias-free fully connected layer: ``output = x . weights``.

  Args:
    n_units: number of output units (columns of the weight matrix).
    init_stdev: factor applied to the normally distributed initial weights.
  """

  def __init__(self, n_units, init_stdev=0.1):
    # Fixed: run the base initializer so `weights` exists (as None) before
    # init() is called; previously reading it on a fresh layer raised
    # AttributeError.
    super().__init__()
    self._n_units = n_units
    self._init_stdev = init_stdev

  def forward(self, x):
    """Return the dot product of ``x`` with the weight matrix."""
    dense = np.dot(x, self.weights)
    return dense

  def init_weights_and_state(self, input_signature, random_key):
    """Create the weight matrix and store it on the layer.

    Args:
      input_signature: any object with a ``shape`` attribute; only the last
        dimension is used, as the number of input features.
      random_key: key passed to ``trax.fastmath.random.normal``.

    Returns:
      The new weights, of shape ``(input_shape[-1], n_units)``.
    """
    input_shape = input_signature.shape
    w = self._init_stdev * trax.fastmath.random.normal(key=random_key,
                                                      shape = (input_shape[-1], self._n_units))
    self.weights = w
    return self.weights

In [None]:
# Smoke test for the hand-written Dense layer.
dense_layer = Dense(n_units=10)  # 10 output units
random_key = random.get_prng(seed=0)  # fixed seed for the weight initialization
z = np.array([[2.0, 7.0, 25.0]]) # one input row with 3 features

# z doubles as the input signature: init only reads its .shape.
dense_layer.init(z, random_key)
print("Weights are\n ",dense_layer.weights) # the generated (3, 10) weight matrix
print("Foward function output is ", dense_layer(z)) # np.dot(z, weights)

Weights are
  [[-0.02837107  0.09368163 -0.10050073  0.14165013  0.10543301  0.09108127
  -0.04265671  0.0986188  -0.05575324  0.0015325 ]
 [-0.2078568   0.05548371  0.09142365  0.05744596  0.07227863  0.01210618
  -0.03237354  0.16234998  0.02450039 -0.13809781]
 [-0.06111237  0.01403725  0.08410043 -0.10943579 -0.1077502  -0.11396457
  -0.0593338  -0.01557651 -0.03832145 -0.11144515]]
Foward function output is  [[-3.0395489   0.92668045  2.5414748  -2.0504727  -1.9769385  -2.5822086
  -1.7952732   0.94427466 -0.89803994 -3.7497485 ]]


In [None]:
def classifier(vocab_size=len(Vocab), embedding_dim=256, output_dim=2, mode='train'):
  """Build the sentiment model as a serial stack of trax layers.

  The stack is: Embedding -> Mean over axis 1 -> Dense -> LogSoftmax.

  Args:
    vocab_size: vocabulary size for the embedding layer. The default is
      ``len(Vocab)`` evaluated once, when this ``def`` statement runs.
    embedding_dim: ``d_feature`` of the embedding layer.
    output_dim: number of units of the dense output layer.
    mode: accepted but not used by this function.

  Returns:
    The ``tl.Serial`` model.
  """
  return tl.Serial(
      tl.Embedding(vocab_size=vocab_size, d_feature=embedding_dim),
      tl.Mean(axis=1),
      tl.Dense(n_units=output_dim),
      tl.LogSoftmax())

In [None]:
from trax.supervised import training

# Batches contain batch_size // 2 positive and batch_size // 2 negative tweets.
batch_size = 16
# Re-seed so the shuffled order produced by the generators below is repeatable.
rnd.seed(271)

# Training task: shuffled, endlessly looping training batches, cross-entropy
# loss, Adam with 0.01 passed as its first argument, n_steps_per_checkpoint=10.
train_task =  training.TrainTask(
    labeled_data = train_generator(batch_size=batch_size, shuffle=True),
    loss_layer = tl.CrossEntropyLoss(),
    optimizer = trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint = 10)

# Evaluation task: validation batches scored with cross-entropy and accuracy.
eval_task = training.EvalTask(
    labeled_data=val_generator(batch_size=batch_size, shuffle=True),
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()])

# Model built with the default arguments of classifier().
model = classifier()

In [None]:
# Directory later passed to the training loop as output_dir; '~' is expanded
# to the user's home directory.
output_dir_expand = os.path.expanduser('~/output_dir/')
# Shell escape: delete that directory and everything in it before training.
!rm -rf {output_dir_expand}

In [None]:
def train_model(classifier, train_task, eval_task, n_steps, output_dir):
  """Create a trax training loop, run it, and return it.

  Args:
    classifier: the model to train.
    train_task: the ``training.TrainTask`` to optimize.
    eval_task: the ``training.EvalTask``; wrapped in a one-element list.
    n_steps: number of steps passed to ``Loop.run``.
    output_dir: directory handed to the loop as ``output_dir``.

  Returns:
    The ``training.Loop`` object after ``run`` has returned.
  """
  loop = training.Loop(
      classifier,
      train_task,
      eval_tasks=[eval_task],
      output_dir=output_dir)
  loop.run(n_steps=n_steps)
  return loop

In [None]:
# Run 100 training steps; the returned loop is kept so its eval_model can be
# used for testing and prediction afterwards.
training_loop = train_model(model, train_task, eval_task, 100, output_dir_expand)


Step      1: Ran 1 train steps in 2.25 secs
Step      1: train CrossEntropyLoss |  1.26101446
Step      1: eval  CrossEntropyLoss |  0.77992326
Step      1: eval          Accuracy |  0.43750000

Step     10: Ran 9 train steps in 3.78 secs
Step     10: train CrossEntropyLoss |  0.78989428
Step     10: eval  CrossEntropyLoss |  0.70014697
Step     10: eval          Accuracy |  0.50000000

Step     20: Ran 10 train steps in 2.23 secs
Step     20: train CrossEntropyLoss |  0.46327835
Step     20: eval  CrossEntropyLoss |  0.36245325
Step     20: eval          Accuracy |  0.87500000

Step     30: Ran 10 train steps in 1.76 secs
Step     30: train CrossEntropyLoss |  0.30180237
Step     30: eval  CrossEntropyLoss |  0.23907390
Step     30: eval          Accuracy |  1.00000000

Step     40: Ran 10 train steps in 1.18 secs
Step     40: train CrossEntropyLoss |  0.26077047
Step     40: eval  CrossEntropyLoss |  0.35614920
Step     40: eval          Accuracy |  0.68750000

Step     50: Ran 10 t

In [None]:
def compute_accuracy(preds, y, y_weights):
  """Compute the weighted accuracy of two-class predictions.

  Args:
    preds: 2-D array indexed as ``preds[:, 0]`` (negative score) and
      ``preds[:, 1]`` (positive score), one row per example.
    y: array of true labels (1 = positive, 0 = negative).
    y_weights: array with one weight per example.

  Returns:
    ``(accuracy, weighted_num_correct, sum_weights)`` where ``accuracy`` is
    ``weighted_num_correct / sum_weights``.
  """
  # An example is predicted positive when its positive score is strictly larger.
  predicted_label = (preds[:, 1] > preds[:, 0]).astype(np.int32)
  hits = (predicted_label == y).astype(np.float32)

  weighted_num_correct = np.sum(hits * y_weights)
  sum_weights = np.sum(y_weights)

  return weighted_num_correct / sum_weights, weighted_num_correct, sum_weights

In [None]:
def test_model(generator, model):
  """Compute the model's overall weighted accuracy over a batch generator.

  Args:
    generator: iterable of batches; each batch is indexed as
      ``(inputs, targets, example_weights)``.
    model: callable mapping ``inputs`` to predictions accepted by
      ``compute_accuracy``.

  Returns:
    Total weighted number of correct predictions divided by the total weight,
    accumulated across all batches.
  """
  total_num_correct = 0
  total_num_pred = 0

  for batch in generator:
    inputs, targets, example_weight = batch[0], batch[1], batch[2]

    # The per-batch accuracy is not needed; only the running totals are kept.
    _, batch_num_correct, batch_num_pred = compute_accuracy(
        model(inputs), targets, example_weight)

    total_num_correct += batch_num_correct
    total_num_pred += batch_num_pred

  return total_num_correct / total_num_pred

In [None]:
# Rebind the global `model` to the loop's eval_model; predict() reads this
# global when it is called later.
model = training_loop.eval_model
# test_generator uses loop=False, so the iteration ends when the validation
# data runs out.
accuracy = test_model(test_generator(16), model)

print(f'The accuracy of your model on the validation set is {accuracy:.4f}', )

The accuracy of your model on the validation set is 0.9940


In [None]:
def predict(sentence):
  """Classify one sentence with the global ``model``.

  Args:
    sentence: raw text, encoded with ``tweet_to_tensor`` and the global
      ``Vocab``.

  Returns:
    ``(preds, sentiment)``: ``preds`` is 1 when the model's score at index 1
    is strictly greater than the score at index 0, otherwise 0; ``sentiment``
    is "positive" for 1 and "negative" for 0.
  """
  token_ids = np.array(tweet_to_tensor(sentence, vocab_dict=Vocab))
  # Add a leading axis so the single sentence forms a batch of one.
  scores = model(token_ids[None, :])

  preds = int(scores[0, 1] > scores[0, 0])
  sentiment = "positive" if preds == 1 else "negative"

  return preds, sentiment

In [None]:
# Check predict() on two hand-written sentences that are not taken from the
# tweet corpus.
# try a positive sentence
sentence = "It's such a nice day, think i'll be taking Sid to Ramsgate fish and chips for lunch at Peter's fish factory and then the beach maybe"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

print()
# try a negative sentence
sentence = "I hated my day, it was the worst, I'm so sad."
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

The sentiment of the sentence 
***
"It's such a nice day, think i'll be taking Sid to Ramsgate fish and chips for lunch at Peter's fish factory and then the beach maybe"
***
is positive.

The sentiment of the sentence 
***
"I hated my day, it was the worst, I'm so sad."
***
is negative.


In [None]:
# Same check as before, this time with much shorter sentences.
# try a positive sentence
sentence = "It's such a nice day"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

print()
# try a negative sentence
sentence = "That movie was garbage"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

The sentiment of the sentence 
***
"It's such a nice day"
***
is positive.

The sentiment of the sentence 
***
"That movie was garbage"
***
is negative.
