In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import nltk
import re
from tensorflow.keras import models,layers
from nltk.corpus import stopwords 
from nltk import RegexpTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset paths read later live under this mount point.
drive.mount('/content/gdrive')
# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

(a). Preprocessing the data, encoding the data, and padding.

In [None]:
# Folder on the mounted Drive that holds the raw train/test text files.
DATA_DIR = '/content/gdrive/My Drive/DeepLearningProjects'

# Read each file inside a `with` block so its handle is closed as soon as the
# lines are in memory; a bare open(...).readlines() leaves the handle open.
with open(DATA_DIR + '/SentimentAnalysisTrainData.txt') as train_file:
  train_data = train_file.readlines()
with open(DATA_DIR + '/SentimentAnalysisTestData.txt') as test_file:
  test_data = test_file.readlines()

# English stop words kept as a set so the membership test in clean_Data is cheap.
stop_words = set(stopwords.words('english'))

def clean_Data(data):
  """Strip HTML-like tags, lowercase, tokenize and filter every text in `data`.

  For each text: anything matching '<.*?>' is removed, the remainder is
  lowercased and split into r'\w+' tokens, and only tokens that are purely
  alphabetic and not in the module-level `stop_words` set are kept.

  Args:
    data: iterable of raw text strings. It is NOT modified (the previous
      implementation overwrote the caller's list in place).

  Returns:
    A list with one cleaned string per input text, made of the surviving
    tokens joined by single spaces.
  """
  tag_pattern = re.compile('<.*?>')
  tokenizer = RegexpTokenizer(r'\w+')
  clean_data = []
  for text in data:
    # Work on a local copy of the text so the caller's list stays untouched.
    without_tags = tag_pattern.sub('', text)
    tokens = tokenizer.tokenize(without_tags.lower())
    kept = [word for word in tokens if word not in stop_words and word.isalpha()]
    clean_data.append(" ".join(kept))
  return clean_data

def encode(train_data,test_data,maxlen=200):
  """Turn cleaned texts into fixed-length integer sequences.

  The vocabulary is learned from the training texts only, so nothing about the
  test set leaks into the encoding. Words that occur only in the test texts are
  not in the vocabulary and are therefore dropped by `texts_to_sequences`.

  Args:
    train_data: list of cleaned training strings.
    test_data: list of cleaned test strings.
    maxlen: length every sequence is padded/truncated to (default 200, the
      value that used to be hard-coded).

  Returns:
    Tuple `(vocab_size, padded_train, padded_test)` where `vocab_size` is the
    number of distinct training words plus one (index 0 is the padding value)
    and the two arrays hold one row of `maxlen` word indices per text.
  """
  tokenizer = Tokenizer()
  # Fit on training text only; fitting on train+test is a form of data leakage.
  tokenizer.fit_on_texts(train_data)
  train_encoded = tokenizer.texts_to_sequences(train_data)
  test_encoded = tokenizer.texts_to_sequences(test_data)
  # padding='post' appends zeros after the text for short sequences.
  padded_train = tf.keras.preprocessing.sequence.pad_sequences(train_encoded,padding='post',maxlen=maxlen)
  padded_test = tf.keras.preprocessing.sequence.pad_sequences(test_encoded,padding='post',maxlen=maxlen)
  # +1 because word indices start at 1; 0 is reserved for padding.
  vocab_size = len(tokenizer.word_index)+1
  return (vocab_size,padded_train,padded_test)

# Clean the raw train and test lines, then encode both into padded index arrays.
cleaned_train = clean_Data(train_data)
cleaned_test = clean_Data(test_data)
(vocab_size, x_train, x_test) = encode(cleaned_train, cleaned_test)

(b). Creating the target vectors for training and test data.

In [None]:
# Binary targets as NumPy arrays (Keras fit/evaluate in TF 2.x do not accept a
# plain Python list of ints as `y`). The first 12500 entries are labelled 1 and
# the remaining 12500 are labelled 0.
# NOTE(review): presumably the data files list the positive reviews first —
# verify against the files before trusting these labels.
y_train = np.zeros(25000, dtype='float32')
y_train[:12500] = 1
y_test = np.zeros(25000, dtype='float32')
y_test[:12500] = 1

(c). Creating a validation set which is 20% of training data

In [None]:
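# This can be done while fitting the model, using the `validation_split` argument of `fit`.
# Note: Keras takes the split from the last rows of the data before shuffling, so the data must be shuffled beforehand.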

(d). Training the following models:

(e). **MODEL1**

In [None]:
# Keras carves `validation_split` off the END of the arrays before any shuffling,
# and the labels are ordered (first 12500 entries are 1, the rest 0), so an
# unshuffled fit would validate on a single class. Shuffle once, with a fixed
# seed so every model sees the same train/validation split.
shuffle_idx = np.random.RandomState(42).permutation(len(x_train))
x_fit = np.asarray(x_train)[shuffle_idx]
y_fit = np.asarray(y_train, dtype='float32')[shuffle_idx]  # fit() needs an array, not a list

# SimpleRNNCell is the Keras equivalent of the deprecated compat.v1 BasicRNNCell.
cell = layers.SimpleRNNCell(units=200,activation='tanh')
model1 = models.Sequential()
model1.add(layers.Embedding(vocab_size,128))
model1.add(layers.RNN(cell=cell))
model1.add(layers.Dense(1,activation='sigmoid'))
model1.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model1.fit(x_fit,y_fit,epochs=10,batch_size=30,validation_split=0.2,shuffle=True)

(f). **MODEL2**

In [None]:
# Keras carves `validation_split` off the END of the arrays before any shuffling,
# and the labels are ordered (first 12500 entries are 1, the rest 0), so an
# unshuffled fit would validate on a single class. Shuffle once, with a fixed
# seed so every model sees the same train/validation split.
shuffle_idx = np.random.RandomState(42).permutation(len(x_train))
x_fit = np.asarray(x_train)[shuffle_idx]
y_fit = np.asarray(y_train, dtype='float32')[shuffle_idx]  # fit() needs an array, not a list

# Embedding -> single LSTM (200 units, tanh) -> sigmoid output.
model2 = models.Sequential()
model2.add(layers.Embedding(vocab_size,128))
model2.add(layers.LSTM(units=200,activation='tanh'))
model2.add(layers.Dense(1,activation='sigmoid'))
model2.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model2.fit(x_fit,y_fit,epochs=10,batch_size=30,validation_split=0.2,shuffle=True)

(g). **MODEL3**

In [None]:
# Keras carves `validation_split` off the END of the arrays before any shuffling,
# and the labels are ordered (first 12500 entries are 1, the rest 0), so an
# unshuffled fit would validate on a single class. Shuffle once, with a fixed
# seed so every model sees the same train/validation split.
shuffle_idx = np.random.RandomState(42).permutation(len(x_train))
x_fit = np.asarray(x_train)[shuffle_idx]
y_fit = np.asarray(y_train, dtype='float32')[shuffle_idx]  # fit() needs an array, not a list

# Embedding -> single GRU (200 units, relu) -> sigmoid output.
model3 = models.Sequential()
model3.add(layers.Embedding(vocab_size,128))
model3.add(layers.GRU(units=200,activation='relu'))
model3.add(layers.Dense(1,activation='sigmoid'))
model3.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model3.fit(x_fit,y_fit,epochs=10,batch_size=30,validation_split=0.2,shuffle=True)

(h). Among models 1, 2 and 3, model 2 performs best, as it has lower loss and higher accuracy than the others.

In [None]:
# evaluate() needs array-like targets; a plain Python list of ints is rejected
# by TF 2.x. np.asarray is a no-op copy-wise if y_test is already an array.
y_eval = np.asarray(y_test, dtype='float32')

# Each evaluate() call returns [loss, accuracy] because the models were
# compiled with a single 'accuracy' metric.
(loss1,accuracy1) = model1.evaluate(x_test,y_eval)
print("model1_loss:",loss1,"model1_accuracy:",accuracy1)
(loss2,accuracy2) = model2.evaluate(x_test,y_eval)
print("model2_loss:",loss2,"model2_accuracy:",accuracy2)
(loss3,accuracy3) = model3.evaluate(x_test,y_eval)
print("model3_loss:",loss3,"model3_accuracy:",accuracy3)

(i). **MODEL4**

In [None]:
# Keras carves `validation_split` off the END of the arrays before any shuffling,
# and the labels are ordered (first 12500 entries are 1, the rest 0), so an
# unshuffled fit would validate on a single class. Shuffle once, with a fixed
# seed so every model sees the same train/validation split.
shuffle_idx = np.random.RandomState(42).permutation(len(x_train))
x_fit = np.asarray(x_train)[shuffle_idx]
y_fit = np.asarray(y_train, dtype='float32')[shuffle_idx]  # fit() needs an array, not a list

# Embedding -> two stacked LSTMs -> sigmoid output. The first LSTM returns the
# full sequence so the second LSTM receives one vector per time step.
model4 = models.Sequential()
model4.add(layers.Embedding(vocab_size,128))
model4.add(layers.LSTM(units=200,return_sequences=True,activation='tanh'))
model4.add(layers.LSTM(units=200,activation='tanh'))
model4.add(layers.Dense(1,activation='sigmoid'))
model4.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model4.fit(x_fit,y_fit,epochs=5,batch_size=30,validation_split=0.2,shuffle=True)

(j). **MODEL5**

In [None]:
# Keras carves `validation_split` off the END of the arrays before any shuffling,
# and the labels are ordered (first 12500 entries are 1, the rest 0), so an
# unshuffled fit would validate on a single class. Shuffle once, with a fixed
# seed so every model sees the same train/validation split.
shuffle_idx = np.random.RandomState(42).permutation(len(x_train))
x_fit = np.asarray(x_train)[shuffle_idx]
y_fit = np.asarray(y_train, dtype='float32')[shuffle_idx]  # fit() needs an array, not a list

# Embedding -> two stacked LSTMs -> sigmoid output.
# The keyword is `return_sequences` (plural); the misspelt `return_sequence`
# is not a valid LSTM argument and made layer construction fail.
# NOTE(review): with the keyword fixed this architecture matches MODEL4 —
# confirm what MODEL5 was meant to change.
model5 = models.Sequential()
model5.add(layers.Embedding(vocab_size,128))
model5.add(layers.LSTM(units=200,return_sequences=True,activation='tanh'))
model5.add(layers.LSTM(units=200,activation='tanh'))
model5.add(layers.Dense(1,activation='sigmoid'))
model5.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model5.fit(x_fit,y_fit,epochs=5,batch_size=30,validation_split=0.2,shuffle=True)

(l). Plotting loss vs iteration & accuracy vs iteration curves for training data for all the models.

In [None]:
def plot_training_curves(model,title):
  """Plot the training accuracy and loss recorded by `model`'s last fit().

  Args:
    model: a fitted Keras model; its `history.history` dict is read.
    title: figure-level title (e.g. "Model1").

  Returns:
    The matplotlib Figure holding the two side-by-side panels.
  """
  history = model.history.history
  # The accuracy metric is logged as 'accuracy' by TF 2.x and as 'acc' by
  # older releases; accept either so the cell works on both.
  acc_key = 'accuracy' if 'accuracy' in history else 'acc'
  (fig,axs) = plt.subplots(nrows=1,ncols=2)
  axs[0].plot(history[acc_key])
  axs[0].set_title("accuracy vs iterations")
  axs[0].set_xlabel("epoch")  # history holds one value per epoch
  axs[1].plot(history['loss'])
  axs[1].set_title("loss vs iterations")
  axs[1].set_xlabel("epoch")
  fig.suptitle(title)
  return fig

# One figure per trained model, in the same order as the original cells.
for (trained_model,model_title) in [(model1,"Model1"),(model2,"Model2"),(model3,"Model3"),(model4,"Model4"),(model5,"Model5")]:
  plot_training_curves(trained_model,model_title)
# plt.show() renders all open figures; fig.show() only warns on the inline backend.
plt.show()