Importing necessary libraries

In [1]:
import tensorflow as tf

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow.keras as keras

Importing the data files

In [3]:
# Load the labelled reviews: one CSV for training, one held out for testing.
imdb_reviews = pd.read_csv('imdb_reviews.csv')
test_reviews = pd.read_csv('test_reviews.csv')

In [4]:
# Preview the first five training rows (review text and sentiment label).
imdb_reviews.head(n=5)

Unnamed: 0,Reviews,Sentiment
0,<START this film was just brilliant casting lo...,positive
1,<START big hair big boobs bad music and a gian...,negative
2,<START this has to be one of the worst films o...,negative
3,<START the <UNK> <UNK> at storytelling the tra...,positive
4,<START worst mistake of my life br br i picked...,negative


In [5]:
# Preview the first five test rows (review text and sentiment label).
test_reviews.head(n=5)

Unnamed: 0,Reviews,Sentiment
0,<START please give this one a miss br br <UNK>...,negative
1,<START this film requires a lot of patience be...,positive
2,<START many animation buffs consider <UNK> <UN...,positive
3,<START i generally love this type of movie how...,negative
4,<START like some other people wrote i'm a die ...,positive


Preprocessing of the data

In [6]:
# Load the vocabulary table that pairs each word with its integer index.
word_index = pd.read_csv('word_indexes.csv')

In [7]:
# Preview the first five vocabulary rows (word and index columns).
word_index.head(n=5)

Unnamed: 0,Words,Indexes
0,tsukino,52009
1,nunnery,52010
2,sonja,16819
3,vani,63954
4,woods,1411


Convert the word-index table into a Python dictionary that maps each word (string) to its integer index, so reviews can be turned into integer sequences and fed to the machine learning model.

In [8]:
# Turn the vocabulary table into a {word: index} lookup dictionary.
# Note: this rebinds `word_index` from a DataFrame to a dict, so the cell
# cannot be re-run without first re-running the cell that loads the CSV.
word_index = {
    word: idx
    for word, idx in zip(word_index["Words"], word_index["Indexes"])
}

In [9]:
# Register the special tokens under the lowest ids. "<START" is spelled without
# a closing ">" on purpose: that is how the marker appears in the review text.
word_index.update({
    "<PAD>": 0,
    "<START": 1,
    "<UNK>": 2,
    "<UNUSED>": 3,
})

Create a function that encodes reviews into integer format

In [10]:
def review_encoder(text, vocab=None, unknown_token="<UNK>"):
    """Encode a tokenised review as a list of integer word ids.

    Parameters
    ----------
    text : iterable of str
        The review already split into tokens. Passing a raw string would
        iterate over its characters, so split the review first.
    vocab : dict, optional
        Mapping from token to integer id. Defaults to the notebook-level
        ``word_index`` dictionary.
    unknown_token : str, optional
        Token whose id is substituted for any word missing from ``vocab``.

    Returns
    -------
    list
        One id per token, in the original order.

    Raises
    ------
    KeyError
        If ``unknown_token`` itself is not present in ``vocab``.
    """
    if vocab is None:
        vocab = word_index
    # Out-of-vocabulary words fall back to the unknown-token id instead of
    # aborting the whole encoding pass with a KeyError.
    unknown_id = vocab[unknown_token]
    return [vocab.get(word, unknown_id) for word in text]

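Before training the model, we separate the inputs (reviews) from the labels (sentiment). The data is already split into training and test sets, stored in two separate files.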

In [11]:
# Separate the inputs (review text) from the targets (sentiment) for each set.
train_data = imdb_reviews['Reviews']
train_labels = imdb_reviews['Sentiment']

test_data = test_reviews['Reviews']
test_labels = test_reviews['Sentiment']

In [12]:
# Split every review on whitespace into a list of tokens, using the
# vectorised string accessor rather than a per-row Python lambda.
train_data = train_data.str.split()
test_data = test_data.str.split()

In [13]:
# Replace each token list with the matching list of integer word ids.
train_data = train_data.map(review_encoder)
test_data = test_data.map(review_encoder)

In [14]:
# Inspect the first five encoded training reviews.

train_data.head(n=5)

0    [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, ...
1    [1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463,...
2    [1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5...
3    [1, 4, 2, 2, 33, 2804, 4, 2040, 432, 111, 153,...
4    [1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 1...
Name: Reviews, dtype: object

Now encode positive sentiment as 1 and negative sentiment as 0

In [20]:
def encode_sentiments(x):
    """Map a sentiment label to a binary target.

    Returns 1 when ``x`` equals the string 'positive' and 0 for any other value.
    """
    return 1 if x == 'positive' else 0

In [21]:
# Convert the textual sentiment labels into 0/1 targets.
train_labels = train_labels.map(encode_sentiments)
test_labels = test_labels.map(encode_sentiments)

In [22]:
# Bring every encoded review to a fixed length of 500 ids so the reviews can be
# stacked into one array; shorter reviews get the <PAD> id appended at the end.
pad_options = dict(value=word_index["<PAD>"], padding='post', maxlen=500)
train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)

Now it's time to build the model using a neural network.

In [24]:
# Small averaging classifier, built one layer at a time:
#   1. embed each of the 500 word ids (vocabulary size 10000) as a 16-d vector,
#   2. average the vectors over the sequence,
#   3. pass the average through a 16-unit ReLU layer,
#   4. emit a single sigmoid output.
model = keras.Sequential()
model.add(keras.layers.Embedding(10000, 16, input_length=500))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [25]:
# Configure training: Adam optimizer, binary cross-entropy loss for the single
# sigmoid output, and accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Train for 30 epochs with mini-batches of 512 and keep the per-epoch metrics in `history`.
# NOTE(review): the test set is passed as validation data here and is also the
# set scored by the later evaluation cell, so both use the same examples.
history = model.fit(
    train_data,
    train_labels,
    epochs=30,
    batch_size=512,
    validation_data=(test_data, test_labels),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Now it's time to evaluate the loss and accuracy on the test data.

In [27]:
# Score the trained model on the test set; returns the loss followed by the
# accuracy metric configured at compile time.
loss, accuracy = model.evaluate(x=test_data, y=test_labels)



Our model achieves an accuracy of 88.56% on the test data.

In [28]:
# Pick one test review at random and show it together with its true label.
# The range covers every row of the test set (row 0 included) instead of a
# hard-coded 1..999, and .iloc selects by position so the same `index` also
# addresses the matching row of the padded `test_data` array.

index = np.random.randint(0, len(test_reviews))
user_review = test_reviews.iloc[index]
print(user_review)

Reviews      <START i think it was a pretty good film it sh...
Sentiment                                             positive
Name: 824, dtype: object


In [29]:
# Wrap the chosen padded review in a batch of one, run the model on it, and
# report "positive" when the predicted probability is above 0.5.
user_review = np.array([test_data[index]])
is_positive = (model.predict(user_review) > 0.5).astype("int32")
print("positive sentiment" if is_positive else "negative sentiment")


positive sentiment


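From the two results above, we can see that the model correctly predicts the sentiment of this sample review. This is a single example; the test accuracy reported earlier is the better measure of overall performance.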