# Machine Learning - Sentiment Analysis IMDb Dataset (using LSTM, GRU)

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame()
df = pd.read_csv('/content/sample_data/movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [3]:
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [4]:
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [5]:
# summarize size
print("Training data: ")
print(X.shape)
print(y.shape)

Training data: 
(50000,)
(50000,)


In [6]:
import tensorflow as tf
import tensorflow.keras as keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [11]:
tokenizer_obj = Tokenizer()
total_reviews = X_train + X_test
tokenizer_obj.fit_on_texts(total_reviews) 

# pad sequences
max_length = 100 # try other options like mean
# define vocabulary size
vocab_size = len(tokenizer_obj.word_index) + 1
print(tokenizer_obj.word_index)

X_train_tokens =  tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

#print(X_train_tokens[0])
#print(X_test_tokens[0])

X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')



In [12]:
print(vocab_size)

125602


In [13]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers import Embedding

EMBEDDING_DIM = 100

print('Build model...')

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
print(model.summary())

Build model...
Summary of the built model...
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          12560200  
                                                                 
 gru (GRU)                   (None, 32)                12864     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 12,573,097
Trainable params: 12,573,097
Non-trainable params: 0
_________________________________________________________________
None


In [28]:
print('Train...')

model.fit(X_train_pad, y_train, batch_size=128, epochs=5, validation_data=(X_test_pad, y_test), verbose=2)

Train...
Epoch 1/5
196/196 - 89s - loss: 0.6936 - accuracy: 0.4990 - val_loss: 0.6937 - val_accuracy: 0.4967 - 89s/epoch - 455ms/step
Epoch 2/5
196/196 - 89s - loss: 0.6743 - accuracy: 0.5939 - val_loss: 0.7148 - val_accuracy: 0.5021 - 89s/epoch - 454ms/step
Epoch 3/5
196/196 - 90s - loss: 0.5801 - accuracy: 0.6809 - val_loss: 0.7838 - val_accuracy: 0.5049 - 90s/epoch - 459ms/step
Epoch 4/5
196/196 - 85s - loss: 0.4590 - accuracy: 0.7626 - val_loss: 0.8790 - val_accuracy: 0.5050 - 85s/epoch - 433ms/step
Epoch 5/5
196/196 - 84s - loss: 0.3620 - accuracy: 0.8201 - val_loss: 1.0537 - val_accuracy: 0.5064 - 84s/epoch - 430ms/step


<keras.callbacks.History at 0x7f44a1251730>

In [29]:
print('Testing...')
score, acc = model.evaluate(X_test_pad, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)

print("Accuracy: {0:.2%}".format(acc))

Testing...
Test score: 1.053746223449707
Test accuracy: 0.5063999891281128
Accuracy: 50.64%


In [30]:
#Let us test some  samples
test_sample_1 = "This movie is fantastic! I really like it because it is so good!"
test_sample_2 = "bad movie!"
test_sample_3 = "Maybe I like this movie."
test_sample_4 = "Not to my taste, will skip and watch another movie"
test_sample_5 = "if you like action, then this movie might be good for you."
test_sample_6 = "Bad movie!"
test_sample_7 = "Not a good movie!"
test_sample_8 = "This movie really sucks! Can I get my money back please?"
test_samples = [test_sample_1, test_sample_2, test_sample_3, test_sample_4, test_sample_5, test_sample_6, test_sample_7, test_sample_8]

test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)

#predict
model.predict(x=test_samples_tokens_pad)



array([[0.3237262 ],
       [0.447253  ],
       [0.53706956],
       [0.48205608],
       [0.6277348 ],
       [0.447253  ],
       [0.47621563],
       [0.870719  ]], dtype=float32)

In [31]:
#let us check how the model predicts
classes = model.predict(X_test_pad[:10], batch_size=128)
for i in range (0,10):
    if(classes[i] > 0.5 and y_test[i] == 1 or (classes[i] <= 0.5 and y_test[i] == 0)):
        print( classes[i], y_test[i], " Right prdiction")
    else :
        print( classes[i], y_test[i], " Wrong prdiction")

[0.72956336] 0  Wrong prdiction
[0.3953858] 1  Wrong prdiction
[0.20693061] 1  Wrong prdiction
[0.6774869] 0  Wrong prdiction
[0.40631208] 1  Wrong prdiction
[0.2742338] 1  Wrong prdiction
[0.47846302] 1  Wrong prdiction
[0.01000124] 0  Right prdiction
[0.93833506] 0  Wrong prdiction
[0.2622901] 1  Wrong prdiction


In [32]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.utils import pad_sequences

# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

max_words = 500
X_train = pad_sequences(X_train, maxlen=max_words)
X_test = pad_sequences(X_test, maxlen=max_words)

print('Build model...')

model = Sequential()
model.add(Embedding(top_words, 100, input_length=max_words))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Build model...
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 500, 100)          500000    
                                                                 
 lstm_1 (LSTM)               (None, 32)                17024     
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 517,057
Trainable params: 517,057
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
print('Train...')

model.fit(X_train, y_train, batch_size=128, epochs=5, validation_data=(X_test, y_test), verbose=2)

Train...
Epoch 1/5
196/196 - 262s - loss: 0.4594 - accuracy: 0.7800 - val_loss: 0.3169 - val_accuracy: 0.8675 - 262s/epoch - 1s/step
Epoch 2/5


In [None]:
score, acc = model.evaluate(X_test, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)
print("Accuracy: %.2f%%" % (acc*100))

Test score: 0.5993069805335999
Test accuracy: 0.839160000038147
Accuracy: 83.92%


The time to train a GRU is less than LSTM network.

In [24]:
#Let us test some  samples
test_sample_1 = "This movie is fantastic! I really like it because it is so good!"
test_sample_2 = "Good movie!"
test_sample_3 = "Maybe I like this movie."
test_sample_4 = "Not to my taste, will skip and watch another movie"
test_sample_5 = "if you like action, then this movie might be good for you."
test_sample_6 = "Bad movie!"
test_sample_7 = "Not a good movie!"
test_sample_8 = "This movie really sucks! Can I get my money back please?"
test_samples = [test_sample_1, test_sample_2, test_sample_3, test_sample_4, test_sample_5, test_sample_6, test_sample_7, test_sample_8]

test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)

#predict
model.predict(x=test_samples_tokens_pad)

ValueError: ignored