#Design of LSTM and GRU RNN for classification of IMDB reviews

### Step-1:

In [None]:
import pandas as pd    
import numpy as np    
from nltk.corpus import stopwords   
from sklearn.model_selection import train_test_split      
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences   
from tensorflow.keras.models import Sequential    
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GRU
from tensorflow.keras.callbacks import ModelCheckpoint 
from tensorflow.keras.models import load_model   
import re

In [None]:
data = pd.read_csv('/content/IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
import nltk
nltk.download('stopwords')
english_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Step-2:

In [None]:
def load_dataset():
    df = pd.read_csv('IMDB Dataset.csv')
    x_data = df['review']      
    y_data = df['sentiment']    

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops]) 
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


### Step-3:

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.4)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
27950    [absolute, masterpiece, film, goodnight, mr, t...
20530    [countenance, antoine, monot, copycat, imperso...
8389     [i, recently, saw, blind, spot, coyoacan, drew...
17509    [this, film, pretty, poor, the, acting, abysma...
46320    [i, cannot, believe, film, i, like, i, usually...
                               ...                        
13625    [i, looking, forward, seeing, bruce, willis, e...
1147     [prison, often, brought, conversations, best, ...
36009    [watching, preview, armored, i, thought, movie...
19023    [david, lean, worst, film, even, in, which, we...
39135    [like, previous, commentator, film, i, found, ...
Name: review, Length: 30000, dtype: object 

23036    [may, contain, spoilers, the, titular, topless...
3856     [one, oddest, strikingly, eerie, creepy, horro...
49589    [as, member, cast, i, member, band, basketball...
11280    [oh, my, god, idiotic, completely, pointless, ...
43509    [tu, pa, tam, one, worst, movies, i, ever, see...
 

In [None]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [None]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 1537   798     4 ...     0     0     0]
 [26367 10313 50841 ...     0     0     0]
 [    1   936   117 ...     0     0     0]
 ...
 [   65  4079 18720 ...     0     0     0]
 [  484  5896   158 ...     0     0     0]
 [    6   822  9274 ...  3165  2383    34]] 

Encoded X Test
 [[  107  2819   955 ...  2440     0     0]
 [    5 19126 10773 ...  8981 13868   306]
 [  108  1498    88 ...     0     0     0]
 ...
 [ 3489 18184  2066 ...  2224    68  1860]
 [    1  1571    19 ...     0     0     0]
 [10487  3264     9 ...  1038   137   178]] 

Maximum review length:  130


### Step-4:

In [None]:
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))#input_lenght = review oda len
model.add(LSTM(LSTM_OUT))                                              #total_words = no .of unique words
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2627264   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,654,209
Trainable params: 2,654,209
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [None]:
model.fit(x_train, y_train, batch_size = 128, epochs = 10, callbacks=[checkpoint])

Epoch 1/10
Epoch 1: accuracy improved from -inf to 0.71403, saving model to models/LSTM.h5
Epoch 2/10
Epoch 2: accuracy improved from 0.71403 to 0.92153, saving model to models/LSTM.h5
Epoch 3/10
Epoch 3: accuracy improved from 0.92153 to 0.96503, saving model to models/LSTM.h5
Epoch 4/10
Epoch 4: accuracy improved from 0.96503 to 0.98050, saving model to models/LSTM.h5
Epoch 5/10
Epoch 5: accuracy improved from 0.98050 to 0.98843, saving model to models/LSTM.h5
Epoch 6/10
Epoch 6: accuracy improved from 0.98843 to 0.99150, saving model to models/LSTM.h5
Epoch 7/10
Epoch 7: accuracy improved from 0.99150 to 0.99397, saving model to models/LSTM.h5
Epoch 8/10
Epoch 8: accuracy did not improve from 0.99397
Epoch 9/10
Epoch 9: accuracy did not improve from 0.99397
Epoch 10/10
Epoch 10: accuracy did not improve from 0.99397


<keras.callbacks.History at 0x7f2960c1e4d0>

In [None]:
model.evaluate(x_test, y_test)



[0.9052920937538147, 0.841949999332428]

Step-5:

In [None]:
model1 = Sequential()
model1.add(Embedding(total_words, 32, input_length = max_length))
model1.add(LSTM(32))
model1.add(Dense(32, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model1.fit(x_train, y_train, batch_size = 128, epochs = 10, callbacks=[checkpoint])
model1.evaluate(x_test, y_test)
print(model1.summary())

Epoch 1/10
Epoch 1: accuracy did not improve from 0.99397
Epoch 2/10
Epoch 2: accuracy did not improve from 0.99397
Epoch 3/10
Epoch 3: accuracy did not improve from 0.99397
Epoch 4/10
Epoch 4: accuracy did not improve from 0.99397
Epoch 5/10
Epoch 5: accuracy did not improve from 0.99397
Epoch 6/10
Epoch 6: accuracy did not improve from 0.99397
Epoch 7/10
Epoch 7: accuracy did not improve from 0.99397
Epoch 8/10
Epoch 8: accuracy did not improve from 0.99397
Epoch 9/10
Epoch 9: accuracy improved from 0.99397 to 0.99557, saving model to models/LSTM.h5
Epoch 10/10
Epoch 10: accuracy improved from 0.99557 to 0.99713, saving model to models/LSTM.h5
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 130, 32)           2627264   
                                                                 
 lstm_3 (LSTM)               (None, 32)                8320 

### Step-6:

In [None]:
model2 = Sequential()
model2.add(Embedding(total_words, 32, input_length = max_length))
model2.add(Bidirectional(LSTM(32)))
model2.add(Dense(32, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model2.fit(x_train, y_train, batch_size = 128, epochs = 10)
model2.evaluate(x_test, y_test)
print(model2.summary())

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 130, 32)           2627264   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,646,017
Trainable params: 2,646,017
Non-trainable params: 0
_________________________________________________________________

In [None]:
model2.evaluate(x_test, y_test)



[0.8363948464393616, 0.8522999882698059]

In [None]:
from keras.preprocessing.text import Tokenizer

t  = Tokenizer()
# Defining 4 document lists
fit_text = ['Machine Learning Knowledge',
	    'Machine Learning',
            'Deep Learning',
            'Artificial Intelligence']
t.fit_on_texts(fit_text)


None


In [None]:
from keras.preprocessing.text import Tokenizer

t  = Tokenizer()
# Defining 4 document lists
fit_text = ['Machine Learning Knowledge',
	    'Machine Learning',
            'Deep Learning',
            'Artificial Intelligence']