In [15]:
import tensorflow as tf
from keras.datasets import imdb
from keras.preprocessing import sequence
import os
import numpy as np 

VOCAB_SIZE = 88584

MAX_LEN = 250
BATCH_SIZE = 64

(train_data, train_labels),(test_data, test_labels) = imdb.load_data(num_words= VOCAB_SIZE)

The IMDB movie review from keras contains 25 000 reviews from IMDB where each on is already preprocesses and has a label as either positive or negative. Each review is encoded by integers that represents how common a word is in the entire dataset. For example, if a word is encoded by the integer 3, it means that it is the 3rd most common word in the dataset.

In [16]:
print(len(train_data[0]))
print(len(train_data[1]))

218
189


Our reviews have different lenghts which is an issue. We cannot pass different length data into our neural network.
- If the review is greater than 250 words then we trim off extra words
- If the review is less than 250 we add necessary amount of 0's to make it equal to 250.
We pad the reviews.

In [17]:
train_data = sequence.pad_sequences(train_data, MAX_LEN)
test_data = sequence.pad_sequences(test_data, MAX_LEN)
print(len(train_data[0]))
print(len(train_data[1]))

250
250


### Creating the model
We'll use a word embedding layer as the first layer in our model and add a LSTM layer afterwards that feeds into a dense node to get our predicted sentiment

32 stands for the output dimension of the vectors generated by the embedding layer.

In [18]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [19]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          2834688   
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2843041 (10.85 MB)
Trainable params: 2843041 (10.85 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["acc"])
history = model.fit(train_data, train_labels, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
results = model.evaluate(test_data, test_labels)
print(results)

[0.4974336624145508, 0.8568000197410583]


Since the data has been preprocessed when we gave it to our model this means we need to process anything we want to make a prediction on in the exact same way. Same lookup table, encode it precisely the same. If we don't, the model is going to think it is looking at different words and will not make an accurate prediction.

In [22]:
word_index = imdb.get_word_index()

def encode_text(text):
    tokens = tf.keras.preprocessing.text.text_to_word_sequence(text) # tokens = individual words
    tokens = [word_index[word] if word in word_index else 0 for word in tokens]
    
    return sequence.pad_sequences([tokens],MAX_LEN)[0]

text = "the movie was so amazing"
encoded = encode_text(text=text)
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   1  1

In [23]:
# Decoding

reverse_word_index = {value: key for (key,value) in word_index.items()}

def decode_integers(integers):
    PAD = 0
    text = ""
    for num in integers:
        if num != PAD:
            text += reverse_word_index[num] + " "
    
    return text[:-1]

print(decode_integers(encoded))

the movie was so amazing


In [25]:
# prediction

def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1,250))
    pred[0] = encoded_text
    result = model.predict(pred)
    print(result[0])

positive_review = "Wow the movie was so amazing, I loved it!"
predict(positive_review)

negative_review = "The movie was terrible, I will never watch this again!" # very bad model, gives too high values lmao.
predict(negative_review)

[0.84790605]
[0.79853654]
