In [1]:
import tensorflow as tf
print("tf version - {}".format(tf.__version__))
from tensorflow import keras
import numpy as np
print("numpy version - {}".format(np.__version__))

tf version - 2.0.0
numpy version - 1.16.1


### get data

In [2]:
data = keras.datasets.imdb

In [3]:
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=88_000)

In [4]:
train_data.shape, test_data.shape

((25000,), (25000,))

In [5]:
print(test_data[0], sep=',')

[1, 591, 202, 14, 31, 6, 717, 10, 10, 18142, 10698, 5, 4, 360, 7, 4, 177, 5760, 394, 354, 4, 123, 9, 1035, 1035, 1035, 10, 10, 13, 92, 124, 89, 488, 7944, 100, 28, 1668, 14, 31, 23, 27, 7479, 29, 220, 468, 8, 124, 14, 286, 170, 8, 157, 46, 5, 27, 239, 16, 179, 15387, 38, 32, 25, 7944, 451, 202, 14, 6, 717]


In [6]:
word_index = data.get_word_index()

In [7]:
word_index = {k:(v+3) for k, v in word_index.items()}

In [8]:
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

In [9]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [10]:
def decode_review(text):
    return " ".join([reverse_word_index.get(i, "?") for i in text])

In [11]:
decode_review(test_data[0])

"<START> please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss"

In [12]:
print(len(test_data[1]), len(test_data[2]))

260 603


In [13]:
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], padding="post", maxlen=250)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], padding="post", maxlen=250)

In [14]:
print(len(test_data[1]), len(test_data[2]))

250 250


### build the model

In [15]:
model = keras.Sequential()
model.add(keras.layers.Embedding(88000, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          1408000   
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 1,408,289
Trainable params: 1,408,289
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [18]:
x_val = train_data[:10_000]
x_train = train_data[10_000:]
y_val = train_labels[:10_000]
y_train = train_labels[10_000:]

### train the model

In [19]:
epochs = 40
batch_size=512

In [20]:
fit_model = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
                     validation_data=(x_val, y_val), verbose=1)

Train on 15000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [21]:
results = model.evaluate(test_data, test_labels)



In [22]:
results

[0.33821725542545317, 0.87068]

In [23]:
test_review = test_data[0]
predict = model.predict([test_review])
print("Review: {}".format(decode_review(test_review)))
print("Prediction: {}".format(predict[0]))
print("Actual: {}".format(test_labels[0]))

Review: <START> please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

In [24]:
test_review = test_data[10]
predict = model.predict([test_review])
print("Review: {}".format(decode_review(test_review)))
print("Prediction: {}".format(predict[10]))
print("Actual: {}".format(test_labels[10]))

Review: <START> inspired by hitchcock's strangers on a train concept of two men swapping murders in exchange for getting rid of the two people messing up their lives throw momma from the train is an original and very inventive comedy take on the idea it's a credit to danny devito that he both wrote and starred in this minor comedy gem br br anne ramsey is the mother who inspires the film's title and it's understandable why she gets under the skin of danny devito with her sharp tongue and relentlessly putting him down for any minor infraction billy crystal is the writer who's wife has stolen his book idea and is now being lionized as a great new author even appearing on the oprah show to in adulation he should be enjoying thus devito gets the idea of swapping murders to rid themselves of these nuisance factors br br of course everything and anything can happen when writer carl reiner lets his imagination roam with unending ideas for how the plot develops and it's amusing all the way thr

### Save/Load the model

In [25]:
name = "text_classification_model.h5"

In [26]:
model.save(name)

In [27]:
model = keras.models.load_model(name)

### Run the model on own review

In [28]:
def review_encode(s):
    encoded = [1]
    
    for word in s:
        if word.lower() in word_index:
            encoded.append(word_index[word.lower()])
        else:
            encoded.append(2)
    return encoded

In [31]:
with open("review.txt", encoding="utf-8") as f:
    for line in f.readlines():
        nline = line.replace(",","").replace(".","").replace("(","").replace(")","").replace(":","").replace("\"", "").strip(" ")
        encode = review_encode(nline)
        encode = keras.preprocessing.sequence.pad_sequences([encode], value=word_index["<PAD>"], padding="post", maxlen=250)
        predict = model.predict(encode)
        print(line)
        print(encode)
        print(predict[0])

Most of the time movies are anticipated like this they end up falling short, way short. Joker is the first time I was more than happy with the hype. Please ignore the complaints of "pernicious violence" as they are embarrassing to say the least. We haven't seen a comic movie this real before. If we ever "deserved" a better class of criminal - Phillips and Phoenix have delivered. This is dark, Joker IS dark and you will fall in love with the villain as you should. The bad guys are always more romantic anyway.
[[   2  590  963  963 3363    2    6    2 1148 1604 1983   13 1148    2
  1983 1604 1964   13  963    2  830 2023   13  590    2 1479  963    6
  2014    2  503  963 1209 1604 1479  963    2   13 1209    2 1992  963
     2  963 1964  963 1479    2 1095  963  590  963 1479 1964  963 1095
     2    6    2  503  963  830  830  963 1479    2 1148 2014    6  590
   590    2 1604 1209    2 1148 1479   13 1983   13 3363    6 2014    2
     2    2 1657 2023   13 2014 2014   13 1657  590   