In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.optimizers import Adam

In [2]:
imdb, info = tfds.load("imdb_reviews", with_info = True, as_supervised=True)



In [3]:
train_data, test_data = imdb['train'], imdb['test']

train_sentences = []
train_labels = []
test_sentences = []
test_labels = []

for data, labels in imdb['train']:
    train_sentences.append(data.numpy().decode('utf8'))
    train_labels.append(labels.numpy())

for data, labels in imdb['test']:
    test_sentences.append(data.numpy().decode('utf8'))
    test_labels.append(labels.numpy())

In [4]:
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [81]:
print('Train set: ', len(train_sentences))
print('Train set: ', len(test_sentences))

Train set:  25000
Train set:  25000


In [5]:
vocab_size = 10000
embedding_dim = 16
maxlen = 120
truc_type = 'post'
oov_tok = '<OOV>'

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [6]:
pad_train_sequences = pad_sequences(train_sequences, maxlen = maxlen,
                                   truncating = truc_type)
pad_test_sequences = pad_sequences(test_sequences, maxlen = maxlen,
                                  truncating = truc_type)

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim = vocab_size, 
                              output_dim = embedding_dim,
                             input_length = maxlen),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units = 10, activation = 'relu', 
                         input_shape=(maxlen,)),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])
model.summary()

optimizers = Adam()
num_epochs = 30
model.compile(optimizer = optimizers, loss='binary_crossentropy',
             metrics=['accuracy'])
model.fit(x = pad_train_sequences, y = train_labels, epochs=num_epochs, verbose=2)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_3 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 10)                19210     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 11        
Total params: 179,221
Trainable params: 179,221
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples
Epoch 1/30
25000/25000 - 2s - loss: 0.4917 - accuracy: 0.7393
Epoch 2/30
25000/25000 - 1s - loss: 0.2293 - accuracy: 0.9136
Epoch 3/30
25000/25000 - 1s - loss: 0.0716 - accuracy: 0.9820
Epoch 4/30
25000/25000 - 1s - loss: 0

<tensorflow.python.keras.callbacks.History at 0x1c213bda0f0>

In [22]:
reverse_word_index = dict([(value, key) for (key, value) in tokenizer.word_index.items()])

def decode_sequences_to_sentences(sequences):
    print(' '.join([reverse_word_index.get(seq, '?') for seq in sequences]))

In [27]:
print(test_sequences[0])
print(decode_sequences_to_sentences(test_sequences[0]))

[205, 294, 2, 18, 163, 448, 209, 3, 49, 11, 179, 6, 133, 42, 10, 7, 2, 62, 153, 13, 91, 12, 18, 310, 4216, 14, 2, 1, 1, 1047, 237, 425, 162, 19, 13, 22, 58, 146, 708, 2179, 770, 1, 1, 3, 47, 5, 132, 1060, 1430, 31, 209, 9, 2, 18, 1, 7, 405, 53, 1018, 205, 108, 4, 169, 2632, 27, 91, 3, 75, 141, 58, 149, 12, 7, 241, 5, 187, 18, 238, 398, 64, 50, 290, 9, 210, 100, 3, 80, 9, 3279, 97, 11, 2133, 89, 5, 23, 30, 217, 1, 1, 2, 18, 3, 246, 199, 146, 105, 60, 138, 16, 33, 460, 30, 144, 2145, 133, 7, 23, 126, 104, 44, 16, 2, 160, 493, 580, 44, 37, 1, 93, 488, 65, 399, 2, 50, 155, 1, 74, 117]
i've watched the movie actually several times and what i want to say about it is the only thing that made this movie high rank was the <OOV> <OOV> incredible performance absolutely nothing but that not even those silly model named <OOV> <OOV> and some of these popular names at times in the movie <OOV> is definitely very talented i've seen a few jobs he made and been through even though this is kind of horror 

In [39]:
embed_layer = model.layers[0]
embed_layer_weights = embed_layer.get_weights()[0]
print(embed_layer_weights)
print(embed_layer_weights.shape)

[[-0.01452139  0.02612004  0.02090387 ...  0.02429333 -0.04124185
   0.04462695]
 [-0.05363107  0.02032807  0.06828014 ...  0.08385263 -0.03166177
   0.02666487]
 [-0.04383694  0.03493666  0.05708929 ...  0.02735372  0.00864338
   0.08437771]
 ...
 [ 0.00169956  0.10563534  0.12230495 ...  0.03065151  0.04660634
  -0.08699423]
 [-0.07812002 -0.04641258 -0.01522218 ...  0.11056425 -0.09137205
   0.03208904]
 [ 0.05956979  0.11263913  0.21002641 ...  0.05550042  0.06000242
  -0.06239458]]
(10000, 16)


In [49]:
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for i in range(1, vocab_size):
    word = reverse_word_index.get(i)
    weight = embed_layer_weights[i]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in weight]) + "\n")

out_m.close()
out_v.close()

In [77]:
# Prediction
sentence = ['I hate this film so much. It is such a bad film ever']
sequence = tokenizer.texts_to_sequences(sentence)
pad_sequence =  pad_sequences(sequence, maxlen = maxlen,
                                   truncating = truc_type)
print(pad_sequence)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  11
  781  12  20  36  74  10   7 139   4  76  20 124]]


In [78]:
print(model.predict(pad_sequence))
if model.predict(pad_sequence)[0][0] > 0.5:
    print('Good movie')
else:
    print('Bad movie')

[[0.00144017]]
Bad movie


In [65]:
test_labels[9]

0