In [3]:
import  os
import sys
import utils
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
# utils.imdbdataset.maybe_download_and_extract()

In [4]:
# Fetch the raw review texts and their sentiment labels for each IMDB split.
x_train_text, y_train = utils.imdbdataset.load_data(train=True)
x_test_text, y_test = utils.imdbdataset.load_data(train=False)

# Combined corpus (test reviews first, then train); the tokenizer is fit on it.
data_set = x_test_text + x_train_text

# Report how many reviews each split holds.
print("Train-set Size: ", len(x_train_text))
print("Test-set Size: ", len(x_test_text))

Train-set Size:  25000
Test-set Size:  25000


In [5]:
# Peek at one raw training review together with its label.
print(x_train_text[1])
print(y_train[1])

# Convert both label collections to NumPy arrays in a single step.
y_train, y_test = (np.array(labels) for labels in (y_train, y_test))

Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV's "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina's pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple Sylvia Miles & Beverly D'Angelo, are a diabolical lot, and Eli Wallach is great fun as a wily police detective. The movie is nearly a cross-pollination of "Rosemary's Baby" and "The Exorcist"--but what a combination! Based on the best-seller by Jeffrey Konvitz, "The Sentinel" is entertainingly spooky, full of shocks brought off well by director Michael Winner, who mounts a thoughtfully downbeat en

In [6]:
'''Tokenizer'''
# Vocabulary cap handed to the Tokenizer; the same value is reused later as
# the Embedding layer's input_dim.
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)  
# NOTE(review): data_set is the concatenation of the test and train texts, so
# the vocabulary is built from both splits — confirm this is intended.
tokenizer.fit_on_texts(data_set)

In [7]:
# Number of entries in the tokenizer's full word index; the printed value is
# far larger than num_words.
print(len(tokenizer.word_index))

124252


In [8]:
# Encode every review of each split as a list of integer token ids.
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

# Show one training review next to its encoded form.
print(x_train_text[1])
print(np.array(x_train_tokens[1]))

Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV's "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina's pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple Sylvia Miles & Beverly D'Angelo, are a diabolical lot, and Eli Wallach is great fun as a wily police detective. The movie is nearly a cross-pollination of "Rosemary's Baby" and "The Exorcist"--but what a combination! Based on the best-seller by Jeffrey Konvitz, "The Sentinel" is entertainingly spooky, full of shocks brought off well by director Michael Winner, who mounts a thoughtfully downbeat en

In [9]:
'''Padding and Truncating data'''
# As a compromise we pick one sequence length that covers most of the data,
# then truncate longer sequences and pad shorter ones.
# Token count of every review, train and test together.
num_tokens = np.array([len(tokens) for tokens in x_train_tokens + x_test_tokens])
# Average number of tokens in a sequence.
print("average number of tokens in a sequence ", num_tokens.mean())
# Largest number of tokens in any single sequence.
print(num_tokens.max())

average number of tokens in a sequence  221.27716
2209


In [10]:
# Cap the sequence length at the mean token count plus two standard
# deviations, truncated to a whole number.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
print(max_tokens)

544


In [11]:
# Fraction of reviews shorter than max_tokens (about 95% of the data-set).
np.mean(num_tokens < max_tokens)

0.94532

In [12]:
# Bring every sequence to exactly max_tokens entries.
# 'pre' puts the zeros in front of the tokens; 'post' would append them at the end.
# The same setting is also passed as the truncation mode for over-long sequences.
pad = 'pre'
x_train_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=max_tokens, padding=pad, truncating=pad)
    for sequences in (x_train_tokens, x_test_tokens)
)
print(x_train_pad.shape)
print(x_test_pad.shape)
print("Example", np.array(x_train_tokens[1]), x_train_pad[1])


(25000, 544)
(25000, 544)
Example [1153  182   17 1066   16  815 1458   18 2602   31 7960  305    4 7932
 1211   14    3  180   18  672 8327 2199   16    3 1862   35    6    5
  969   15   40 3143   31    1    5  603    1  135   16 7960   23   52
   69 1819    1 1245  207    6  399 8174    6 1313   14 4998   18   50
 7960 1121   82    3  978 5112 5679 8831   31    3 2000 1982   20    1
  342 1866  177   62  375 6226    1 5038  585    3 8728 3546 8234    2
 8464  374 7993 2080 5427   23    3 9806  169    2    6   78  245   14
    3  572 1362    1   17    6  800    3 1633    4 8796  977    2    1
 5035   18   48    3 2174  441   20    1  116   31 4427    1 8926    6
 3750  363    4 7055  831  122   69   31  164  498 2302   35    3 9317
  272   16 2788  307  230   36] [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0 

In [13]:
'''Inverse Mapping'''
# word_index maps word -> integer token; flip it so words can be looked up by token.
idx = tokenizer.word_index
inverse_map = {token: word for word, token in idx.items()}
def tokens_to_string(tokens):
    """Rebuild a space-separated text from a sequence of integer tokens.

    Tokens equal to 0 are skipped; every other token is looked up in the
    module-level ``inverse_map`` and the resulting words are joined with
    single spaces.
    """
    return " ".join(inverse_map[token] for token in tokens if token != 0)
    
# Round-trip check: print the original review followed by the text rebuilt
# from its token ids.
print("Example for reverse mapping", x_train_text[1],".......recreated this text......",
       tokens_to_string(x_train_tokens[1]))

Example for reverse mapping Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV's "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina's pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple Sylvia Miles & Beverly D'Angelo, are a diabolical lot, and Eli Wallach is great fun as a wily police detective. The movie is nearly a cross-pollination of "Rosemary's Baby" and "The Exorcist"--but what a combination! Based on the best-seller by Jeffrey Konvitz, "The Sentinel" is entertainingly spooky, full of shocks brought off well by director Michael Winner, who mount

In [14]:
'''Creating Model'''
model = Sequential()
# Each of the num_words token ids is mapped to a dense vector of this size.
embedding_size = 8
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

# A GRU that feeds another GRU must return the full sequence of outputs,
# because the next GRU expects a sequence as its input.
model.add(GRU(units=16, return_sequences=True))
# Second GRU with 8 output units; it also feeds a GRU, so it returns sequences too.
model.add(GRU(units=8, return_sequences=True))
# Last GRU: 4 units, no return_sequences, so the Dense layer gets one vector per review.
model.add(GRU(units=4))
# Fully-connected layer with a sigmoid, giving a value between 0.0 and 1.0
# that is used as the classification output.
model.add(Dense(1, activation='sigmoid'))
# Use the `learning_rate` keyword: the older `lr` alias is deprecated and is
# rejected by newer Keras releases.
optimizer = Adam(learning_rate=1e-3)
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru (GRU)                    (None, 544, 16)           1248      
_________________________________________________________________
gru_1 (GRU)                  (None, 544, 8)            624       
_________________________________________________________________
gru_2 (GRU)                  (None, 4)                 168       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 82,045
Trainable params: 82,045
Non-trainable params: 0
_________________________________________________________________


In [None]:
'''Training RNN'''
# Train for 3 epochs in batches of 64, holding out 5% of the training reviews
# for validation (the log reports 23750 training and 1250 validation samples).
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=64)

Train on 23750 samples, validate on 1250 samples
Epoch 1/3


In [1]:
# NOTE(review): this cell carries execution count 1 and failed with NameError
# because `model` did not exist in that kernel session; the model-building
# cell has to be run before this one.
model.summary() 

NameError: name 'model' is not defined