To speed up training -

 1) replace standard LSTM to CuDNNLSTM. 
  Cudnn LSTM is said to be absurdly faster than the "regular" version.
 
 2) Increase batch_size : 4 to 32
 
 3) Increase learning rate : 0.0001 t0 0.01

# Load Data

In [1]:
#Cloud Object Storage setup    
#import ibm_boto3
#from ibm_botocore.client import Config
#import os
#import json
#import warnings
#import time

In [1]:
import numpy as np
import keras
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, recurrent, Embedding, Dropout
#from keras.layers.recurrent import LSTM
from keras.optimizers import Adam, RMSprop
from keras.models import Sequential
from keras.callbacks import Callback
import pickle
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from keras.utils import to_categorical

In [3]:
#By default, tensorflow pre-allocates nearly all of the available GPU memory, which is bad for a variety of use cases, 
#especially production and memory profiling.
## extra imports to set GPU options
import tensorflow as tf
from keras import backend as k
 
###################################
# TensorFlow wizardry
config = tf.ConfigProto()
 
# Don't pre-allocate memory; allocate as-needed
config.gpu_options.allow_growth = True
 
# Only allow a total of half the GPU memory to be allocated
config.gpu_options.per_process_gpu_memory_fraction = 0.8
 
# Create a session with the above options specified.
k.tensorflow_backend.set_session(tf.Session(config=config))
###################################
#Hopefully this might prevent ResourceExhaustedError that this sometimes throws.

<font color='red'> REMOVE CREDENTIALS BEFORE SHARING!!!! </font>

In [3]:
import pickle 
X_train = pickle.load(open('X_train.pickle', 'rb'))

In [4]:
#download_file_cos(cos,'y_train.pickle','y_train.pickle')
y_train = pickle.load(open('y_train.pickle', 'rb'))

In [7]:
X_train.shape

(221400, 150)

In [8]:
y_train.shape

(221400, 150)

In [9]:
X_train[0] #there are 221400 utterances and their correspoding responses.Each word has been converted to its corresponding index 
#(in accordance with (X/y)_word_to_idx.npy dictionaries)). The sequences has been padded with zeros to make all of them 150 words long
#'0' is actually the dictionary entry for EOL.The utterances have been reversed so as to avoid feeding 'zeroes' first to decoder network.

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [8]:
y_train[0]

array([   52,     2,   144,   657,   322,    32, 47842,    40,    47,
         110,    32,     1,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [5]:
X_word_to_idx = np.load('X_word_to_idx.npy')
X_idx_to_word = np.load('X_idx_to_word.npy')
y_word_to_idx = np.load('y_word_to_idx.npy')
y_idx_to_word = np.load('y_idx_to_word.npy')
parameters    = np.load('parameters.npy')

In [6]:
X_vocab_size = len(X_word_to_idx.item())
y_vocab_size = len(y_word_to_idx.item())

In [16]:
X_vocab_size

47745

In [17]:
y_vocab_size

47973

In [8]:
print(parameters.item())

{' y_max_len': 150, 'VOCAB_SIZE': 50000, 'MAX_LEN': 50, 'X_max_len': 150}


In [7]:
X_max_len =  parameters.item()['X_max_len']

In [8]:
y_max_len =  parameters.item()[' y_max_len']

In [12]:
print(X_max_len, y_max_len)

150 150


In [13]:
samples = X_train.shape[0]

In [9]:
#This implementation generates the dataset on multiple cores in real time and feed it right away to the deep learning model.
#Useful for Large datasets ref : https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
#Sequence are a safer way to do multiprocessing. 
#This structure guarantees that the network will only train once on each sample per epoch which is not the case with generators.
#The method __getitem__ should return a complete batch
class BatchGenerator(keras.utils.Sequence):
    def __init__(self, X, y, X_max_len, y_max_len, batch_size, vocabulary, shuffle=True):
        'Initialization'
        self.X = X
        self.y = y
        self.X_max_len = X_max_len
        self.y_max_len = y_max_len
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.X))
        #print(int(np.ceil(len(self.X)/self.batch_size)))

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(len(self.X)/self.batch_size))
    

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        #print("index ",index)
        #print("get item",indexes)
        # Generate data
        X_batch, y_batch = self.__data_generation(indexes)
        #print(X_batch.shape)
        #print(y_batch.shape)
        #self.display_batch(X_batch,y_batch)
        return X_batch, y_batch

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.X))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' 
        # Initialization
        X_batch = np.zeros((self.batch_size, self.X_max_len))
        y_batch = np.zeros((self.batch_size, self.y_max_len, self.vocabulary))

        # Generate data
        for i, index in enumerate(indexes):
            # Utterance
            X_batch[i,:] = self.X[index]

            # Response
            y_batch[i,:,:] = to_categorical(self.y[index], num_classes=self.vocabulary)

        return X_batch, y_batch
    
    def display_batch(self, X_batch, y_batch):
        for U_sample,R_sample in zip(X_batch, y_batch):
            #print(U_sample)
            #print(R_sample)
            R_sample = np.argmax(R_sample, axis=1)
            print("U : "+' '.join([X_idx_to_word[int(i)] for i in U_sample if i > 0][::-1]))
            print("R : "+' '.join([y_idx_to_word[int(i)] for i in R_sample if i > 0]))     

# Baseline Model : A simple encoder decoder LSTM architecture trained with Cornell Movie Database alone

In [10]:
learning_rate = 0.01
hidden_size = 500
batch_size = 32
num_epochs = 10 

In [91]:
X_train[:10,]

array([[    0,     0,     0, ...,   109,    22,    50],
       [    0,     0,     0, ...,   142,     2,    57],
       [    0,     0,     0, ..., 10158,     3,    26],
       ...,
       [    0,     0,     0, ...,     0,     0,    60],
       [    0,     0,     0, ...,    64,  2850,  9219],
       [    0,     0,     0, ...,   128,    42,  2234]])

In [102]:
X_train[:10,].shape

(10, 150)

In [11]:
train_data_generator = BatchGenerator(X_train[:10,], y_train[:10,], X_max_len, y_max_len, batch_size, y_vocab_size)

In [12]:
from keras.layers import  CuDNNLSTM 
test_model = Sequential()

# Creating encoder network
#mask_zero s set to True because Masking is not supported for CuDNN RNNs.
#For each timestep in the input tensor (dimension #1 in the tensor), 
#if all values in the input tensor at that timestep are equal to mask_value, 
#then the timestep will be masked (skipped) in all downstream layers (as long as they support masking).
test_model.add(Embedding(X_vocab_size, 50, input_length=X_max_len, mask_zero=False)) #Dimension of the dense embedding :1000
test_model.add(CuDNNLSTM(hidden_size))
test_model.add(RepeatVector(y_max_len))
# Creating decoder network
test_model.add(CuDNNLSTM(hidden_size, return_sequences=True))
test_model.add(Dropout(0.5))
test_model.add(TimeDistributed(Dense(y_vocab_size)))
test_model.add(Activation('softmax'))
test_model.compile(loss='categorical_crossentropy',
            optimizer=RMSprop(lr=learning_rate),
            metrics=['accuracy'])

In [14]:
test_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 50)           2387250   
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 500)               1104000   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 150, 500)          0         
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 150, 500)          2004000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 500)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 150, 47973)        24034473  
_________________________________________________________________
activation_1 (Activation)    (None, 150, 47973)        0         
Total para

In [21]:
#CuDNN uses two bias terms, so the number of bias weights is doubled. 
#lstm_20 (LSTM)               (None, 500)               1102000  (refer Models_dry_run_1050ti.ipynb)
#cu_dnnlstm_2 (CuDNNLSTM)     (None, 500)               1104000 
#lstm_21 (LSTM)               (None, 150, 500)          2002000  (refer Models_dry_run_1050ti.ipynb) 
#cu_dnnlstm_3 (CuDNNLSTM)     (None, 150, 500)          2004000   

In [15]:
#InternalError (see above for traceback): Failed to call ThenRnnForward -restart kernal...this happens when you run the create model 
#part more than once
#Multiprocessing=True will not work on Windows
#Since Windows has no fork, the multiprocessing module starts a new Python process and imports the calling module.
#Without __name__ == __main__ idiom the new process 
#(started by keras' fit_generator()) will call your all code again and will start another process, 
#and so on ad infinitum (until OOM).
history = test_model.fit_generator(generator=train_data_generator,epochs=num_epochs,
                    use_multiprocessing=False)

Epoch 1/10


ResourceExhaustedError: OOM when allocating tensor with shape[32,150,47973]
	 [[Node: activation_1/sub = Sub[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](time_distributed_1/Reshape_1, activation_1/Max)]]
	 [[Node: metrics/acc/Mean/_109 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1491_metrics/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'activation_1/sub', defined at:
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\vinit\AppData\Roaming\Python\Python36\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\vinit\AppData\Roaming\Python\Python36\site-packages\ipykernel\kernelapp.py", line 486, in start
    self.io_loop.start()
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\vinit\AppData\Roaming\Python\Python36\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\vinit\AppData\Roaming\Python\Python36\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\vinit\AppData\Roaming\Python\Python36\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\vinit\AppData\Roaming\Python\Python36\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\vinit\AppData\Roaming\Python\Python36\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-55e3e1d64dde>", line 16, in <module>
    test_model.add(Activation('softmax'))
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\keras\models.py", line 492, in add
    output_tensor = layer(self.outputs[0])
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\keras\engine\topology.py", line 617, in __call__
    output = self.call(inputs, **kwargs)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\keras\layers\core.py", line 303, in call
    return self.activation(inputs)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\keras\activations.py", line 31, in softmax
    e = K.exp(x - K.max(x, axis=axis, keepdims=True))
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\math_ops.py", line 894, in binary_op_wrapper
    return func(x, y, name=name)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 4635, in _sub
    "Sub", x=x, y=y, name=name)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\ops.py", line 2956, in create_op
    op_def=op_def)
  File "C:\Users\vinit\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[32,150,47973]
	 [[Node: activation_1/sub = Sub[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](time_distributed_1/Reshape_1, activation_1/Max)]]
	 [[Node: metrics/acc/Mean/_109 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1491_metrics/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [None]:
#this is throwing OOM error on 1090Ti (4GB), lets try on K80 in IBM Watson studio