<a href="https://colab.research.google.com/github/suvaline/DeepLearningComparison/blob/master/gru_and_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the pinned NumPy first so that no earlier import loads a different version.
!pip install numpy==1.16.1
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
from scipy.spatial.distance import cdist
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, Dropout
from tensorflow.python.keras.optimizers import Adam, Adadelta
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import keras
from keras.utils import to_categorical
from keras.datasets import imdb, reuters



Using TensorFlow backend.


In [0]:
# Experiment configuration shared by the cells below.
epochs = 10          # number of training epochs per model
batch_size = 128     # mini-batch size passed to model.fit
num_words = 10000    # vocabulary cap, passed to load_data(num_words=...) and Embedding(input_dim=...)
dataset = 'reuters'  # supported values: 'imdb' or 'reuters'

# Load the selected corpus. Exactly one branch runs; an unsupported name fails
# here with a clear message instead of leaving x_train/y_train undefined and
# surfacing as a NameError in a later cell.
if dataset == 'imdb':
  (x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=num_words,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=1,
                                                      oov_char=2,
                                                      index_from=3)
elif dataset == 'reuters':
  (x_train, y_train), (x_test, y_test) = reuters.load_data(path="reuters.npz",
                                                         num_words=num_words,
                                                         skip_top=0,
                                                         maxlen=None,
                                                         test_split=0.2,
                                                         seed=113,
                                                         start_char=1,
                                                         oov_char=2,
                                                         index_from=3)
else:
  raise ValueError("Unsupported dataset %r; expected 'imdb' or 'reuters'" % (dataset,))

In [3]:
# Report how many samples each split contains.
train_count, test_count = len(x_train), len(x_test)
print("Train-set size: ", train_count)
print("Test-set size:  ", test_count)

Train-set size:  8982
Test-set size:   2246


In [4]:
# Collect the token count of every sequence used for the length statistics.
if dataset == 'imdb':
  # Keras returns the splits as NumPy arrays of lists, so `x_train + x_test`
  # would add them element-wise (joining sample i of train with sample i of
  # test) rather than chaining the two splits. Walk both splits explicitly.
  num_tokens = [len(tokens) for split in (x_train, x_test) for tokens in split]
if dataset == 'reuters':
  # Only the training split contributes to the statistics here.
  num_tokens = [len(tokens) for tokens in x_train]

  # One-hot encode the integer topic labels for categorical cross-entropy.
  train_binary = to_categorical(y_train)
  val_binary = to_categorical(y_test)
num_tokens = np.array(num_tokens)
print(np.mean(num_tokens))
print(np.max(num_tokens))

145.5398574927633
2376


In [5]:
# Cap the sequence length at mean + 2 standard deviations of the token counts,
# truncated to an integer. The bare name on the last line displays the value.
token_mean = np.mean(num_tokens)
token_std = np.std(num_tokens)
max_tokens = int(token_mean + 2 * token_std)
max_tokens

437

**Padding and Truncating Data**

Pre-padding the data ensures that the padding has no influence on the network's state.

In [6]:
# Show one raw sequence, then the fraction of counted sequences that are
# strictly shorter than max_tokens.
print(x_train[1])
short_fraction = np.sum(num_tokens < max_tokens) / len(num_tokens)
print(short_fraction)

# NOTE(review): the markdown above describes pre-padding, but 'post' is used
# here for both padding and truncation — confirm which one is intended.
pad = 'post'
pad_options = dict(maxlen=max_tokens, padding=pad, truncating=pad)
x_train_pad = pad_sequences(x_train, **pad_options)
x_test_pad = pad_sequences(x_test, **pad_options)
print(x_train_pad[1])

[1, 3267, 699, 3434, 2295, 56, 2, 7511, 9, 56, 3906, 1073, 81, 5, 1198, 57, 366, 737, 132, 20, 4093, 7, 2, 49, 2295, 2, 1037, 3267, 699, 3434, 8, 7, 10, 241, 16, 855, 129, 231, 783, 5, 4, 587, 2295, 2, 2, 775, 7, 48, 34, 191, 44, 35, 1795, 505, 17, 12]
0.9441104431084391
[   1 3267  699 3434 2295   56    2 7511    9   56 3906 1073   81    5
 1198   57  366  737  132   20 4093    7    2   49 2295    2 1037 3267
  699 3434    8    7   10  241   16  855  129  231  783    5    4  587
 2295    2    2  775    7   48   34  191   44   35 1795  505   17   12
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0   

In [0]:

embedding_size = 8
def gru(num_words, embedding_size, max_tokens):
  """Build and compile a fixed three-layer CuDNN GRU binary classifier.

  The stack is: embedding -> GRU(16) -> GRU(8) -> GRU(4) -> 1 sigmoid unit,
  compiled with binary cross-entropy, Adam(lr=1e-3) and the accuracy metric.

  Args:
    num_words: passed to the embedding layer as `input_dim`.
    embedding_size: passed to the embedding layer as `output_dim`.
    max_tokens: passed to the embedding layer as `input_length`.

  Returns:
    The compiled Sequential model.
  """
  stack = [
      Embedding(input_dim=num_words,
                output_dim=embedding_size,
                input_length=max_tokens,
                name='layer_embedding'),
      # The first two recurrent layers emit the full sequence so the next
      # recurrent layer has a sequence to consume; the last one does not.
      tf.keras.layers.CuDNNGRU(units=16, return_sequences=True),
      tf.keras.layers.CuDNNGRU(units=8, return_sequences=True),
      tf.keras.layers.CuDNNGRU(units=4),
      Dense(1, activation='sigmoid'),
  ]
  model = Sequential(stack)
  model.compile(loss='binary_crossentropy',
                optimizer=Adam(lr=1e-3),
                metrics=['accuracy'])
  return model
def gru2(num_words, embedding_size, max_tokens, units, depth):
  """Build and compile a CuDNN GRU classifier of configurable width and depth.

  The stack is an embedding, then `depth` sequence-returning GRU layers, then
  one final GRU layer (so `depth + 1` recurrent layers in total), then a
  dense head chosen from the module-level `dataset`:
    * 'imdb':    1 sigmoid unit, binary cross-entropy, Adam(lr=1e-3)
    * 'reuters': 46 softmax units, categorical cross-entropy, Adadelta()

  Args:
    num_words: passed to the embedding layer as `input_dim`.
    embedding_size: passed to the embedding layer as `output_dim`.
    max_tokens: passed to the embedding layer as `input_length`.
    units: number of units in every GRU layer.
    depth: number of sequence-returning GRU layers before the final one.

  Returns:
    The compiled Sequential model.

  Raises:
    ValueError: if the module-level `dataset` is neither 'imdb' nor
      'reuters' (previously an uncompiled model without a head was returned).
  """
  if dataset not in ('imdb', 'reuters'):
    raise ValueError("Unsupported dataset %r; expected 'imdb' or 'reuters'" % (dataset,))

  model = Sequential()

  model.add(Embedding(input_dim=num_words,
                      output_dim=embedding_size,
                      input_length=max_tokens,
                      name='layer_embedding'))

  # Intermediate layers must emit the whole sequence for the next GRU layer.
  for _ in range(depth):
    model.add(tf.keras.layers.CuDNNGRU(units=units, return_sequences=True))

  # The final recurrent layer emits only its last output for the dense head.
  model.add(tf.keras.layers.CuDNNGRU(units=units))

  if dataset == 'imdb':
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(lr=1e-3)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
  else:  # 'reuters'
    model.add(Dense(46, activation='softmax'))
    optimizer = Adadelta()
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
  return model


In [0]:

embedding_size = 8
def lstm(num_words, embedding_size, max_tokens):
  """Build and compile a fixed three-layer CuDNN LSTM binary classifier.

  The stack is: embedding -> LSTM(15) -> LSTM(8) -> LSTM(4) -> 1 sigmoid unit,
  compiled with binary cross-entropy, Adam(lr=1e-3) and the accuracy metric.

  NOTE(review): the first layer uses 15 units whereas the sibling `gru`
  builder uses 16 — confirm whether the difference is intentional.

  Args:
    num_words: passed to the embedding layer as `input_dim`.
    embedding_size: passed to the embedding layer as `output_dim`.
    max_tokens: passed to the embedding layer as `input_length`.

  Returns:
    The compiled Sequential model.
  """
  stack = [
      Embedding(input_dim=num_words,
                output_dim=embedding_size,
                input_length=max_tokens,
                name='layer_embedding'),
      # The first two recurrent layers emit the full sequence so the next
      # recurrent layer has a sequence to consume; the last one does not.
      tf.keras.layers.CuDNNLSTM(units=15, return_sequences=True),
      tf.keras.layers.CuDNNLSTM(units=8, return_sequences=True),
      tf.keras.layers.CuDNNLSTM(units=4),
      Dense(1, activation='sigmoid'),
  ]
  model = Sequential(stack)
  model.compile(loss='binary_crossentropy',
                optimizer=Adam(lr=1e-3),
                metrics=['accuracy'])
  return model
def lstm2(num_words, embedding_size, max_tokens, units, depth):
  """Build and compile a CuDNN LSTM classifier of configurable width and depth.

  The stack is an embedding, then `depth` sequence-returning LSTM layers, then
  one final LSTM layer (so `depth + 1` recurrent layers in total), then a
  dense head chosen from the module-level `dataset`:
    * 'imdb':    1 sigmoid unit, binary cross-entropy, Adam(lr=1e-3)
    * 'reuters': 46 softmax units, categorical cross-entropy, Adadelta()

  Args:
    num_words: passed to the embedding layer as `input_dim`.
    embedding_size: passed to the embedding layer as `output_dim`.
    max_tokens: passed to the embedding layer as `input_length`.
    units: number of units in every LSTM layer.
    depth: number of sequence-returning LSTM layers before the final one.

  Returns:
    The compiled Sequential model.

  Raises:
    ValueError: if the module-level `dataset` is neither 'imdb' nor
      'reuters' (previously an uncompiled model without a head was returned).
  """
  if dataset not in ('imdb', 'reuters'):
    raise ValueError("Unsupported dataset %r; expected 'imdb' or 'reuters'" % (dataset,))

  model = Sequential()

  model.add(Embedding(input_dim=num_words,
                      output_dim=embedding_size,
                      input_length=max_tokens,
                      name='layer_embedding'))

  # Intermediate layers must emit the whole sequence for the next LSTM layer.
  for _ in range(depth):
    model.add(tf.keras.layers.CuDNNLSTM(units=units, return_sequences=True))

  # The final recurrent layer emits only its last output for the dense head.
  model.add(tf.keras.layers.CuDNNLSTM(units=units))

  if dataset == 'imdb':
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(lr=1e-3)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
  else:  # 'reuters'
    model.add(Dense(46, activation='softmax'))
    optimizer = Adadelta()
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
  return model


In [0]:
def create_2d_array(x,y):
  """Return a list of `x` independent rows, each a list of `y` zeros."""
  # Each row is built separately so that writing to one row never affects another.
  return [[0] * y for _ in range(x)]
def plot(history):
  """Plot loss and accuracy curves (training vs. validation) for one run.

  Draws two separate figures: the first with the 'loss' / 'val_loss' series,
  the second with the accuracy series.

  Args:
    history: object whose `.history` dict holds one list per metric, keyed
      'loss' and 'val_loss' plus either 'acc' / 'val_acc' or
      'accuracy' / 'val_accuracy'.
  """
  metrics = history.history
  # Older Keras releases record accuracy as 'acc'; newer tf.keras releases
  # use 'accuracy'. Accept either so the plot works with both.
  acc_key = 'acc' if 'acc' in metrics else 'accuracy'

  # (history key, training legend, validation legend, y-axis label, title)
  curves = (
      ('loss', 'Training loss', 'Validation Loss', 'Loss', 'Loss Curves'),
      (acc_key, 'Training Accuracy', 'Validation Accuracy', 'Accuracy', 'Accuracy Curves'),
  )
  for key, train_label, val_label, y_label, title in curves:
    plt.figure(figsize=[8,6])
    plt.plot(metrics[key],'r',linewidth=3.0)
    plt.plot(metrics['val_' + key],'b',linewidth=3.0)
    plt.legend([train_label, val_label],fontsize=18)
    plt.xlabel('Epochs ',fontsize=16)
    plt.ylabel(y_label,fontsize=16)
    plt.title(title,fontsize=16)
    plt.show()

def create_heatmaps(rows, columns, array, create_max_heatmap = 'false'):
  """Draw summary heatmaps for a grid of per-epoch metric histories.

  `array` is indexed as array[row][column][epoch]. Two heatmaps are always
  drawn: the mean over the epoch axis and the value of the last epoch. A third
  heatmap with the maximum over the epoch axis is drawn on request. All
  heatmaps use a fixed colour range of 0 to 1.

  Args:
    rows: number of rows of `array` to include in the last-epoch heatmap.
    columns: number of columns of `array` to include in the last-epoch heatmap.
    array: nested sequence of per-epoch values, one history per grid cell.
    create_max_heatmap: pass 'true' (or boolean True) to also draw the
      maximum heatmap; any other value skips it.
  """
  # Accept a real boolean in addition to the legacy string flag.
  show_max = create_max_heatmap is True or create_max_heatmap == 'true'

  print("mean loss heatmap:")
  mean_array = np.mean(array, axis=2)
  sns.heatmap(mean_array, linewidth=0.5,vmin=0, vmax=1, annot=True)
  plt.show()

  # Take each history's own final entry rather than indexing with the global
  # `epochs`, which breaks if the histories were recorded with another length.
  last_loss_array = [[array[i][j][-1] for j in range(columns)] for i in range(rows)]

  print("last result heatmap:")
  sns.heatmap(last_loss_array, linewidth=0.5,vmin=0, vmax=1, annot=True)
  plt.show()

  if show_max:
    print("max heatmap :")
    max_array = np.amax(array, axis=2)
    sns.heatmap(max_array, linewidth=0.5,vmin=0, vmax=1, annot=True)
    plt.show()

def show_heatmaps(arrays, depth, width):
  """Print a heading and draw the heatmaps for each of the four result grids.

  Args:
    arrays: sequence whose first four items are the loss, validation loss,
      accuracy and validation accuracy grids, in that order.
    depth: row count forwarded to create_heatmaps.
    width: column count forwarded to create_heatmaps.
  """
  # Resolve all four grids up front so a too-short `arrays` fails before any output.
  grids = [arrays[position] for position in range(4)]

  # (heading, flag for the extra maximum heatmap); only accuracies get the maximum.
  sections = (
      ("loss heatmaps :", 'false'),
      ("validation loss heatmaps :", 'false'),
      ("accuracy heatmaps :", 'true'),
      ("validation accuracy heatmaps :", 'true'),
  )
  for (heading, max_flag), grid in zip(sections, grids):
    print(heading)
    create_heatmaps(depth, width, grid, create_max_heatmap = max_flag)
def generate_train_and_plot(depth, multiplier, arrays, max_depth, width_multiplier, network_type):
  """Train one model per (depth, width) grid cell and record its history.

  For every depth in [depth, max_depth) and every multiplier in
  [multiplier, width_multiplier) a model with `5 + 20 * multiplier` units is
  built, summarised and trained, and its per-epoch loss, validation loss,
  accuracy and validation accuracy lists are stored at [depth][multiplier] in
  the four grids of `arrays`. The grids are modified in place. Training data,
  `epochs`, `batch_size` and `dataset` are read from module-level variables.

  Args:
    depth: first depth index to train.
    multiplier: first width-multiplier index to train for every depth.
    arrays: list of four grids (loss, val loss, accuracy, val accuracy),
      each indexable as grid[depth][multiplier].
    max_depth: exclusive upper bound for the depth index.
    width_multiplier: exclusive upper bound for the multiplier index.
    network_type: 'GRU' (uses gru2) or 'LSTM' (uses lstm2).

  Returns:
    A new list holding the same four (updated) grids.

  Raises:
    ValueError: if `network_type` or the module-level `dataset` is not
      supported (previously this left `model` / `history` unbound or stale).
  """
  if network_type not in ('GRU', 'LSTM'):
    raise ValueError("Unsupported network_type %r; expected 'GRU' or 'LSTM'" % (network_type,))
  if dataset not in ('imdb', 'reuters'):
    raise ValueError("Unsupported dataset %r; expected 'imdb' or 'reuters'" % (dataset,))
  build_model = gru2 if network_type == 'GRU' else lstm2

  loss_array = arrays[0]
  val_loss_array = arrays[1]
  acc_array = arrays[2]
  val_acc_array = arrays[3]

  for current_depth in range(depth, max_depth):
    for current_multiplier in range(multiplier, width_multiplier):
      model = build_model(num_words, embedding_size, max_tokens,  5 + 20 * current_multiplier, current_depth)
      model.summary()
      if dataset == 'imdb':
        history = model.fit(x_train_pad, y_train, validation_split=0.05, epochs=epochs, batch_size=batch_size)
      else:  # 'reuters'
        history = model.fit(x_train_pad, train_binary, validation_data=(x_test_pad, val_binary), epochs=epochs, batch_size=batch_size)
      # Reset the Keras session so the next model starts from a clean state.
      tf.keras.backend.clear_session()

      recorded = history.history
      # Older Keras releases record accuracy as 'acc'; newer ones use 'accuracy'.
      acc_key = 'acc' if 'acc' in recorded else 'accuracy'

      print("depth :", current_depth, "multiplier", current_multiplier)
      loss_array[current_depth][current_multiplier] = recorded['loss']
      print("loss_array =", loss_array)
      val_loss_array[current_depth][current_multiplier] = recorded['val_loss']
      print("val_loss_array =", val_loss_array)
      acc_array[current_depth][current_multiplier] = recorded[acc_key]
      print("accuracy_array =", acc_array)
      val_acc_array[current_depth][current_multiplier] = recorded['val_' + acc_key]
      print("val_accuracy_array =", val_acc_array)
    print("depth : ", current_depth + 1)
  return [loss_array, val_loss_array, acc_array, val_acc_array]
  

In [10]:
# Sweep configuration: depth indices 0..max_depth-1 are trained (a depth index
# d yields d + 1 recurrent layers) and, for each, width steps
# 0..width_multiplier-1 (step m yields 5 + 20 * m units per layer).
max_depth = 3
width_multiplier = 5
epochs = 10
tf.keras.backend.clear_session()

# One max_depth x width_multiplier grid each for loss, validation loss,
# accuracy and validation accuracy.
arrays = [create_2d_array(max_depth, width_multiplier) for _ in range(4)]
arrays = generate_train_and_plot(0, 0, arrays, max_depth, width_multiplier, 'LSTM')

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 437, 8)            80000     
_________________________________________________________________
cu_dnnlstm (CuDNNLSTM)       (None, 5)                 300       
_________________________________________________________________
dense (Dense)                (None, 46)                276       
Total params: 80,576
Trainable params: 80,576
Non-trainable params: 0
_________________________________________________________________
Train on 8982 samples, validate on 2246 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

KeyboardInterrupt: ignored