# Big Data Content Analytics - AUEB

## Introduction to Recurrent Neural Networks

* Lab Assistant: George Perakis
* Email: gperakis[at]aueb.gr | perakisgeorgios[at]gmail.com

<img src="https://i.stack.imgur.com/aTDpS.png">

<img src="http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-f.png">

<img src="http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-i.png">

<img src="http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-C.png">

<img src="http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-o.png">

<img src="https://gblobscdn.gitbook.com/assets%2F-LvMRntv-nKvtl7WOpCz%2F-LvMRp9FltcwEeVxPYFs%2F-LvMRquVdf276Mkd6Lkb%2FRNN_Unrolling.png?alt=media">

### Import Modules

In [None]:
from __future__ import print_function

import numpy as np

np.random.seed(1337)  # for reproducibility

from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Dense, Embedding, LSTM, Input, Concatenate, Bidirectional, concatenate
from tensorflow.python.keras.datasets import imdb

### Build Models functions

In [None]:
def build_model(max_len: int,
                max_feats: int,
                emb_dimensions: int,
                n_outputs: int = 1):
    """
    Build and compile a two-directional LSTM classifier from two explicit LSTM layers.

    The integer-encoded tokens are embedded, read once by a forward LSTM and once by a
    backward LSTM (64 units each), and the two final outputs are concatenated before the
    classification layer.

    :param max_len: length of every input sequence fed to the model.
    :param max_feats: vocabulary size, i.e. the input dimension of the embedding layer.
    :param emb_dimensions: size of each embedding vector.
    :param n_outputs: number of output units. ``1`` builds a sigmoid head compiled with
        binary cross-entropy; any other value builds a softmax head compiled with
        categorical cross-entropy.
    :return: the compiled Keras ``Model``.
    """
    # Placeholder tensor for the input sequences. It is called `tokens` rather than
    # `sequence` so it does not shadow the `sequence` module imported by the notebook.
    tokens = Input(shape=(max_len,), dtype='int32')

    # Map every word index to a dense vector of size `emb_dimensions`.
    emb_layer = Embedding(max_feats, emb_dimensions, input_length=max_len)
    embedded = emb_layer(tokens)

    # One LSTM reads the sequence left-to-right, the other right-to-left.
    forwards = LSTM(64)(embedded)
    backwards = LSTM(64, go_backwards=True)(embedded)

    # Concatenate the outputs of the 2 LSTMs along the feature axis.
    merged = concatenate([forwards, backwards], axis=-1)

    # Optional experiment: apply Dropout(0.5) to `merged` before the output layer
    # (Dropout would have to be imported first).

    # The head and the loss are the only things that differ between the binary and
    # the multi-class configuration.
    if n_outputs == 1:
        activation, loss = 'sigmoid', 'binary_crossentropy'
    else:
        activation, loss = 'softmax', 'categorical_crossentropy'

    output = Dense(n_outputs, activation=activation)(merged)
    model = Model(inputs=[tokens], outputs=[output])

    # try using different optimizers and different optimizer configs
    model.compile('adam', loss, metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in print()
    # emitted an extra "None" line after the table.
    model.summary()
    return model

In [None]:
def build_model_2(max_len: int,
                  max_feats: int,
                  emb_dimensions: int,
                  n_outputs: int = 1):
    """
    Build and compile a bidirectional LSTM classifier using the ``Bidirectional`` wrapper.

    The integer-encoded tokens are embedded and passed through a single
    ``Bidirectional(LSTM(64))`` layer whose output feeds the classification layer.

    :param max_len: length of every input sequence fed to the model.
    :param max_feats: vocabulary size, i.e. the input dimension of the embedding layer.
    :param emb_dimensions: size of each embedding vector.
    :param n_outputs: number of output units. ``1`` builds a sigmoid head compiled with
        binary cross-entropy; any other value builds a softmax head compiled with
        categorical cross-entropy.
    :return: the compiled Keras ``Model``.
    """
    # Placeholder tensor for the input sequences. It is called `tokens` rather than
    # `sequence` so it does not shadow the `sequence` module imported by the notebook.
    tokens = Input(shape=(max_len,), dtype='int32')

    # Map every word index to a dense vector of size `emb_dimensions`.
    emb_layer = Embedding(max_feats, emb_dimensions, input_length=max_len)
    embedded = emb_layer(tokens)

    # return_sequences=False: only the final output of each direction is kept.
    lstm = Bidirectional(LSTM(64, return_sequences=False))(embedded)

    # Optional experiment: apply Dropout(0.5) to `lstm` before the output layer
    # (Dropout would have to be imported first).

    # The head and the loss are the only things that differ between the binary and
    # the multi-class configuration.
    if n_outputs == 1:
        activation, loss = 'sigmoid', 'binary_crossentropy'
    else:
        activation, loss = 'softmax', 'categorical_crossentropy'

    output = Dense(n_outputs, activation=activation)(lstm)
    model = Model(inputs=[tokens], outputs=[output])

    # try using different optimizers and different optimizer configs
    model.compile('adam', loss, metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in print()
    # emitted an extra "None" line after the table.
    model.summary()
    return model

### Load Data

In [None]:
# IMDB movie reviews labeled by sentiment (positive/negative).
# The reviews come preprocessed: each one is a sequence of word indexes (integers).
# `num_words` restricts the vocabulary to the `max_features` most common words.
max_features = 20000

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

print(f'{len(x_train)} train sequences')
print(f'{len(x_test)} test sequences')

Loading data...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])


25000 train sequences
25000 test sequences


  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [None]:
# Show the first training review as stored in the dataset: a list of word indexes
print(x_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


### Set Hyper-parameters

In [None]:
# Every review is padded/truncated to this many tokens
# (tokens come from the top `max_features` most common words)
maxlen = 100

batch_size = 128  # samples per batch, used for both training and evaluation
emb_dim = 100  # size of each word-embedding vector

print(f'Max-Length: {maxlen}')

Max-Length: 100


In [None]:
print("Pad sequences (samples x time)")

# Bring every review to exactly `maxlen` tokens. pad_sequences defaults to
# padding='pre' and truncating='pre': shorter reviews are left-padded with zeros
# and longer reviews keep only their LAST `maxlen` tokens.
x_train, x_test = (
    sequence.pad_sequences(reviews, maxlen=maxlen)
    for reviews in (x_train, x_test)
)

print(f'X_train shape: {x_train.shape}')
print(f'X_test shape: {x_test.shape}')

Pad sequences (samples x time)
X_train shape: (25000, 100)
X_test shape: (25000, 100)


In [None]:
# Make sure both label vectors are NumPy arrays before they are handed to Keras
y_train, y_test = (np.array(labels) for labels in (y_train, y_test))

In [None]:
# Inspect the training labels (one 0/1 sentiment label per review)
y_train

array([1, 0, 0, ..., 0, 1, 0])

### Build Model

In [None]:
# Build the Bidirectional-wrapper variant; n_outputs keeps its default of 1,
# i.e. a single sigmoid unit compiled with binary cross-entropy
rnn_model = build_model_2(max_len=maxlen, max_feats=max_features, emb_dimensions=emb_dim)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 100)          2000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               84480     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 2,084,609
Trainable params: 2,084,609
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
print('Train...')

# NOTE(review): `train_samples` is not used by any visible cell. With 25,000 training
# reviews, validation_split=0.20 below already leaves 20,000 samples for fitting —
# confirm whether this variable can be removed.
train_samples = 20_000

# Train the Bidirectional LSTM on the IMDB sentiment classification task.
# validation_split holds out a fraction of x_train/y_train (Keras takes it from the
# end of the arrays, before shuffling) and reports validation metrics on it each epoch.
rnn_model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=5,
    validation_split=0.20
)


# Reference numbers noted by the author: accuracy after 5 epochs ~0.83,
# time per epoch ~120sec.
# NOTE(review): the original note said "on CPU" for the accuracy and "on GPU (Colab)"
# for the timing — confirm which hardware these numbers were measured on.


Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f390b965490>

### Model Evaluation

In [None]:
# Score the trained model on the held-out test split. The model was compiled with
# metrics=['accuracy'], so `score` holds the loss first and the accuracy second.
score = rnn_model.evaluate(x=x_test, y=y_test, batch_size=batch_size, verbose=1)

print(f'\nTest crossentropy: {score[0]}')
print(f'\nTest accuracy: {score[1]}')


Test crossentropy: 0.682769775390625

Test accuracy: 0.8162400126457214
