# RNN - introduction

In this tutorial we learn how recurrent neural networks (RNNs) work and how they are used for text classification.

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.image as mpimg
import argparse
import datetime
import os
import re

import numpy as np
import tensorflow as tf

from morpho_dataset import MorphoDataset

from keras.callbacks import EarlyStopping

import shutil

# Morpho dataset
- from Deep Learning course by Milan Straka
- https://github.com/ufal/npfl114/tree/past-1920/labs/07

## Define the network

In [None]:
class Network:
    """Sequence tagger built from word embeddings and character-level embeddings.

    The Keras model takes two inputs per batch -- word indices and, for every
    word, a sequence of character indices -- and predicts a tag distribution
    for every word. Index 0 marks padding in both inputs.
    """

    def __init__(self, pdt, args):
        """Build and compile the model and open the summary writer.

        Args:
            pdt: Dataset object; `pdt.train.data[...]` must expose `words`
                (for the FORMS and TAGS factors) and `alphabet` (for FORMS).
            args: Namespace providing `we_dim`, `cle_dim`, `rnn_cell`,
                `rnn_cell_dim` and `logdir`.
        """
        # Vocabulary sizes determine the embedding tables and the output layer.
        num_tags = len(pdt.train.data[pdt.train.TAGS].words)
        num_words = len(pdt.train.data[pdt.train.FORMS].words)
        num_chars = len(pdt.train.data[pdt.train.FORMS].alphabet)

        # The input `word_ids` consists of a batch of sentences, each
        # a sequence of word indices. Padded words have index 0.
        # Words are embedded with dimensionality `args.we_dim`, and
        # `mask_zero=True` makes the padding invisible to later layers.
        word_ids = tf.keras.Input(shape=(None,), dtype='int32')
        we = tf.keras.layers.Embedding(input_dim=num_words, output_dim=args.we_dim, mask_zero=True)(word_ids)

        # The RNN character-level embeddings utilize the input `charseqs`
        # containing a sequence of character indices for every input word.
        # Again, padded characters have index 0.
        charseqs = tf.keras.Input(shape=(None, None, ), dtype='int32')

        # Because cuDNN implementation of RNN does not allow empty sequences,
        # we need to consider only charseqs for valid words.
        valid_words = tf.where(word_ids != 0)
        cle = tf.gather_nd(charseqs, valid_words)

        # Embed the characters with size `args.cle_dim` (masking zero indices)
        # and run a bidirectional GRU of dimension `args.cle_dim` over them.
        # Bidirectional's default merge concatenates the forward and backward
        # results, giving one vector per valid word.
        cle = tf.keras.layers.Embedding(input_dim=num_chars, output_dim=args.cle_dim, mask_zero=True)(cle)

        forward_layer = tf.keras.layers.GRU(args.cle_dim)
        backward_layer = tf.keras.layers.GRU(args.cle_dim, go_backwards=True)

        cle = tf.keras.layers.Bidirectional(forward_layer, backward_layer=backward_layer)(cle)

        # Now we copy cle-s back to the original shape; padded word positions
        # are left as zeros by `scatter_nd`.
        cle = tf.scatter_nd(valid_words, cle, [tf.shape(charseqs)[0], tf.shape(charseqs)[1], cle.shape[-1]])

        # Concatenate the WE and CLE embeddings (in this order).
        # A `tf.keras.layers.Concatenate()` layer preserves masks
        # (contrary to raw methods like tf.concat).
        concat = tf.keras.layers.Concatenate()([we, cle])

        # Word-level bidirectional RNN with dimension `args.rnn_cell_dim`,
        # summing the outputs of the forward and backward passes. Any value of
        # `args.rnn_cell` other than 'LSTM' selects a GRU.
        rnn_cls = tf.keras.layers.LSTM if args.rnn_cell == 'LSTM' else tf.keras.layers.GRU
        forward_layer = rnn_cls(args.rnn_cell_dim, return_sequences=True)
        backward_layer = rnn_cls(args.rnn_cell_dim, return_sequences=True, go_backwards=True)
        rnn_layer = tf.keras.layers.Bidirectional(forward_layer, backward_layer=backward_layer, merge_mode='sum')(concat)

        # Softmax classification layer into `num_tags` classes.
        predictions = tf.keras.layers.Dense(num_tags, activation=tf.nn.softmax)(rnn_layer)

        self.model = tf.keras.Model(inputs=[word_ids, charseqs], outputs=predictions)
        self.model.compile(optimizer=tf.optimizers.Adam(),
                           loss=tf.losses.SparseCategoricalCrossentropy(),
                           metrics=[tf.metrics.SparseCategoricalAccuracy(name="accuracy")])

        self._writer = tf.summary.create_file_writer(args.logdir, flush_millis=10 * 1000)

    def train_epoch(self, dataset, args):
        """Train on every batch of `dataset` once.

        Args:
            dataset: Object providing `batches(batch_size)` plus the `FORMS`
                and `TAGS` factor indices.
            args: Namespace providing `batch_size`.
        """
        for batch in dataset.batches(args.batch_size):
            metrics = self.model.train_on_batch(
                [batch[dataset.FORMS].word_ids, batch[dataset.FORMS].charseqs],
                batch[dataset.TAGS].word_ids,
                reset_metrics=True)

            # Generate the summaries each 100 steps
            if self.model.optimizer.iterations % 100 == 0:
                tf.summary.experimental.set_step(self.model.optimizer.iterations)
                with self._writer.as_default():
                    for name, value in zip(self.model.metrics_names, metrics):
                        tf.summary.scalar("train/{}".format(name), value)

    def evaluate(self, dataset, dataset_name, args):
        """Evaluate the model on `dataset` and log the aggregated metrics.

        Args:
            dataset: Object providing `batches(batch_size)` plus the `FORMS`
                and `TAGS` factor indices.
            dataset_name: Prefix for the summary names, e.g. "dev".
            args: Namespace providing `batch_size`.

        Returns:
            Dict mapping metric names to values aggregated over all batches;
            an empty dict when the dataset yields no batches.
        """
        # Reset explicitly rather than trusting the caller to have left the
        # metrics clean; otherwise earlier state would leak into the result.
        self.model.reset_metrics()

        # Default for a dataset without batches, so the code below never
        # touches an unbound name.
        metrics = []
        for batch in dataset.batches(args.batch_size):
            # `reset_metrics=False` accumulates across batches, so the values
            # returned for the last batch cover the whole dataset.
            metrics = self.model.test_on_batch(
                [batch[dataset.FORMS].word_ids, batch[dataset.FORMS].charseqs],
                batch[dataset.TAGS].word_ids,
                reset_metrics=False)
        self.model.reset_metrics()

        metrics = dict(zip(self.model.metrics_names, metrics))
        with self._writer.as_default():
            tf.summary.experimental.set_step(self.model.optimizer.iterations)
            for name, value in metrics.items():
                tf.summary.scalar("{}/{}".format(dataset_name, name), value)

        return metrics

## Define hyperparameters

In [None]:
# Hyperparameters are declared as command-line options so that the same code
# works both as a script and inside a notebook.
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", type=int, default=128, help="Batch size.")
parser.add_argument("--epochs", type=int, default=100, help="Number of epochs.")
parser.add_argument("--we_dim", type=int, default=2, help="Word embedding dimension.")
parser.add_argument("--rnn_cell", type=str, default="LSTM", help="RNN cell type.")
parser.add_argument("--rnn_cell_dim", type=int, default=16, help="RNN cell dimension.")
parser.add_argument("--cle_dim", type=int, default=16, help="CLE embedding dimension.")
parser.add_argument("--seed", type=int, default=42, help="Random seed.")
parser.add_argument("--threads", type=int, default=8, help="Maximum number of threads to use.")
parser.add_argument("--verbose", action="store_true", default=False, help="Verbose TF logging.")

# When `__file__` is defined the real command line is parsed; otherwise
# (e.g. in a notebook) an empty argument list is used so only defaults apply.
if "__file__" in globals():
    args = parser.parse_args()
else:
    args = parser.parse_args([])

In [None]:
# Fix the NumPy and TensorFlow random seeds from `args.seed`, and cap both
# TensorFlow thread pools (inter-op and intra-op) at `args.threads`.
# NOTE(review): the threading limits presumably have to be set before
# TensorFlow executes any operation -- confirm if this cell is re-run later.
np.random.seed(args.seed)
tf.random.set_seed(args.seed)
tf.config.threading.set_inter_op_parallelism_threads(args.threads)
tf.config.threading.set_intra_op_parallelism_threads(args.threads)

In [None]:
# Report only errors by default
if not args.verbose:
    # NOTE: TensorFlow has already been imported at this point, so the native
    # (C++) log level may be read too late for this variable to take effect.
    # It is kept for the cases where it still applies.
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    # The Python-side TensorFlow logger can be adjusted after import.
    tf.get_logger().setLevel("ERROR")

In [None]:
# Load the "czech_pdt" dataset through the course-provided MorphoDataset
# class; the result exposes the `train` and `dev` splits used below.
morpho = MorphoDataset("czech_pdt")

In [None]:
# Create logdir name: logs/<script or "notebook">-<timestamp>-<k=v,...>, where
# every hyperparameter name is abbreviated to the first letter of each
# underscore-separated part (e.g. `rnn_cell_dim` -> `rcd`).
args.logdir = os.path.join("logs", "{}-{}-{}".format(
    os.path.basename(globals().get("__file__", "notebook")),
    datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
    ",".join(
        "{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
        for key, value in sorted(vars(args).items())
        # Skip a `logdir` left over from an earlier run of this cell, so the
        # previous path (with its separators) is not folded into the new name.
        if key != "logdir"
    ),
))

## Check the data

In [None]:
# Vocabulary of the factor at index 0 of the training split
# (presumably the word forms, i.e. `morpho.train.FORMS` -- TODO confirm).
(morpho.train.data[0].words)

In [None]:
# Size of that vocabulary; for the FORMS factor, Network uses the same kind of
# length as the word-embedding `input_dim`.
len(morpho.train.data[0].words)

In [None]:
# Character alphabet of the factor at index 0 of the training split.
(morpho.train.data[0].alphabet)

In [None]:
# Size of the alphabet; for the FORMS factor, Network uses the same kind of
# length as the character-embedding `input_dim`.
len(morpho.train.data[0].alphabet)

In [None]:
# Word indices of the factor at index 0
# (presumably one sequence of ids per sentence -- TODO confirm).
morpho.train.data[0].word_ids

In [None]:
# Number of entries in `word_ids`
# (presumably the number of training sentences -- TODO confirm).
len(morpho.train.data[0].word_ids)

In [None]:
# Targets: the TAGS factor in its string form.
morpho.train.data[morpho.train.TAGS].word_strings

In [None]:
# The same targets as integer indices; these are what Network is trained on.
morpho.train.data[morpho.train.TAGS].word_ids

In [None]:
# The LEMMAS factor in its string form (not used by the model defined above).
morpho.train.data[morpho.train.LEMMAS].word_strings

In [None]:
# List every attribute available on a factor object.
dir(morpho.train.data[0])

In [None]:
# Presumably the mapping from word string to word index -- TODO confirm.
morpho.train.data[0].words_map

In [None]:
# Presumably the mapping from character to character index -- TODO confirm.
morpho.train.data[0].alphabet_map

In [None]:
# Character-index sequences of the factor at index 0; batches of this field
# are the second model input in Network.
morpho.train.data[0].charseqs

## Train the model

In [None]:
# Build and compile the tagger using the dataset vocabularies and the
# hyperparameters in `args` (including `args.logdir` for the summaries).
network = Network(morpho, args)  

In [None]:
# Print the layers of the underlying Keras model with their parameter counts.
network.model.summary()

In [None]:
# NOTE: the following line needs Graphviz installed (if you have not done so
# in one of the previous classes):
# 1) follow the instructions at https://graphviz.gitlab.io/download/
# 2) this notebook has been tested with version 8.0.3
# 3) make sure Graphviz is added to the PATH variable (the installer asks
#    about this explicitly), at least for the local user

tf.keras.utils.plot_model(network.model, show_shapes=True, show_layer_names=True)

In [None]:
# Training: one pass over the training split followed by an evaluation on the
# dev split. `range(1)` runs a single epoch; `args.epochs` is not consulted
# here.
for epoch in range(1):
    network.train_epoch(morpho.train, args)
    metrics = network.evaluate(morpho.dev, "dev", args)
    print(metrics)



In [None]:
# One can change the learning rate manually by compiling the model again with
# a new Adam optimizer (learning rate 0.0001); the loss and the metric are the
# same as the ones used in Network.__init__.
# NOTE(review): a freshly created optimizer presumably starts with its own
# state and `iterations` counter, which Network uses as the summary step --
# confirm that restarting it is acceptable.
network.model.compile(optimizer=tf.optimizers.Adam(learning_rate= 0.0001),
                       loss=tf.losses.SparseCategoricalCrossentropy(),
                       metrics=[tf.metrics.SparseCategoricalAccuracy(name="accuracy")]
)

In [None]:
# Then perform more training with the re-compiled model: again a single epoch
# (`range(1)`) followed by an evaluation on the dev split.
for epoch in range(1):
    network.train_epoch(morpho.train, args)
    metrics = network.evaluate(morpho.dev, "dev", args)
    print(metrics)


# Large Movie Review Dataset

<span style="color:red">**TO DO:** Large Movie Review Dataset</span>

- Download the data from https://ai.stanford.edu/~amaas/data/sentiment/ and unpack it into the `Data` folder
- Use an RNN to predict the sentiment of each review
- Use https://www.tensorflow.org/tutorials/keras/text_classification for inspiration

In [None]:
# Remove unlabeled data: the `unsup` folder inside the training directory.

from pathlib import Path
remove_dir = Path('./../Data/aclImdb/train') / 'unsup'

# Only delete when the folder is still there, so that running this cell a
# second time (after the folder is gone) does not raise FileNotFoundError.
if remove_dir.is_dir():
    shutil.rmtree(remove_dir)

In [None]:
# Starter code for reading the reviews. The seed and the batch size are kept
# in variables so they can be tuned in one place.
seed = 42
batch_size = 64

train_dir = Path('./../Data/aclImdb/train')

# 80% of the files under `train_dir` form the training subset; passing the
# seed together with `validation_split` keeps the split reproducible.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    subset='training',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)


In [None]:
# Show the first three reviews of one batch together with their labels.
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor to NumPy once instead of on every iteration.
    reviews = text_batch.numpy()
    labels = label_batch.numpy()
    for i in range(3):
        print("Review", reviews[i])
        print("Label", labels[i])


In [None]:
# Show which class name each of the two integer labels stands for.
class_names = raw_train_ds.class_names
print("Label 0 corresponds to", class_names[0])
print("Label 1 corresponds to", class_names[1])
