## Setup

In [2]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

# Turn off the tensorflow_datasets progress bars so cell output stays compact.
tfds.disable_progress_bar()




In [3]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  """Plot a training metric and, when recorded, its validation counterpart.

  Draws onto the current matplotlib axes, so a caller can select a subplot
  before calling this function.

  Args:
    history: Object exposing a `history` dict that maps metric names to
      per-epoch values (for example the value returned by `model.fit`).
    metric: Name of the training metric to plot, e.g. 'accuracy' or 'loss'.

  Raises:
    KeyError: If `metric` is not a key of `history.history`.
  """
  val_metric = 'val_' + metric
  plt.plot(history.history[metric])
  legend_labels = [metric]
  # The 'val_' series is only recorded when validation data was supplied;
  # skip that curve instead of failing with a KeyError when it is absent.
  if val_metric in history.history:
    plt.plot(history.history[val_metric], '')
    legend_labels.append(val_metric)
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend(legend_labels)

## Setup Pipeline

In [4]:
# Load the IMDB reviews as (text, label) pairs together with the dataset metadata.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)

train_dataset = dataset['train']
test_dataset = dataset['test']

# Display the structure of one training element.
train_dataset.element_spec

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\yasme\tensorflow_datasets\imdb_reviews\plain_text\1.0.0...[0m
[1mDataset imdb_reviews downloaded and prepared to C:\Users\yasme\tensorflow_datasets\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.[0m


(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [5]:
# `dataset`, `info`, `train_dataset` and `test_dataset` are already defined by
# the preceding load cell, so calling tfds.load again here would only repeat
# that work. Just inspect the structure of one training element.
train_dataset.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [6]:
# Peek at one (text, label) pair from the training split.
for example, label in train_dataset.take(1):
    review_value = example.numpy()
    label_value = label.numpy()
    print('text: ', review_value)
    print('label: ', label_value)

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


In [7]:
# Size passed to Dataset.shuffle() when building the training pipeline.
BUFFER_SIZE = 10_000
# Number of examples grouped into each batch.
BATCH_SIZE = 64

In [8]:
# Build the input pipelines from the raw splits in `dataset` rather than from
# the current values of `train_dataset` / `test_dataset`, so re-running this
# cell does not stack a second shuffle/batch on top of already-batched data.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    dataset['test']
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [9]:
# Show the first three texts and labels of one training batch.
for example, label in train_dataset.take(1):
    batch_texts = example.numpy()
    batch_labels = label.numpy()
    print('texts: ', batch_texts[:3])
    print()
    print('labels: ', batch_labels[:3])

 b'This musical was not quite what I expected, foremost being there weren\'t many scenes between Brando and Sinatra. As it was based on a Damon Runyon story, I expected irony and surprise, of which there was one really good one - when we find that Sinatra\'s gang has used the Salvation Army office for their crap game while Brando was in Havana with Simmons. If course it comes at the right moment too, when Brando brings her back. I really didn\'t expect much from Brando as a singer, but he surprised me. He wasn\'t great but he was just fine in the role. His big number in the sewer, however, with the rest of Sinatra\'s boys was the only place I felt Brando\'s voice was weak. He just didn\'t have the power the grand climax demanded. Overall I found the scenes between Brando and Simmons to be filled with electricity, something I didn\'t think would happen when we first see Simmons by herself, and later when we\'re introduced to Brando in the restaurant with Sinatra trying to pull a fast on

# The Encoder


The encoder is the first layer in a model that is used to convert text into a sequence of token indices.
In natural language processing (NLP), text is typically represented as a sequence of tokens, where each token represents a word or a subword unit.
Token indices are numerical representations of these tokens, which are used by the model to process and understand the text.
The process of converting text into token indices is known as tokenization.
Tokenization is an important step in NLP tasks as it allows the model to work with discrete units of text instead of treating the entire text as a continuous sequence.
The encoder takes the input text and applies tokenization to convert it into a sequence of token indices.
Each token index represents a specific token in the text and is typically a unique integer value.
These token indices are then used as input to the subsequent layers of the model for further processing and analysis.
The encoder can use various tokenization techniques, such as word-based tokenization or subword-based tokenization, depending on the specific requirements of the task.
The choice of tokenization technique can impact the performance of the model, as it determines how the text is divided into tokens and represented as token indices.
Overall, the encoder plays a crucial role in NLP models by converting text into a format that can be effectively processed and analyzed by the subsequent layers.

**After the encoder is an embedding layer**: 
- An embedding layer is a component in a neural network that is used to convert discrete word indices into continuous vector representations.
- A simple way to represent words numerically is with one-hot vectors, where each word is represented by a vector with all zeros except for a single one at the index corresponding to the word's position in the vocabulary.
- However, one-hot vectors are an inefficient input for neural networks because they are high-dimensional and sparse (most of the elements in the vector are zeros), and they carry no information about how similar two words are.
- An embedding layer solves this problem by mapping each word index to a dense vector of continuous values, also known as word embeddings.
- These word embeddings capture semantic relationships between words, allowing the neural network to better understand the meaning of the words in the input sequence.

**An embedding layer stores one vector per word**:
- In the embedding layer, each word in the vocabulary is associated with a unique vector representation.
- These vectors are typically of fixed length and have continuous values.
- The length of the vectors is a hyperparameter that needs to be specified before training the neural network.
- The dimensionality of the word embeddings determines the amount of information that can be captured about each word.

**When called, it converts the sequences of word indices to sequences of vectors**:
- The embedding layer takes as input a sequence of word indices, where each index represents a word in the input sequence.
- It then looks up the corresponding word embeddings for each word index and returns a sequence of vectors.
- This conversion from word indices to vectors is done by performing a table lookup operation, where the word index is used as an index to retrieve the corresponding word embedding from a lookup table.

**These vectors are trainable**:
- The word embeddings in the embedding layer are trainable parameters of the neural network.
- During the training process, the neural network adjusts the values of the word embeddings based on the objective function and the gradients computed during backpropagation.
- This allows the neural network to learn meaningful representations for the words in the input sequence that are useful for the task at hand.

**After training (on enough data), words with similar meanings often have similar vectors**:
- One of the key advantages of using an embedding layer is that it allows words with similar meanings to have similar vector representations.
- This property is learned during the training process, where the neural network is exposed to a large amount of data and learns to associate words that appear in similar contexts with similar vector representations.
- As a result, words that are semantically related or have similar meanings tend to have similar vector representations in the embedding space.
- This can be useful for various natural language processing tasks, such as word similarity measurement, word analogy completion, and sentiment analysis.

**Recurrent Neural Network (RNN)**

- A recurrent neural network (RNN) is a type of artificial neural network that is designed to process sequential data.
- Sequential data refers to data that has a specific order or sequence, such as time series data or sentences in natural language processing.
- RNNs are particularly useful for tasks that involve processing and understanding sequential patterns, such as speech recognition, language translation, and sentiment analysis.

**Processing Sequence Input**

- When an RNN processes sequence input, it does so by iterating through the elements of the sequence one by one.
- Each element of the sequence is considered at a specific timestep, which represents a specific point in the sequence.
- The RNN starts by processing the first element of the sequence at the first timestep, then moves on to the second element at the second timestep, and so on.

**Passing Outputs to Inputs**

- One key characteristic of RNNs is that they pass the outputs from one timestep to their input on the next timestep.
- This means that the output of the RNN at a particular timestep becomes part of the input for the next timestep.
- By doing this, the RNN is able to capture and remember information from previous timesteps, allowing it to learn and understand the sequential patterns in the data.

**Example**

- Let's consider an example of using an RNN for language translation.
- Suppose we have a sentence in English, "I love cats," and we want to translate it into French.
- The RNN would process the sentence one word at a time, starting with the word "I" at the first timestep.
- The output of the RNN at the first timestep would then become part of the input for the second timestep, where the word "love" is processed.
- This process continues until the entire sentence has been processed, and the RNN outputs the translated sentence in French.

**Summary**

- In summary, an RNN is a type of neural network that is designed to process sequential data.
- It does so by iterating through the elements of the sequence one by one at different timesteps.
- The outputs of the RNN at each timestep are passed as inputs to the next timestep, allowing the network to capture and remember information from previous timesteps.

## Create the text encoder

In [10]:
# Upper bound on the number of tokens kept in the encoder's vocabulary.
VOCAB_SIZE = 1000

encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# adapt() only needs the review text, so drop the labels before fitting.
text_only_dataset = train_dataset.map(lambda text, label: text)
encoder.adapt(text_only_dataset)













In [11]:
# Collect the encoder's vocabulary into a NumPy array and show its first 20 entries.
vocabulary_tokens = encoder.get_vocabulary()
vocab = np.asarray(vocabulary_tokens)
vocab[0:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [12]:
# Record the TensorFlow version this notebook was run with.
print(tf.version.VERSION)

2.15.0


## Create the Model
<img src="https://www.tensorflow.org/static/text/tutorials/images/bidirectional.png" alt="A drawing of the information flow in the model">

In [13]:
# Create each layer first, then assemble them into a Sequential pipeline.
vocabulary_size = len(encoder.get_vocabulary())

embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocabulary_size,
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True,
)
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
hidden_layer = tf.keras.layers.Dense(64, activation='relu')
output_layer = tf.keras.layers.Dense(1)

model = tf.keras.Sequential([
    encoder,
    embedding_layer,
    recurrent_layer,
    hidden_layer,
    output_layer,
])

In [14]:
# List, per layer, whether the layer reports support for masking.
masking_flags = [layer.supports_masking for layer in model.layers]
print(masking_flags)

[False, True, True, True, True]


In [15]:
# predict on a sample text without padding.

sample_text = (
    'The movie was cool. The animation and the graphics '
    'were out of this world. I would recommend this movie.'
)
single_text_batch = np.array([sample_text])
predictions = model.predict(single_text_batch)
print(predictions[0])

[-0.00681133]


In [16]:
# predict on a sample text with padding: batching it with a much longer
# text forces the shorter sample to be padded to the longer length.

padding = "the " * 2000
padded_batch = np.array([sample_text, padding])
predictions = model.predict(padded_batch)
# Only the first row (the sample text) is of interest here.
print(predictions[0])

[-0.00681133]


# Train the model

In [None]:
# The final Dense(1) layer has no activation, so the loss works on raw logits.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

model.compile(
    loss=loss_fn,
    optimizer=optimizer,
    metrics=['accuracy'],
)

# validation_steps limits each validation pass to 30 batches of the test set.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)

Epoch 1/10






Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

In [None]:
# Evaluate on the full test set and report both metrics.
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

# Side-by-side learning curves: accuracy on the left, loss on the right.
plt.figure(figsize=(16, 8))

plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(top=1)

plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(bottom=0)


In [None]:
# Run the trained model on the sample review and show the result; previously
# the prediction was computed but never displayed.
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))
# The model was compiled with from_logits=True, so this value is a raw logit:
# values >= 0 lean towards label 1, values < 0 towards label 0.
print(predictions[0])

## Stack two or more LSTM layers

In [None]:
# Two stacked bidirectional LSTMs: the first returns the full sequence so the
# second has a sequence to consume.
stacked_layers = [
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True,
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1),
]
model = tf.keras.Sequential(stacked_layers)

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)

In [None]:
# Evaluate the stacked model on the full test set and report both metrics.
test_loss, test_acc = model.evaluate(test_dataset)

for caption, value in (('Test Loss:', test_loss), ('Test Accuracy:', test_acc)):
    print(caption, value)

In [None]:
# Side-by-side learning curves: accuracy in the left panel, loss in the right.
plt.figure(figsize=(16, 6))
for panel_index, metric_name in enumerate(('accuracy', 'loss'), start=1):
    plt.subplot(1, 2, panel_index)
    plot_graphs(history, metric_name)