In [None]:
# Import our standard libraries.
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns  # for nicer plots
sns.set(style='darkgrid')  # default style
import tensorflow as tf
import pandas as pd

## Understanding the Embeddings Layer

In [None]:
# An Embedding layer maps every integer id to a vector of size output_dim.
#   Input shape:  (batch_size, input_length)
#   Output shape: (batch_size, input_length, output_dim)
embeddings = tf.keras.layers.Embedding(
    input_dim=100,   # size of the feature vocabulary
    output_dim=2,    # length of each embedding vector
    input_length=5,  # number of ids in each input sequence
)

In [None]:
# Look up the embeddings for the input ids [0, 1, 2, 3, 4], supplied as a
# single batch that holds one sequence of five ids.
data = tf.constant([[0, 1, 2, 3, 4]])  # nested list -> shape (1, 5)
embed_data = embeddings(data)
embed_data.numpy()

In [None]:
# Average the embeddings over the sequence axis, leaving one vector per
# input sequence.
average_pooling = tf.keras.layers.GlobalAveragePooling1D()
embed_data_average = average_pooling(embed_data)
embed_data_average.numpy()

## Embeddings for Text

Let's store our small set of movie reviews and their labels in NumPy arrays.

In [None]:
# Each entry pairs a review with its binary label, so the text and the label
# of one example stay next to each other.
labeled_reviews = [
    ('This movie was amazing', 1),
    ('I have seen it 8 times !', 1),
    ('I fell asleep', 0),
    ('I would not recommend it', 0),
    ('It was absolutely awful', 0),
    ('I would watch it again !', 1),
]

# Split the pairs back into two parallel arrays: texts and labels.
review_texts, review_labels = zip(*labeled_reviews)
X_train = np.array(review_texts)
Y_train = np.array(review_labels)


In [None]:
# Sanity check: show the first raw review string.
display(X_train[0])

In [None]:
# Every review is padded or truncated to exactly this many token ids.
max_sequence_length = 6

vectorize_layer = tf.keras.layers.TextVectorization(
    # Maximum vocabulary size; None means no cap.
    max_tokens=None,
    # Standardization applied to the text; None would apply no standardization.
    standardize='lower_and_strip_punctuation',
    # Values can be None (no splitting), "whitespace", or a Callable.
    split="whitespace",
    # Values can be "int", "multi_hot", "count" or "tf_idf".
    output_mode='int',
    # Only valid in "int" mode. If set, the output's time dimension is padded
    # or truncated to exactly output_sequence_length values.
    output_sequence_length=max_sequence_length,
)

# Learn the vocabulary from the training reviews.
vectorize_layer.adapt(X_train)

# Vocabulary index 0 is the padding token ('') and index 1 is the
# out-of-vocabulary token ('[UNK]').
display("--Vocabulary--")
vocabulary = vectorize_layer.get_vocabulary()
for i, token in enumerate(vocabulary):
  display('%d: %s' % (i, token))

In [None]:
# Convert every raw review into a row of integer token ids; because the layer
# was created with output_sequence_length=max_sequence_length, each row is
# padded or truncated to that length.
X_train_vectorized = vectorize_layer(X_train)

display(X_train_vectorized)

In [None]:
# One embedding row is needed for every vocabulary entry, including the
# padding and out-of-vocabulary tokens.
vocab_size = len(vectorize_layer.get_vocabulary())

# Input shape:  (batch_size, input_length)
# Output shape: (batch_size, input_length, output_dim)
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,              # size of the feature vocabulary
    output_dim=3,                      # length of each embedding vector
    input_length=max_sequence_length,  # number of token ids per review
)

# Embed only the first review: one 3-dimensional vector per token position.
first_review_token_ids = X_train_vectorized[0]
first_review_embed_rep = embedding_layer(first_review_token_ids)
display(first_review_embed_rep)

In [None]:
def build_model(embedding_dim=2, hidden_units=8, seed=0):
  """Build and compile a small binary classifier that takes raw review text.

  The layers are, in order: the notebook-level `vectorize_layer` (strings to
  padded token ids), an Embedding layer, average pooling over the sequence
  axis, a ReLU Dense layer, and a single sigmoid output unit.

  This function reads the notebook globals `vectorize_layer`, `vocab_size`
  and `max_sequence_length`, so the cells defining them must run first.

  Args:
    embedding_dim: Length of each token embedding vector. The default of 2
      matches what `plot_embeddings` draws.
    hidden_units: Number of units in the hidden Dense layer.
    seed: Value passed to `tf.random.set_seed` before the layers are created,
      so repeated calls start from the same random state.

  Returns:
    A compiled `tf.keras.Sequential` model using binary cross-entropy loss,
    the Adam optimizer, and the accuracy metric.
  """
  # Reset Keras' global state and the TF random seed so that every call
  # starts from the same point.
  tf.keras.backend.clear_session()
  tf.random.set_seed(seed)

  model = tf.keras.Sequential()
  model.add(vectorize_layer)
  model.add(tf.keras.layers.Embedding(
      input_dim = vocab_size,  # size of feature vocabulary
      output_dim = embedding_dim,  # embedding dimension
      input_length = max_sequence_length  # number of token ids per review
      ))

  # Average over the sequence dimension, so each review is represented by
  # 1 vector of size embedding_dim.
  model.add(tf.keras.layers.GlobalAveragePooling1D())

  # Alternatively, we could concatenate the embedding representations of
  # all tokens in the movie review
  #model.add(tf.keras.layers.Flatten())

  model.add(tf.keras.layers.Dense(
      units=hidden_units,
      activation='relu'))

  # A single sigmoid unit, paired with the binary cross-entropy loss below.
  model.add(tf.keras.layers.Dense(
      units=1,
      activation='sigmoid'))

  model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

  return model

In [None]:
model = build_model()

# Display the model layers.
display(model.layers)
# summary() prints the table itself and returns None, so it is called directly
# rather than wrapped in display(), which would add a stray "None" output.
model.summary()

# get_weights() returns a list of arrays; the embedding matrix is its first
# entry. It is stored under its own name so that the `embeddings` layer
# created earlier in the notebook is not overwritten by a NumPy array.
embedding_weights = model.layers[1].get_weights()[0]
display("Embeddings layer - shape: ", embedding_weights.shape)
display("Embeddings layer - parameter matrix (before training): ", embedding_weights)

In [None]:
def plot_embeddings(model, title=None):
  """Scatter-plot the first two embedding dimensions of every vocabulary token.

  The embedding matrix is read from `model.layers[1]`, and the token labels
  come from the notebook-level `vectorize_layer`, so row i of the matrix is
  labelled with vocabulary entry i.

  Args:
    model: A model whose layer at index 1 holds the embedding matrix as its
      first weight (as produced by `build_model`).
    title: Optional figure title; no title is drawn when None.
  """
  embedding_matrix = model.layers[1].get_weights()[0]
  vocabulary = vectorize_layer.get_vocabulary()

  fig, ax = plt.subplots()
  ax.scatter(embedding_matrix[:, 0], embedding_matrix[:, 1])
  # Only the first two dimensions are used as coordinates, so annotation
  # also works when the embedding has more than two dimensions.
  for token, (x, y) in zip(vocabulary, embedding_matrix[:, :2]):
    # The padding token is the empty string, which would draw an invisible
    # label; show a readable placeholder instead.
    ax.annotate(token or '[PAD]', (x, y))
  ax.set_xlabel('embedding dimension 0')
  ax.set_ylabel('embedding dimension 1')
  if title is not None:
    ax.set_title(title)
  plt.show()

In [None]:
# Plot the embeddings of the freshly built model (in top-to-bottom execution
# order this runs before model.fit is called).
plot_embeddings(model)

In [None]:
# Show the reviews next to the model's sigmoid outputs for them. In
# top-to-bottom execution order this runs before model.fit is called.
display(X_train)
display(model.predict(X_train))

In [None]:
# Train on the raw review strings (the model's first layer vectorizes them)
# against the binary labels, for 15 passes through the training data.
# verbose=0 silences the per-epoch progress output.
history = model.fit(X_train, Y_train, epochs=15, verbose=0)

# Plot the embeddings again to see how training moved them.
plot_embeddings(model)

In [None]:
# Show the reviews next to the model's sigmoid outputs once more. In
# top-to-bottom execution order this runs after model.fit.
display(X_train)
display(model.predict(X_train))