In [None]:
    # Adapted from https://github.com/rasbt/LLMs-from-scratch/blob/main/ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb

# Understanding the Difference Between Embedding Layers and Linear Layers
Embedding layers in TensorFlow/PyTorch accomplish the same thing as linear layers that perform matrix multiplications on one-hot encoded inputs; the reason we use embedding layers is computational efficiency.
We will take a look at this relationship step by step using code examples.



In [3]:
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf

In [4]:
# Suppose we have the following 3 training examples,
# which may represent token IDs in an LLM context
idx = tf.constant([2, 3, 1])

# The number of rows in the embedding matrix is the largest token ID + 1.
# If the highest token ID is 3, then we want 4 rows, for the possible
# token IDs 0, 1, 2, 3.
# tf.reduce_max returns a scalar tensor; convert it to a plain Python int
# because num_idx is used below as a size (Embedding input_dim, one-hot depth).
num_idx = int(tf.reduce_max(idx)) + 1

# The desired embedding dimension is a hyperparameter
out_dim = 5

In [7]:
tf.random.set_seed(123)

# Hand-picked weights: row i is filled with the constant 0.11 * i, so the row
# returned by a lookup immediately reveals which token ID selected it.
fake_weights = np.array(
    [
        [0.00, 0.00, 0.00, 0.00, 0.00],  # token ID 0
        [0.11, 0.11, 0.11, 0.11, 0.11],  # token ID 1
        [0.22, 0.22, 0.22, 0.22, 0.22],  # token ID 2
        [0.33, 0.33, 0.33, 0.33, 0.33],  # token ID 3
    ]
)

# Embedding API reference:
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding
embedding = tf.keras.layers.Embedding(
    input_dim=num_idx,
    output_dim=out_dim,
    weights=[fake_weights],
    name='my_embedding',
)

# NOTE(review): wrapping the layer in a Sequential model presumably triggers
# the layer build so that get_weights() is populated -- confirm.
model = tf.keras.Sequential([embedding])
model.compile()

print("embedding weights:\n", embedding.get_weights())

embedding weights:
 [array([[0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.11, 0.11, 0.11, 0.11, 0.11],
       [0.22, 0.22, 0.22, 0.22, 0.22],
       [0.33, 0.33, 0.33, 0.33, 0.33]], dtype=float32)]


In [8]:
# Look up the embedding vectors for the token IDs in idx with the regular
# Embedding layer; each output row is the weight-matrix row for that token ID.
embedding(idx)


<tf.Tensor: shape=(3, 5), dtype=float32, numpy=
array([[0.22, 0.22, 0.22, 0.22, 0.22],
       [0.33, 0.33, 0.33, 0.33, 0.33],
       [0.11, 0.11, 0.11, 0.11, 0.11]], dtype=float32)>

# Using a Linear (Dense) Layer
Now, we will demonstrate that the embedding layer above accomplishes exactly the same thing as a Dense layer (without a bias term) applied to a one-hot encoded representation of the token IDs in TensorFlow.
First, let's convert the token IDs into a one-hot representation:

In [9]:
# One row per token ID in idx; each row has num_idx columns and holds a
# single 1 at the column given by the token ID.
onehot = tf.one_hot(indices=idx, depth=num_idx)
print("onehot:\n", onehot)

onehot:
 tf.Tensor(
[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]], shape=(3, 4), dtype=float32)


In [10]:
# Load the same weights into the Dense layer.
# The layer is built for inputs whose last dimension equals the number of rows
# in fake_weights (the one-hot width); the batch dimension is left as None,
# since a Keras input shape describes a single example, not the batch.
# The kernel is then assigned explicitly with set_weights() instead of the
# legacy `weights=` / `input_shape=` constructor arguments.
linear = tf.keras.layers.Dense(name='my_dense', units=out_dim, use_bias=False)
linear.build((None, fake_weights.shape[0]))
linear.set_weights([fake_weights])
model = tf.keras.Sequential([linear])
model.compile()
linear.get_weights()

[array([[0.  , 0.  , 0.  , 0.  , 0.  ],
        [0.11, 0.11, 0.11, 0.11, 0.11],
        [0.22, 0.22, 0.22, 0.22, 0.22],
        [0.33, 0.33, 0.33, 0.33, 0.33]], dtype=float32)]

In [11]:
# Multiplying the one-hot rows by the Dense kernel (no bias) yields the same
# vectors as the embedding lookup above:
linear(onehot)

<tf.Tensor: shape=(3, 5), dtype=float32, numpy=
array([[0.22, 0.22, 0.22, 0.22, 0.22],
       [0.33, 0.33, 0.33, 0.33, 0.33],
       [0.11, 0.11, 0.11, 0.11, 0.11]], dtype=float32)>

- Since all but one entry in each one-hot encoded row are 0 (by design), this matrix multiplication is essentially the same as looking up the row of the weight matrix selected by the position of the 1.
- Using matrix multiplication on one-hot encodings is therefore equivalent to the embedding layer look-up, but it can be inefficient for large embedding matrices because it performs many wasteful multiplications by zero.