In [1]:
import numpy as np

In [2]:
# Define softmax operation that works independently on each column
def softmax_cols(data_in):
    """Apply the softmax function independently to each column of ``data_in``.

    Parameters
    ----------
    data_in : array_like
        Scores to normalise. The softmax is taken along axis 0, i.e. every
        column is treated as a separate set of scores.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``data_in``; the entries of each column
        are non-negative and sum to one.
    """
    data_in = np.asarray(data_in)
    if data_in.size == 0:
        # Nothing to normalise, and np.max would raise on a zero-length axis.
        return np.exp(data_in)

    # Subtract the per-column maximum before exponentiating. Softmax is
    # invariant to adding a constant to a column, and after the shift the
    # largest exponent is exp(0) = 1, so np.exp can no longer overflow to
    # inf (which would turn the result into nan via inf / inf).
    shifted = data_in - np.max(data_in, axis=0, keepdims=True)
    exp_values = np.exp(shifted)

    # keepdims=True leaves the column sums with a length-one leading axis so
    # they broadcast across the rows in the division below.
    denom = np.sum(exp_values, axis=0, keepdims=True)
    return exp_values / denom

In [3]:
 # Now let's compute self attention in matrix form
def self_attention(X, omega_v, omega_q, omega_k, beta_v, beta_q, beta_k):
    """Compute (unscaled) dot-product self attention in matrix form.

    Parameters
    ----------
    X : numpy.ndarray
        Inputs stored as columns (built as a ``(D, N)`` matrix in this
        notebook).
    omega_v, omega_q, omega_k : numpy.ndarray
        Weight matrices that left-multiply ``X`` to give the values, queries
        and keys respectively.
    beta_v, beta_q, beta_k : numpy.ndarray
        Offsets added to the corresponding products (``(D, 1)`` columns in
        this notebook, so they broadcast over the inputs).

    Returns
    -------
    numpy.ndarray
        The values multiplied by the column-wise softmax attention matrix.
    """
    # Linear projections of the inputs.
    Q = np.matmul(omega_q, X) + beta_q
    K = np.matmul(omega_k, X) + beta_k
    V = np.matmul(omega_v, X) + beta_v

    # Dot product of every key with every query, then a softmax down each
    # column so the weights belonging to one query sum to one.
    attention_weights = softmax_cols(np.matmul(K.T, Q))

    # Each output column is the attention-weighted combination of the values.
    return np.matmul(V, attention_weights)

In [4]:
# Fix the seed so the generated inputs are reproducible
np.random.seed(3)
# N inputs, each with D dimensions
N = 3
D = 4
# Draw the inputs x_n one after another, each as a (D, 1) column of
# standard-normal samples
all_x = [np.random.normal(size=(D, 1)) for _ in range(N)]

In [5]:
# Fix the seed so the parameters are reproducible
np.random.seed(0)

# Random parameter values. The draw order (queries, keys, values for the
# weights, then the same for the offsets) determines the numbers produced
# under this seed, so it must not be changed.
omega_q, omega_k, omega_v = (np.random.normal(size=(D, D)) for _ in range(3))
beta_q, beta_k, beta_v = (np.random.normal(size=(D, 1)) for _ in range(3))

In [6]:
# Stack the (D, 1) input columns side by side into one (D, N) matrix.
# Using hstack instead of assigning columns 0, 1, 2 by hand keeps this cell
# correct if N is changed above.
X = np.hstack(all_x)
assert X.shape == (D, N), "expected one column per input"

# Run the self attention mechanism
X_prime = self_attention(X, omega_v, omega_q, omega_k, beta_v, beta_q, beta_k)

# Print out the results
print(X_prime)

[[ 0.94744244  1.64201168  1.61949281]
 [-0.24348429 -0.08470004 -0.06641533]
 [-0.91310441  4.02764044  3.96863308]
 [-0.44522983  2.18690791  2.15858316]]


The attention weights (the softmax outputs computed inside `self_attention`) are quite extreme: one is very close to one and the others are very close to zero. Now we'll fix this problem by using scaled dot-product attention.

In [8]:
# Self attention in matrix form again, now scaling the dot products before the softmax
def scaled_dot_product_self_attention(X, omega_v, omega_q, omega_k, beta_v, beta_q, beta_k):
    """Compute scaled dot-product self attention in matrix form.

    Identical to the unscaled computation except that the key/query dot
    products are divided by the square root of the number of rows of the
    keys before the softmax is applied (equation 12.9).

    Parameters
    ----------
    X : numpy.ndarray
        Inputs stored as columns (built as a ``(D, N)`` matrix in this
        notebook).
    omega_v, omega_q, omega_k : numpy.ndarray
        Weight matrices that left-multiply ``X`` to give the values, queries
        and keys respectively.
    beta_v, beta_q, beta_k : numpy.ndarray
        Offsets added to the corresponding products (``(D, 1)`` columns in
        this notebook, so they broadcast over the inputs).

    Returns
    -------
    numpy.ndarray
        The values multiplied by the column-wise softmax attention matrix.
    """
    # Linear projections of the inputs.
    Q = np.matmul(omega_q, X) + beta_q
    K = np.matmul(omega_k, X) + beta_k
    V = np.matmul(omega_v, X) + beta_v

    # Dot product of every key with every query, divided by the square root
    # of the key dimension as in equation 12.9.
    scale = np.sqrt(K.shape[0])
    scaled_scores = np.matmul(K.T, Q) / scale

    # Softmax down each column so the weights for one query sum to one.
    attention_weights = softmax_cols(scaled_scores)

    # Each output column is the attention-weighted combination of the values.
    return np.matmul(V, attention_weights)

In [9]:
# Apply scaled dot-product self attention to the same inputs and parameters
X_prime = scaled_dot_product_self_attention(
    X,
    omega_v=omega_v,
    omega_q=omega_q,
    omega_k=omega_k,
    beta_v=beta_v,
    beta_q=beta_q,
    beta_k=beta_k,
)

# Show the transformed inputs
print(X_prime)

[[ 0.97411966  1.59622051  1.32638014]
 [-0.23738409 -0.09516106  0.13062402]
 [-0.72333202  3.70194096  3.02371664]
 [-0.34413007  2.01339538  1.6902419 ]]
