In [None]:
import torch
from torch import nn
from torch.nn import functional as F

In [None]:
def multihead_attention(Q, K, V, num_heads, mask=None):
    """Parameter-free multi-head scaled dot-product attention.

    The feature (last) axis of ``Q``, ``K`` and ``V`` is cut into ``num_heads``
    slices. Each slice attends independently:
    ``softmax(q @ k^T / sqrt(d_head)) @ v``, and the per-head results are
    concatenated back along the feature axis.

    There are no learned weights here, so the concatenated heads already have
    the feature size of ``V`` and no output projection is applied.

    Args:
        Q: Query tensor of shape ``(..., n_queries, d_k)``.
        K: Key tensor of shape ``(..., n_keys, d_k)``.
        V: Value tensor of shape ``(..., n_keys, d_v)``.
        num_heads: Number of heads. When a feature size is not a multiple of
            ``num_heads`` the slices are uneven (the first slices are one
            feature wider), following ``torch.tensor_split``.
        mask: Optional tensor broadcastable to ``(..., n_queries, n_keys)``.
            Positions where ``mask == 0`` receive (numerically) zero weight.
            The same mask is applied to every head.

    Returns:
        Tensor of shape ``(..., n_queries, d_v)``.

    Raises:
        ValueError: If ``num_heads`` is smaller than 1, larger than a feature
            size, or if ``Q`` and ``K`` disagree on their feature size.
    """
    key_dim, value_dim = K.size(-1), V.size(-1)
    if Q.size(-1) != key_dim:
        raise ValueError(
            f"Q and K must share their last dimension, got {Q.size(-1)} and {key_dim}"
        )
    if not 1 <= num_heads <= min(key_dim, value_dim):
        raise ValueError(
            f"num_heads must be between 1 and {min(key_dim, value_dim)}, got {num_heads}"
        )

    # Q and K have the same feature size, so they are cut at identical offsets
    q_heads = torch.tensor_split(Q, num_heads, dim=-1)
    k_heads = torch.tensor_split(K, num_heads, dim=-1)
    v_heads = torch.tensor_split(V, num_heads, dim=-1)

    head_outputs = []
    for q, k, v in zip(q_heads, k_heads, v_heads):
        # Dot product of queries and keys, scaled by the square root of the
        # number of key features this head sees
        scale = k.size(-1) ** 0.5
        similarities = torch.matmul(q, k.transpose(-2, -1)) / scale

        # Apply the mask, if provided
        if mask is not None:
            similarities = similarities.masked_fill(mask == 0, -1e9)

        # Softmax over the key axis turns similarities into weights
        weights = F.softmax(similarities, dim=-1)

        # Weighted sum of this head's value vectors
        head_outputs.append(torch.matmul(weights, v))

    # Concatenating the heads restores the feature size of V
    return torch.cat(head_outputs, dim=-1)

In [None]:
# Problem size: 10 rows of 20 features each, attended with 8 heads
n_rows, n_cols = 10, 20
num_heads = 8

# Draw the query, key and value matrices (in that order) from a standard normal
Q, K, V = (torch.randn(n_rows, n_cols) for _ in range(3))

# Run the attention function on the three matrices
output = multihead_attention(Q, K, V, num_heads=num_heads)

# Show the resulting tensor
print(output)


TypeError: transpose() received an invalid combination of arguments - got (), but expected one of:
 * (int dim0, int dim1)
 * (name dim0, name dim1)


In [None]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Apply one shared linear layer to slices of the input's third axis.

    Despite its name, this module computes no attention scores. ``forward``
    only chunks the input along ``dim=2``, sends every chunk through the same
    ``nn.Linear(input_dim, output_dim)``, concatenates the results along
    ``dim=2`` and applies dropout.

    Args:
        num_heads: Number of chunks requested from ``torch.chunk``.
        input_dim: Feature size each chunk must have to fit the linear layer.
        output_dim: Feature size produced for each chunk.
        dropout_rate: Probability handed to ``nn.Dropout``.
    """

    def __init__(self, num_heads, input_dim, output_dim, dropout_rate=0.0):
        super().__init__()
        # Keep the hyper-parameters around for inspection
        self.num_heads, self.input_dim = num_heads, input_dim
        self.output_dim, self.dropout_rate = output_dim, dropout_rate

        # A single projection shared by every head, followed by dropout
        self.input_linear = nn.Linear(input_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input):
        """Project each chunk of ``input`` and concatenate the results.

        Args:
            input: Tensor with at least three axes; axis 2 is the one that is
                chunked (the notebook's example passes ``(1, 10, 192)``).

        Returns:
            The concatenated per-chunk projections after dropout; with the
            example's 3 heads of 64 features the shape is ``(1, 10, 192)``.
        """
        # Slice axis 2 into heads and run the shared projection on every slice
        projected = tuple(
            self.input_linear(piece)
            for piece in input.chunk(self.num_heads, dim=2)
        )

        # Stitch the heads back together, then regularise with dropout
        return self.dropout(torch.cat(projected, dim=2))

In [None]:
# Random input of shape (1, 10, 192): the last axis holds 3 heads x 64 features
input = torch.randn(1, 10, 3 * 64)

# Layer with 3 heads, 64 input and 64 output features per head, dropout rate 0.1
attention = MultiHeadAttention(num_heads=3, input_dim=64, output_dim=64, dropout_rate=0.1)

# Forward pass of the input through the layer
output = attention(input)

In [None]:
# Size of the layer output computed in the previous cell
output.size()

torch.Size([1, 10, 192])

`embed_size`: refers to the size of the vectors used to represent words

- Why is `self.values` an `nn.Linear`?
    + The linear layer is a learned projection: it lets the self-attention mechanism map the input values to a new representation that is more useful to combine in the attention-weighted sum. (The attention weights themselves are computed from the projected queries and keys.)

To explain what the query, values, and keys represent in the `SelfAttention` code below, let's use the analogy of a recipe book. In this analogy, the query tensor can be thought of as a list of ingredients that a chef is looking for in a recipe. The values tensor can be thought of as the collection of recipes in the recipe book. And the keys tensor can be thought of as the list of ingredients for each recipe in the collection.

When the chef is looking for a recipe, she can use the query tensor to identify the relevant recipes in the values tensor using the keys tensor. This is similar to how the self-attention mechanism uses the query, values, and keys tensors to compute the self-attention weights. The self-attention weights are then used to compute a weighted sum of the values tensor, where the weightings are determined by the similarity between the query (the ingredients the chef is looking for) and the keys (the ingredients in each recipe) associated with each value in the values tensor (the recipes in the recipe book).

In the `SelfAttention` code below, the query tensor is compared against the keys tensor to compute the self-attention weights, and the values tensor is the collection of values that those weights are applied to. Together, these tensors produce the output of the self-attention mechanism, which is a weighted sum of the values tensor.

Questions:
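- **TODO:** What's `head_dim`? Why calculate it this way? (In `SelfAttention` below it is `embed_size // heads`: the embedding is split evenly across the heads, so each head works on a `head_dim`-sized slice.)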

In [None]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention with learned projections.

    Values, keys and queries are each passed through their own
    ``nn.Linear(embed_size, embed_size)``, split into ``heads`` pieces of
    ``head_dim = embed_size // heads`` features, attended per head, merged
    back to ``embed_size`` features and passed through ``fc_out``.

    Args:
        embed_size: Feature size of the input and output vectors.
        heads: Number of attention heads; must divide ``embed_size``.
    """

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        # Each head works on an equal slice of the embedding
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask=None):
        """Attend from ``query`` positions to ``keys``/``values`` positions.

        Args:
            values: Tensor of shape ``(N, value_len, embed_size)``.
            keys: Tensor of shape ``(N, key_len, embed_size)``; ``key_len``
                must equal ``value_len`` for the weighted sum to be defined.
            query: Tensor of shape ``(N, query_len, embed_size)``.
            mask: Optional tensor broadcastable to
                ``(N, heads, query_len, key_len)``. Positions where
                ``mask == 0`` get (numerically) zero attention weight.
                ``None`` (the default) disables masking.

        Returns:
            Tensor of shape ``(N, query_len, embed_size)``.
        """
        # Number of examples in the batch
        N = query.shape[0]

        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values = self.values(values)  # (N, value_len, embed_size)
        keys = self.keys(keys)  # (N, key_len, embed_size)
        queries = self.queries(query)  # (N, query_len, embed_size)

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)

        # Within each example and each head, take the dot product of every
        # query position with every key position (a batched matrix multiply).
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys)
        # queries shape: (N, query_len, heads, head_dim),
        # keys shape: (N, key_len, heads, head_dim)
        # energy: (N, heads, query_len, key_len)

        # Mask out positions so their weights become 0. The most negative
        # value of the tensor's own dtype is used so the fill also works in
        # reduced precision, where -1e20 is not representable.
        if mask is not None:
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)

        # Normalize over the key axis so the weights of each query sum to 1.
        # The logits are scaled by sqrt(head_dim), the number of features in
        # each dot product, rather than by the full embedding size.
        attention = torch.softmax(energy / (self.head_dim ** 0.5), dim=3)
        # attention shape: (N, heads, query_len, key_len)

        out = torch.einsum("nhql,nlhd->nqhd", attention, values).reshape(
            N, query_len, self.heads * self.head_dim
        )
        # attention shape: (N, heads, query_len, key_len)
        # values shape: (N, value_len, heads, head_dim)
        # out after matrix multiply: (N, query_len, heads, head_dim), then
        # we reshape and flatten the last two dimensions.

        out = self.fc_out(out)
        # Linear layer doesn't modify the shape, final shape will be
        # (N, query_len, embed_size)

        return out

In [None]:
# Look up nn.LazyLinear: per its docstring, a Linear whose `in_features` is inferred.
nn.LazyLinear

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mLazyLinear[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mout_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbias[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdevice[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mLazyLinear[0m[0;34m([0m[0mLazyModuleMixin[0m[0;34m,[0m [0mLinear[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""A :class:`torch.nn.Linear` module where `in_features` is inferred.[0m
[0;34m[0m
[0;34m    In this module, the `weight` and `bias` are of :class:`torch.nn.UninitializedParameter`[0m
[0;34m    class. They will be initialized after the first call to ``forward`` is done and the[0m
[0;34m    