## Using PyTorch to code self-attention in a Transformer

In [43]:
import torch
# Ask PyTorch whether CUDA is available; as the last expression of the cell,
# the returned bool is what the notebook displays.
torch.cuda.is_available()

True

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class Transformer_SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Three bias-free linear layers project the input into queries, keys and
    values. The result is ``softmax(Q @ K^T / sqrt(d_k)) @ V``, where ``d_k``
    is the size of the keys along ``column_dimension``.

    Args:
        dimentiona_model: Number of features per token; used as both the
            input and the output width of every projection layer.
        row_dimension: Axis index treated as "rows". It is swapped with
            ``column_dimension`` when the keys are transposed.
        column_dimension: Axis index treated as "columns". It is the axis
            the softmax normalises over and the axis whose size sets the
            scaling factor.

    Note:
        The default axis indices (0 and 1) presumably target an unbatched
        2-D input of shape (tokens, features) -- confirm before feeding
        batched tensors.
    """

    def __init__(self, dimentiona_model=2, row_dimension=0, column_dimension=1):
        super().__init__()
        width = dimentiona_model
        # Keep the creation order query -> key -> value: under a fixed seed
        # it decides which random draws initialise each layer.
        self.weight_query = nn.Linear(width, width, bias=False)
        self.weight_key = nn.Linear(width, width, bias=False)
        self.weight_value = nn.Linear(width, width, bias=False)
        self.row_dimension = row_dimension
        self.column_dimension = column_dimension

    def forward(self, token_encodings):
        """Return the attention output for ``token_encodings``.

        Args:
            token_encodings: Tensor whose last axis has ``dimentiona_model``
                features (required by the linear projections).

        Returns:
            The attention-weighted combination of the value vectors.
        """
        rows, cols = self.row_dimension, self.column_dimension

        queries = self.weight_query(token_encodings)
        keys = self.weight_key(token_encodings)
        values = self.weight_value(token_encodings)

        # Dot product of every query with every key.
        raw_scores = queries @ keys.transpose(rows, cols)

        # Divide by sqrt(d_k), with d_k read from the keys' column axis.
        scale = torch.tensor(keys.size(cols) ** 0.5)

        # Softmax along the column axis: with the defaults each row of the
        # score matrix is turned into weights that sum to 1.
        weights = F.softmax(raw_scores / scale, dim=cols)

        return weights @ values


In [25]:
# Toy input: a 3 x 2 matrix, i.e. three encodings of two features each.
encoding_matrix = torch.tensor(
    [
        [1.16, 0.23],
        [0.57, 1.36],
        [4.41, -2.16],
    ]
)

# Seed the RNG *before* building the module so that the randomly initialised
# projection weights are reproducible from run to run.
torch.manual_seed(42)

# dimentiona_model=2 matches the two features per row of encoding_matrix.
tsa = Transformer_SelfAttention(
    dimentiona_model=2,
    row_dimension=0,
    column_dimension=1,
)

# Run self-attention; as the cell's last expression the result is displayed.
tsa(encoding_matrix)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [42]:
# Show the three projection matrices that produce the queries, keys and values.
# nn.Linear stores its weight as (out_features, in_features) and applies
# x @ weight.T, so each weight is transposed to show the matrix that actually
# right-multiplies the input. The first label, "weight", is the query matrix.
print(f"""Weight matrix values for queries, keys and values are as follows:
      weight:\n {tsa.weight_query.weight.transpose(0,1)}
      key:\n {tsa.weight_key.weight.transpose(0,1)}
      value:\n {tsa.weight_value.weight.transpose(0,1)}\n""")

# Show the projected queries, keys and values for the toy input.
print(f"""Calculate Queries, Keys and Values are as follows:
      Queries:\n {tsa.weight_query(encoding_matrix)}
      Keys:\n {tsa.weight_key(encoding_matrix)}
      Values:\n {tsa.weight_value(encoding_matrix)}\n""")

# Reproduce the forward pass by hand, one step at a time.
# Step 1: project the input into queries, keys and values.
q, k, v = (
    projection(encoding_matrix)
    for projection in (tsa.weight_query, tsa.weight_key, tsa.weight_value)
)

# Step 2: similarity of every query with every key (Q @ K^T).
similarity_scores = q @ k.transpose(0, 1)

# Step 3: scale by sqrt(d_k), where d_k is the number of key columns.
scaling_factor = torch.tensor(k.shape[1] ** 0.5, dtype=torch.float32)
scaled_similarity_scores = similarity_scores / scaling_factor

# Step 4: softmax over dim=1 so that each row of weights sums to 1.
attention_percents = torch.softmax(scaled_similarity_scores, dim=1)

# Step 5: weight the values by the attention percentages.
attention_scores = attention_percents @ v

# Print every intermediate result.
print(f"""Similarity, Scaled Similarity, Attention Percents, and Attention Scores are as follows:

Similarity Scores:
{similarity_scores}

Scaled Similarity Scores:
{scaled_similarity_scores}

Attention Percents:
{attention_percents}

Attention Scores:
{attention_scores}""")



Weight matrix values for queries, keys and values are as follows:
      weight:
 tensor([[ 0.5406, -0.1657],
        [ 0.5869,  0.6496]], grad_fn=<TransposeBackward0>)
      key:
 tensor([[-0.1549, -0.3443],
        [ 0.1427,  0.4153]], grad_fn=<TransposeBackward0>)
      value:
 tensor([[ 0.6233,  0.6146],
        [-0.5188,  0.1323]], grad_fn=<TransposeBackward0>)

Calculate Queries, Keys and Values are as follows:
      Queries:
 tensor([[ 0.7621, -0.0428],
        [ 1.1063,  0.7890],
        [ 1.1164, -2.1336]], grad_fn=<MmBackward0>)
      Keys:
 tensor([[-0.1469, -0.3038],
        [ 0.1057,  0.3685],
        [-0.9914, -2.4152]], grad_fn=<MmBackward0>)
      Values:
 tensor([[ 0.6038,  0.7434],
        [-0.3502,  0.5303],
        [ 3.8695,  2.4246]], grad_fn=<MmBackward0>)

Similarity, Scaled Similarity, Attention Percents, and Attention Scores are as follows:

Similarity Scores:
tensor([[-0.0990,  0.0648, -0.6523],
        [-0.4022,  0.4078, -3.0024],
        [ 0.4842, -0.6683,  4