In [9]:
import torch
import math
from torch import nn
import torch.nn.functional as F

## scaled dot production-1st time

In [10]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2))/math.sqrt(d_k)
    print(f"scaled.size(): {scaled.size()}")
    if mask is not None:
        scaled += mask
    attention = F.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return attention, values

# scaled dot production- 2nd time

## what is the scaled dot production attention mechanism used for?
### breifly speaking, it's quite important features in transformer to weight the different part of input when generating the output
## why scaled value is sqrt(d_qkv)?
### 1.it's an experimental value, practically, this denominator can make the value to be in the normal ditribution after dot product
### 2.the scaling prevent softmax value from too steep, which can happen due to the large value resulting from the dot product operation

In [11]:
def scaled_dot_production(q, k, v, mask=None):
    d_qkv = q.shape[-1]
    scaled = torch.matmul(q, k.transpose(-1,-2))/torch.sqrt(d_qkv)
    if mask:
        scaled+=mask
    attention = F.softmax(scaled)
    value = torch.matmul(v, attention)
    return attention, value

## MultiHeadAttention

In [12]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model//num_heads
        self.qkv_layer = nn.Linear(d_model, 3*d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        bs, max_sequence_length, d_model = x.size()
        print(f"x.size: {x.size()}")
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(bs, max_sequence_length, self.num_heads, 3*self.head_dim)
        qkv = qkv.permute(0,2,1,3)
        q, k, v = qkv.chunk(3)
        attention, values = scaled_dot_product(q, k, v, mask)
        values = values.reshape(bs, max_sequence_length, self.num_heads*self.head_dim)
        out = self.linear_layer(values)
        return out

# MultiAttention head-2nd time
## why?

In [13]:
class MultiHeadAttention_v2(nn.Module):
    def __init__(self,
                 input_dim,
                 d_model,
                 num_head,
                 ):
        super().__init__()
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_head = num_head
        self.head_dim = d_model//num_head
        self.qkv_layer = nn.Linear(input_dim, 3*d_model)
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, x):
        bs, max_seq, input_dim = x.shape
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(bs, max_seq, self.num_head, 3*self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        attn, value = scaled_dot_product(q, k, v)
        # concatenate
        value = value.reshape(bs, max_seq, self.num_head*self.d_model)
        out = self.fc(value)
        return out

# positional encoding

In [14]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq):
        super().__init__()
        self.max_seq = max_seq
        self.d_model = d_model

    def forward(self):
        even_i = torch.arange(0, self.d_model, 2).float()
        odd_i = torch.arange(1, self.d_model, 2).float()
        even_denominator = torch.pow(10000, even_i/self.d_model)
        odd_denominator = torch.pow(10000, (odd_i-1)/self.d_model)
        position = torch.arange(self.max_seq).reshape(self.max_seq, 1)
        even_PE = torch.sin(position/even_denominator)
        odd_PE = torch.cos(position/odd_denominator)
        PE = torch.concat((even_PE, odd_PE), dim=1)
        return PE
    
pe = PositionalEncoding(6,10)
pe.forward()

tensor([[ 0.0000,  0.0000,  0.0000,  1.0000,  1.0000,  1.0000],
        [ 0.8415,  0.0464,  0.0022,  0.5403,  0.9989,  1.0000],
        [ 0.9093,  0.0927,  0.0043, -0.4161,  0.9957,  1.0000],
        [ 0.1411,  0.1388,  0.0065, -0.9900,  0.9903,  1.0000],
        [-0.7568,  0.1846,  0.0086, -0.6536,  0.9828,  1.0000],
        [-0.9589,  0.2300,  0.0108,  0.2837,  0.9732,  0.9999],
        [-0.2794,  0.2749,  0.0129,  0.9602,  0.9615,  0.9999],
        [ 0.6570,  0.3192,  0.0151,  0.7539,  0.9477,  0.9999],
        [ 0.9894,  0.3629,  0.0172, -0.1455,  0.9318,  0.9999],
        [ 0.4121,  0.4057,  0.0194, -0.9111,  0.9140,  0.9998]])

In [16]:


def scaled_dot_product(q,k,v):
    """
    q.shape = b, maxsequence, head_dim
    """
    q_d = q.shape[-1]
    scaled_qk = torch.matmul(q, k.transpose(-1,-2))/math.sqrt(q_d)
    atten = F.softmax(scaled_qk,dim=-1)
    value = torch.matmul(atten, v)
    return atten, value

class MultiHeadAttention(nn.modules):
    def __init__(self,
                 input_dim,
                 d_model,
                 num_head,
                 max_squence):
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_head = num_head
        self.head_dim = d_model/num_head
        self.qkv_layer = nn.Linear(input_dim, 3*d_model)
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, x):
        bs, max_squence, input_dim = x.shape
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(bs, max_squence, self.num_head, 3*self.head_dim)
        qkv = qkv.permute(0,2,1,3)
        q, k, v = qkv.chunk(3, dim=-1)
        atten, value = scaled_dot_product(q, k, v)
        value = value.reshape(bs, max_squence, self.num_head*self.head_dim)
        out = self.fc(value)

        return out


def PositionalEncode(x):
    bs, max_seq, d_model = x.shape
    postion = torch.arange(0, max_seq, 1)
    even_i = torch.arange(0, d_model, 2)
    odd_i = torch.arange(1, d_model, 2)
    odd_denominator = torch.pow(10000, odd_i)
    even_denominator = torch.pow(10000, even_i)
    odd_PE = torch.sin(postion/odd_denominator)
    even_PE = torch.cos(postion/even_denominator)



TypeError: module() takes at most 2 arguments (3 given)

In [26]:
x = torch.rand(4, 5, 16)
bs, max_seq, d_model = x.shape
postion = torch.arange(0, max_seq, 1)
even_i = torch.arange(0, d_model, 2)
odd_i = torch.arange(1, d_model, 2)
odd_denominator = torch.pow(10000, odd_i/d_model)
even_denominator = torch.pow(10000, even_i/d_model)
postion = postion.reshape(max_seq, 1)
odd_PE = torch.sin(postion/odd_denominator)
even_PE = torch.cos(postion/even_denominator)
PE = torch.concat([odd_PE, even_PE], dim=-1)

In [28]:
x = torch.zeros((max_seq,d_model))

In [29]:
x.shape

torch.Size([5, 16])

In [27]:
PE.shape

torch.Size([5, 16])

In [18]:
a = torch.rand((3,4,5,6))

In [19]:
b = a.permute(0,2,1,3).reshape(3,5,24)

In [20]:
c = a.reshape(3,5,24)

In [22]:
b.shape

torch.Size([3, 5, 24])

In [None]:
def scaled_dot_product(q,k,v):
    q_d = q.shape[-1]
    qk_scaled = torch.matmul(q, k.transpose(-1,-2))/math.sqrt(q_d)
    atten = F.softmax(qk_scaled)
    value = torch.matmul(atten, v)
    return atten, value

class MultiHeadAttention(nn.modules):
    def __init__(self,
                 input_dim,
                 d_model,
                 max_sequence,
                 num_head
                 ):
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_head = num_head
        self.head_dim = d_model/num_head

        self.max_sequence = max_sequence
        self.qkv_layer = nn.Linear(input_dim, 3*self.d_model)
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, x):
        bs, max_seq, input_dim = x.shape
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(bs, max_seq, self.num_head, 3*self.head_dim)
        qkv = qkv.permute(0, 2,1,3)
        q,k,v = qkv.chunk(3, dim=-1)
        atten, value = scaled_dot_product(q, k, v)
        value = qkv.permute(0,2, 1,3).reshape(bs,)