In [2]:
import torch.nn as nn
import torch
from torch.autograd import Variable
import math
import numpy as np
import torch.nn.functional as F

#import matplotlib as plt

In [11]:
class Embeddings(nn.Module):
    def __init__(self, dim, vocab_size):
        super(Embeddings,self).__init__() # == super().__init__() 
        self.lut = nn.Embedding(vocab_size,dim)
        self.d_model = dim
    
    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [12]:
attention_size = (1,6,6)
subseq_mask = np.triu(np.ones(attention_size),k=1)
subseq_mask

array([[[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]]])

In [13]:
def subsequent_mask(size):
    attention_size = (1,size,size)
    subseq_mask = np.triu(np.ones(attention_size),k=1)
    return torch.from_numpy(1-subseq_mask)

In [14]:
trid = subsequent_mask(6)
trid,trid.shape

(tensor([[[1., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 0., 0., 0.],
          [1., 1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1., 1.]]], dtype=torch.float64),
 torch.Size([1, 6, 6]))

In [15]:
trid[0].shape

torch.Size([6, 6])

In [16]:
embedding = nn.Embedding(10,3)
type(embedding)


torch.nn.modules.sparse.Embedding

In [17]:
embedding.weight

Parameter containing:
tensor([[ 0.6451, -0.1286, -0.6585],
        [-1.1187, -0.1429,  0.0454],
        [ 0.7862, -0.8243, -2.0834],
        [-0.3729,  0.5265, -0.6919],
        [-0.1460,  0.6052, -1.1296],
        [-0.0972,  0.1188,  0.3157],
        [ 0.0950, -0.7536, -0.9331],
        [-0.6951, -0.9528, -0.5592],
        [-1.2246, -0.5633,  1.7734],
        [ 0.7964,  1.1891,  0.5069]], requires_grad=True)

In [9]:
inputs = torch.LongTensor([[1,2,3,4],[0,6,7,8]])
inputs.shape

torch.Size([2, 4])

In [18]:
emb = embedding(inputs)

In [19]:
emb

tensor([[[-1.1187, -0.1429,  0.0454],
         [ 0.7862, -0.8243, -2.0834],
         [-0.3729,  0.5265, -0.6919],
         [-0.1460,  0.6052, -1.1296]],

        [[ 0.6451, -0.1286, -0.6585],
         [ 0.0950, -0.7536, -0.9331],
         [-0.6951, -0.9528, -0.5592],
         [-1.2246, -0.5633,  1.7734]]], grad_fn=<EmbeddingBackward0>)

In [20]:
emb.shape

torch.Size([2, 4, 3])

In [21]:
x = Variable(torch.LongTensor([[1,2,3,4],[0,6,7,8]]))

In [22]:
x

tensor([[1, 2, 3, 4],
        [0, 6, 7, 8]])

In [23]:
type(x),type(inputs)

(torch.Tensor, torch.Tensor)

In [77]:
x = Variable(torch.LongTensor([[1,2,3,4],[0,6,7,8]]))
d_model =512
vocab_size = 10000
embs = Embeddings(d_model,vocab_size= vocab_size)
inputs_emb = embs(x)
query = key = value = inputs_emb

In [78]:
inputs_emb.dtype

torch.float32

In [79]:
inputs_emb.shape[-1],inputs_emb.size(),inputs_emb.size()[-1]

(512, torch.Size([2, 4, 512]), 512)

In [80]:
def attention(query, key, value, mask = None, dropout = None):
    d_k = query.size()[-1]
    scores = torch.matmul(query, key.transpose(-2,-1))/math.sqrt(d_k) #key last dim exchange with last 2nd dim
    print(scores.size())
    if mask is not None:
        scores = scores.masked_fill(mask==0, -1e9)
    attn = F.softmax(scores, dim = -1)
    print(attn.size())
    if dropout is not None:
        attn = dropout(attn)
    return torch.matmul(attn, value), attn

In [81]:
query = key = value = inputs_emb

In [82]:
value.size()

torch.Size([2, 4, 512])

In [83]:
values, attn = attention(query,key,value)

torch.Size([2, 4, 4])
torch.Size([2, 4, 4])


In [84]:
values.shape

torch.Size([2, 4, 512])

In [85]:
attn.shape

torch.Size([2, 4, 4])

In [86]:
import copy
def clones(module, n = 1):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])


In [88]:
class MultiHeadAttention(nn.Module):
    def __init__(self, head, embedding_dim, dropout = 0.1):
        super().__init__()
        assert embedding_dim % head == 0
        self.d_k =  embedding_dim// head
        self.head = head
        self.linears = clones(nn.Linear(embedding_dim, embedding_dim), 4)
        self.attn = None
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, query, key, value, mask = None):
        # if mask
        if mask is not None:
            mask = mask.unsqueeze(0) # expand the dim of mask? represent nth head in multihead
        batch_size = query.size()[0]
        query, key, value = [model(x).view(batch_size, -1, self.head,self.d_k).transpose(1,2) for model, x in zip(self.linears, (query, key, value))] # transpose -1 means length of sentence, finally, last two dims are length of sentence and word emb dim
        x, self.attn = attention(query, key, value, mask = mask, dropout = self.dropout)
        x = x.transpose(1,2).contiguous().view(batch_size, -1, self.d_k*self.head)
        return self.linears[-1](x)            
        

In [89]:
head = 8
embedding_dim = 512
dropout = 0.2
#input
query = key = value = inputs_emb
mask = Variable(torch.zeros(8,4,4)) # number of head, matrix dim
mask1 = mask.unsqueeze(0)

In [36]:
linears = clones(nn.Linear(embedding_dim, embedding_dim), 4)
batch_size = query.size()[0]

In [37]:
batch_size, query.shape

(2, torch.Size([2, 4, 512]))

In [38]:
aa = linears[0](query)

In [39]:
bb = aa.view(batch_size, -1, head,embedding_dim//head)

In [40]:
bb.shape

torch.Size([2, 4, 8, 64])

In [41]:
query1, key1, value1 = [model(x).view(batch_size, -1, head,embedding_dim//head).transpose(1,2) for model, x in zip(linears, (query, key, value))] 

In [58]:
query1.shape

torch.Size([2, 8, 4, 64])

In [29]:
query.shape,query1.shape

(torch.Size([2, 4, 512]), torch.Size([2, 8, 4, 64]))

In [30]:
mask1.shape

torch.Size([1, 8, 4, 4])

In [90]:
def attention(query, key, value, mask = None, dropout = None):
    d_k = query.size()[-1]
    #print(query.size())
    scores = torch.matmul(query, key.transpose(-2,-1))/math.sqrt(d_k) #key last dim exchange with last 2nd dim
    #print(scores.size())
    if mask is not None:
        #print(mask.size())
        scores = scores.masked_fill(mask==0, -1e9)
        #print(scores)
    attn = F.softmax(scores, dim = -1)
    #print(attn.size())
    if dropout is not None:
        attn = dropout(attn)
    return torch.matmul(attn, value), attn

In [32]:
query1.shape == key1.shape

True

In [91]:
x, attn = attention(query1, key1, value1, mask = mask1)

In [92]:
len(linears)

4

In [93]:
query.shape

torch.Size([2, 4, 512])

In [94]:

mask.shape

torch.Size([8, 4, 4])

In [95]:
mha = MultiHeadAttention(head, embedding_dim, dropout)

In [96]:
mha_result = mha(query,key,value,mask)

In [97]:
mha_result

tensor([[[-8.8329,  6.1361,  2.9585,  ...,  1.2148,  4.6935,  3.1532],
         [-6.2393,  5.5863,  8.1964,  ...,  2.0062,  0.5491, -0.3885],
         [-7.3728,  5.7970,  3.0365,  ...,  1.6352,  7.0970,  2.4296],
         [-5.3376,  5.8210,  5.1375,  ...,  0.0529,  5.9840,  5.0678]],

        [[-4.0066,  0.9095,  2.0274,  ...,  2.2734, -3.3347,  0.4291],
         [-5.4267, -0.2141, -1.4938,  ...,  1.0425, -1.9053,  0.6388],
         [ 0.2700,  3.5488,  2.8186,  ...,  0.5663, -7.8385,  3.2293],
         [-3.1080,  1.5369,  1.9338,  ...,  3.1487, -1.8609,  1.8219]]],
       grad_fn=<ViewBackward0>)

In [98]:
mha_result.shape

torch.Size([2, 4, 512])

In [99]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout = 0.1):
        super().__init__()
        self.w1 = nn.Linear(d_model, d_ff)
        self.w2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, attn_output):
        return self.w2(self.dropout(F.relu(self.w1(attn_output))))



In [100]:
d_model = 512
d_ff = 128
x = mha_result

In [101]:
ff = PositionWiseFeedForward(d_model,d_ff)
ffn_result = ff(x)

In [102]:
ffn_result.shape

torch.Size([2, 4, 512])

In [103]:
ones = nn.Parameter(torch.ones(1))
ones.shape

torch.Size([1])

In [117]:
class LayerNorm(nn.Module):
    def __init__(self, d_k, eps = 1e-6):
        super().__init__()
        self.eps = eps
        self.a1 = nn.Parameter(torch.ones(d_k))
        self.b1 = nn.Parameter(torch.zeros(d_k))
    def forward(self, x):
        print(x)
        print(x.shape)
        x_mean = x.mean(-1,keepdim=True) # word embedding mean
        x_std = x.std(-1,keepdim=True)
        return self.a1*(x-x_mean)/(x_std+self.eps) +self.b1

In [105]:
features = d_model = 512
eps = 1e-6

In [129]:
ln = LayerNorm(d_model,eps)
normalized_r = ln(ffn_result)

tensor([[[-0.0625,  2.7964, -0.2708,  ...,  2.0266, -0.4185, -1.2691],
         [ 0.1676,  1.1957, -1.0197,  ...,  2.2087, -2.2657, -0.3799],
         [-0.0221,  2.9585, -0.3438,  ...,  2.5006, -1.4301, -1.5331],
         [ 1.7183,  1.5818, -0.3684,  ...,  2.1696, -1.7292, -0.9320]],

        [[ 0.4740,  2.0729,  0.1291,  ...,  0.5363, -1.0441,  0.7311],
         [ 0.2172,  2.1022,  0.7545,  ...,  0.9775, -0.4765,  1.1576],
         [-0.2690,  2.3228,  0.6269,  ...,  0.3169, -1.1312,  1.7287],
         [ 0.4810,  0.2195,  1.0199,  ...,  0.0603, -0.7871,  2.0044]]],
       grad_fn=<ViewBackward0>)
torch.Size([2, 4, 512])


In [107]:
normalized_r.shape

torch.Size([2, 4, 512])

In [144]:
class SubLayerConnection(nn.Module):
    def __init__(self, d_model, dropout = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p = dropout)
        self.d_model = d_model
        self.norm = LayerNorm(d_model)
    def forward(self,x,sublayer):
        return x + self.dropout(sublayer(self.norm(x)))
    
    

In [109]:
x = Variable(torch.LongTensor([[1,2,3,4],[0,6,7,8]]))
d_model =512
vocab_size = 10000
embs = Embeddings(d_model,vocab_size= vocab_size)
inputs_emb = embs(x)
query = key = value = inputs_emb

In [110]:
size = d_model = 512
head = 8
dropout = 0.2
mask = Variable(torch.zeros(8,4,4))
self_attn = MultiHeadAttention(head, d_model)

In [111]:
sublayer = lambda x:self_attn(x,x,x, mask)
sc = SubLayerConnection(d_model, dropout)
sc_result = sc(sublayer, inputs_emb)
sc_result.shape

torch.Size([2, 4, 512])

In [145]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, selfattn, feedforward, dropout):
        super().__init__()
        self.d_model = d_model
        self.attn = selfattn
        self.ff = feedforward
        self.sublayer = clones(SubLayerConnection(d_model, dropout),2)
        
    def forward(self, x, mask):
        # first sublayer 1 that contains multihead attn
        # second sublayer 2 that contains ffn
        print(x)
        print(x.shape)
        x = self.sublayer[0](x, lambda x: self.attn(x,x,x,mask))
        return self.sublayer[1](x, self.ff)
    
    
        
    

In [159]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len = 5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0,max_len).unsqueeze(1) # shape maxlen x 1
        div_term = torch.exp(torch.arange(0,d_model,2) * -(math.log(10000.0)/d_model))
        pe[:,0::2] = torch.sin(position * div_term)
        pe[:,1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    def forward(self,x):
        x += Variable(self.pe[:,:x.size(1)], requires_grad= False)
        return self.dropout(x)
# x = torch.tensor([1,2,3,4])
# print(x.shape)
# y = torch.unsqueeze(x,0)
# print(y.shape)
# z =  torch.unsqueeze(x,1)
# print(z.shape)

In [160]:
x = Variable(torch.LongTensor([[1,2,3,4],[0,6,7,8]]))
size = d_model = 512
dropout = 0.1
max_len = 60
vocab_size = 10000
embr = Embeddings(d_model,vocab_size= vocab_size)
x = embr(x)
x.shape

#query = key = value = inputs_emb

torch.Size([2, 4, 512])

In [161]:
pe = PositionalEncoding(d_model, dropout=dropout, max_len=max_len)
pe_result = pe(x)
pe_result.shape

torch.Size([2, 4, 512])

In [162]:
pe_result

tensor([[[ 25.1559, -12.0865,  -5.5447,  ..., -30.4957,  66.4291, -38.6705],
         [ 29.9158,  -0.0000, -46.5394,  ..., -17.3306,  31.7749,   7.0996],
         [ 24.5923, -14.4718, -24.1787,  ..., -46.4053,  51.8896, -46.5875],
         [ 39.6145, -21.5813, -26.3686,  ...,   1.3935,  -7.3859,  28.4846]],

        [[-29.2657, -28.9459,  48.3231,  ...,   4.6213,  18.8426, -38.9961],
         [ -0.0000,  -0.0000,  -6.2620,  ..., -49.2851,   5.8389, -19.7380],
         [-24.6691,  25.3550, -13.5024,  ...,  20.6753,  50.4802,  -8.0211],
         [ -8.8985,  -4.7571,  23.8176,  ...,  -2.5692, -16.4691,  17.5743]]],
       grad_fn=<MulBackward0>)

In [142]:
size = d_model = 512
num_head = 8
d_ff = 128
query = key = value = inputs_emb = x
dropout = 0.2
self_attn = MultiHeadAttention(num_head,d_model)
ff = PositionWiseFeedForward(d_model, d_ff, dropout)
mask = Variable(torch.zeros(8,4,4))

In [164]:
pe_result.shape

torch.Size([2, 4, 512])

In [165]:
enc_layer = EncoderLayer(d_model, self_attn, ff, dropout)
enc_res = enc_layer(pe_result, mask)
print(enc_res)
print(enc_res.shape)

tensor([[[ 25.1559, -12.0865,  -5.5447,  ..., -30.4957,  66.4291, -38.6705],
         [ 29.9158,  -0.0000, -46.5394,  ..., -17.3306,  31.7749,   7.0996],
         [ 24.5923, -14.4718, -24.1787,  ..., -46.4053,  51.8896, -46.5875],
         [ 39.6145, -21.5813, -26.3686,  ...,   1.3935,  -7.3859,  28.4846]],

        [[-29.2657, -28.9459,  48.3231,  ...,   4.6213,  18.8426, -38.9961],
         [ -0.0000,  -0.0000,  -6.2620,  ..., -49.2851,   5.8389, -19.7380],
         [-24.6691,  25.3550, -13.5024,  ...,  20.6753,  50.4802,  -8.0211],
         [ -8.8985,  -4.7571,  23.8176,  ...,  -2.5692, -16.4691,  17.5743]]],
       grad_fn=<MulBackward0>)
torch.Size([2, 4, 512])
tensor([[[ 25.1559, -12.0865,  -5.5447,  ..., -30.4957,  66.4291, -38.6705],
         [ 29.9158,  -0.0000, -46.5394,  ..., -17.3306,  31.7749,   7.0996],
         [ 24.5923, -14.4718, -24.1787,  ..., -46.4053,  51.8896, -46.5875],
         [ 39.6145, -21.5813, -26.3686,  ...,   1.3935,  -7.3859,  28.4846]],

        [[-29.2