## Paper Implementation - Attention Is All You Need

<img src='https://pytorch.org/tutorials/_images/transformer_architecture.jpg'>

In [1]:
import torch
import torch.nn as nn
import math

## Embedding layer

In [3]:
class Embedding(nn.Module):
    """Token embedding layer scaled by sqrt(dmodel).

    Args:
        vocab_size: number of rows in the embedding table.
        dmodel: embedding model dimension (size of each embedding vector).
    """

    def __init__(self, vocab_size, dmodel=512) -> None:
        super(Embedding, self).__init__()
        self.vocab_size = vocab_size
        self.dmodel = dmodel
        # The table width must be the model dimension; the original referenced
        # an undefined `self.embedding_dim` attribute here.
        self.embed_layer = nn.Embedding(self.vocab_size, self.dmodel)

    def forward(self, x):
        """Look up the embeddings for the indices in `x` and scale them.

        Args:
            x: integer index tensor accepted by `nn.Embedding`.

        Returns:
            Tensor of shape `x.shape + (dmodel,)`.
        """
        embed_out = self.embed_layer(x)
        # In the embedding layers, the weights are multiplied by sqrt(dmodel)
        # ("Attention Is All You Need", page 5).
        return embed_out * math.sqrt(self.dmodel)

## Positional Encoding

In this step we generate the positional encoding and then add it to the embedding output.

<p align="center"><img height=300 src='http://jalammar.github.io/images/t/transformer_positional_encoding_vectors.png'></p>

In the `Attention Is All You Need` paper, the authors use two positional encoding functions: a sine function for the even embedding dimensions and a cosine function for the odd embedding dimensions.

<p align="center"><img src='image/pe.png'></p>

Here `pos` is position of the token in sentence.<br>
`i` is the position of the dimension<br>
$d_{model}$ is the embedding dimension<br>

Here the Embedding layer output dimension and the Positional Encoding layer output dimension are the same, so the two can be added element-wise.

Reference 
- https://kazemnejad.com/blog/transformer_architecture_positional_encoding/


In [20]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to an embedding tensor.

    A table `pe` of shape `(max_seq_len, d_model)` is precomputed once and
    stored as a buffer (not a trainable parameter). Even columns hold
    `sin(pos / 10000^(2i / d_model))`, odd columns hold the matching cosine.

    Args:
        max_seq_len: number of positions to precompute.
        d_model: embedding model dimension.
    """

    def __init__(self, max_seq_len, d_model=512) -> None:
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        # Column vector of positions: (max_seq_len, 1).
        pos = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        # a^-x equals 1 / a^x, so this is 1 / 10000^(2i / d_model).
        frequency = torch.pow(
            10000, -torch.arange(0, d_model, 2, dtype=torch.float) / self.d_model
        )
        # Broadcasts to (max_seq_len, ceil(d_model / 2)).
        angles = pos * frequency
        pe = torch.zeros((max_seq_len, d_model))
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus angle column to keep the shapes aligned.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, embed_vect):
        """Add the positional encoding to `embed_vect`.

        Args:
            embed_vect: tensor of shape `(..., seq_len, d_model)` with
                `seq_len <= max_seq_len`.

        Returns:
            Tensor of the same shape as `embed_vect`.

        Raises:
            ValueError: if `seq_len` exceeds the precomputed table length.
        """
        seq_len = embed_vect.size(-2)
        max_seq_len = self.pe.size(0)
        if seq_len > max_seq_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {max_seq_len}"
            )
        # Use only the first seq_len rows so inputs shorter than max_seq_len
        # work instead of failing to broadcast against the full table.
        return embed_vect + self.pe[:seq_len]

In [21]:

# Build a positional-encoding module for sequences of up to `max_len`
# positions with `d_model`-wide embeddings.
max_len, d_model = 100, 512
pe_en = PositionalEncoding(max_seq_len=max_len, d_model=d_model)


## Attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Placeholder for the multi-head attention layer.

    Only the constructor exists so far; `d_model` and `n_head` are accepted
    but not yet stored or used, and no `forward` is defined.
    TODO: implement the attention projections and the forward pass.
    """

    def __init__(self, d_model=512, n_head=8) -> None:
        super(MultiHeadAttention, self).__init__()
        

In [7]:
# Scratch cell: the same sinusoidal table, built with the exp/log form of
# 1 / 10000^(2i / d_model) and an extra singleton axis in the middle.
max_len = 100
d_model = 512
# Positions as a column vector: (max_len, 1).
position = torch.arange(max_len)[:, None]
# exp(2i * -ln(10000) / d_model) == 10000^(-2i / d_model)
div_term = (torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)).exp()
pe = torch.zeros((max_len, 1, d_model))
# Even columns take the sine, odd columns the cosine of the same angles.
pe[:, 0, 0::2] = (position * div_term).sin()
pe[:, 0, 1::2] = (position * div_term).cos()

torch.Size([100, 256])

https://ai.stackexchange.com/questions/41670/why-use-exponential-and-log-in-positional-encoding-of-transformer