<a href="https://colab.research.google.com/github/yvrjsharma/Transformers/blob/main/Transformers_Attention_is_all_you_need.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy torchtext



In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy 
from torch.autograd import Variable

#For Plots
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
class EncoderDecoder(nn.Module):
  """Generic encoder-decoder wrapper.

  Bundles an encoder, a decoder, the source/target embedding callables and a
  generator. The generator is only stored on the instance; none of the
  methods defined here invoke it.
  """

  def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
    super().__init__()
    # Assignment order is kept so submodules register in the same sequence.
    self.encoder, self.decoder = encoder, decoder
    self.src_embed, self.tgt_embed = src_embed, tgt_embed
    self.generator = generator

  def encode(self, src, src_mask):
    """Embed `src` and run the result through the encoder with `src_mask`."""
    embedded_src = self.src_embed(src)
    return self.encoder(embedded_src, src_mask)

  def decode(self, memory, src_mask, tgt, tgt_mask):
    """Embed `tgt` and run the decoder over it together with `memory`."""
    embedded_tgt = self.tgt_embed(tgt)
    return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

  def forward(self, src, tgt, src_mask, tgt_mask):
    """Take in and process the masked source and target sequences.

    Encodes the source first, then feeds that encoder output (the "memory")
    into the decoder along with the target; returns the decoder's output.
    """
    memory = self.encode(src, src_mask)
    return self.decode(memory, src_mask, tgt, tgt_mask)


In [4]:
class Generator(nn.Module):
  """Standard linear projection followed by a log-softmax.

  `proj` maps the last dimension from `d_model` to `vocab`; `forward` then
  normalizes that last dimension into log-probabilities.
  """

  def __init__(self, d_model, vocab):
    super().__init__()
    self.proj = nn.Linear(d_model, vocab)

  def forward(self, x):
    """Return log-probabilities over the last dimension of `proj(x)`."""
    logits = self.proj(x)
    log_probs = F.log_softmax(logits, dim=-1)
    return log_probs

## Encoder - Decoder Stack 
### Encoder consists of 6 identical layers

In [6]:
def clones(module, N=6):
  """Return an nn.ModuleList holding N independent deep copies of `module`.

  Each copy is produced with copy.deepcopy, so the copies start out with
  equal parameter values but do not share parameter objects with each other
  or with the original `module`.
  """
  replicas = nn.ModuleList()
  for _ in range(N):
    replicas.append(copy.deepcopy(module))
  return replicas

In [7]:
class Encoder(nn.Module):
  """Encoder block: a stack of N cloned layers plus a final normalization.

  `layer` is replicated N times via `clones`, and a `LayerNorm` sized by
  `layer.size` is applied to the output of the last layer.
  """

  def __init__(self, layer, N):
    super().__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, mask):
    """Pass the input (and mask) through each layer in turn, then normalize."""
    hidden = x
    for sublayer in self.layers:
      # Every layer receives the same mask alongside the running output.
      hidden = sublayer(hidden, mask)
    return self.norm(hidden)

Applying residual connections and layer normalization

In [8]:
class LayerNorm(nn.Module):
  """Layer normalization over the last dimension with learnable gain and bias.

  `a_2` (initialized to ones) scales and `b_2` (initialized to zeros) shifts
  the normalized values; both have `features` elements. Note that `eps` is
  added to the standard deviation itself (not to the variance), and the
  standard deviation comes from `Tensor.std` with its default settings.
  """

  def __init__(self, features, eps=1e-6):
    super().__init__()
    self.a_2 = nn.Parameter(torch.ones(features))
    self.b_2 = nn.Parameter(torch.zeros(features))
    self.eps = eps

  def forward(self, x):
    """Normalize `x` along its last dimension, then apply gain and bias."""
    centered = x - x.mean(-1, keepdim=True)
    # eps keeps the division finite when the last dimension has zero spread.
    spread = x.std(-1, keepdim=True) + self.eps
    return self.a_2 * centered / spread + self.b_2