## Paper Implementation - Attention Is All You Need

In [1]:
%%capture
!pip install portalocker
!pip install datasets

In [2]:
%%capture
!pip install -U spacy
!python -m spacy download en

In [3]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [4]:
import torch
import torch.nn as nn
import math
import copy
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import dataset,DataLoader
from torch import Tensor
from collections import Counter
from torchtext.vocab import Vocab


In [5]:
from datasets import load_dataset
ds = load_dataset("open_subtitles", 'en-hi',split="train")
ds = ds.remove_columns('meta')
ds = ds.train_test_split(test_size=0.1)
train_data = ds['train']
val_data = ds['test']

Downloading builder script:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading and preparing dataset open_subtitles/en-hi (download: 2.83 MiB, generated: 13.20 MiB, post-processed: Unknown size, total: 16.03 MiB) to /root/.cache/huggingface/datasets/open_subtitles/en-hi/2018.0.0/c1ec973ca4b6e588740d8f167cc0e24ea3f626e70bc7ffe467e944730500e198...


Downloading data:   0%|          | 0.00/2.97M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/93016 [00:00<?, ? examples/s]

Dataset open_subtitles downloaded and prepared to /root/.cache/huggingface/datasets/open_subtitles/en-hi/2018.0.0/c1ec973ca4b6e588740d8f167cc0e24ea3f626e70bc7ffe467e944730500e198. Subsequent calls will reuse this data.


In [70]:
ds['train'][:6]

{'id': ['33248', '3701', '73304', '66456', '53347', '88591'],
 'translation': [{'en': '- Yeah.', 'hi': '- हाँ.'},
  {'en': "I'm getting older...", 'hi': 'मैं बढ़ा हो रहा हूं...'},
  {'en': '- Mmm.', 'hi': '- मम्म।'},
  {'en': '- Tom!', 'hi': '- Tom!'},
  {'en': "- Melinda, I'm sorry.", 'hi': '- मेलिंडा, मैं माफी चाहता हूँ.'},
  {'en': 'Of course you will.', 'hi': 'निश्चय ही आप करेंगे।'}]}

In [7]:
en_tokenizer = get_tokenizer('spacy', language='en')



In [8]:
from nltk.tokenize import ToktokTokenizer
sentence = "नमस्ते दुनिया, मैं एक भाषा मॉडल हूँ।"

toktok_tokennizer = ToktokTokenizer()
tokens = toktok_tokennizer.tokenize(sentence)


In [9]:
# en_tokenizer('Spacy model')

In [10]:
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

def yield_tokens_en(data_iter):
    for text in data_iter:
        yield en_tokenizer(text['en'].lower())

# Build the vocabulary from the training dataset
vocab_en = build_vocab_from_iterator(yield_tokens_en(iter(train_data['translation'])),min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)
def yield_tokens_de(data_iter):
    for text in data_iter:
        yield toktok_tokennizer.tokenize(text['hi'])

# Build the vocabulary from the training dataset
vocab_de = build_vocab_from_iterator(yield_tokens_de(iter(train_data['translation'])),min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3

vocab_en.set_default_index(UNK_IDX)
vocab_de.set_default_index(UNK_IDX)


In [11]:
len(vocab_de.vocab),len(vocab_en.vocab)

(25413, 19446)

In [12]:
vocab_de.lookup_tokens([20,30]),vocab_en.lookup_tokens([20,30])

(['लिए', 'को'], ['of', 'your'])

In [13]:
len(vocab_de.vocab),len(vocab_en.vocab)

(25413, 19446)

In [14]:
vocab_de['बहुत'.lower()],vocab_en['Well']

(66, 0)

In [15]:
def preprocess_data(data):
    tensor_data = []
    for translation_data in data['translation']:
        en = translation_data['en']
        de = translation_data['hi']
        en_tensor = torch.tensor([vocab_en[v] for v in en_tokenizer(en.lower())],dtype=torch.long)
        de_tensor = torch.tensor([vocab_de[v] for v in toktok_tokennizer.tokenize(de)],dtype=torch.long)
        tensor_data.append([en_tensor,de_tensor])
    return tensor_data

train_data = preprocess_data(train_data)
val_data = preprocess_data(val_data)

In [16]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
def generate_batch(data_batch):
    de_batch, en_batch = [], []
    for en_tensor,de_tensor in data_batch:
        en_tensor = torch.cat([torch.tensor([BOS_IDX]), en_tensor, torch.tensor([EOS_IDX])], dim=0)
        en_batch.append(en_tensor)
        
        de_tensor = torch.cat([torch.tensor([BOS_IDX]), de_tensor, torch.tensor([EOS_IDX])], dim=0)
        de_batch.append(de_tensor)

    de_batch = pad_sequence(de_batch, padding_value=PAD_IDX)
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)

    return de_batch, en_batch

BATCH_SIZE = 128
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,shuffle=True, collate_fn=generate_batch)
val_iter = DataLoader(val_data, batch_size=BATCH_SIZE,shuffle=True, collate_fn=generate_batch)

In [17]:
len(train_iter)

655

In [18]:
for de_batch, en_batch in train_iter:
    break
de_batch.T.shape, en_batch.shape

(torch.Size([128, 21]), torch.Size([20, 128]))

In [19]:
for de_batch, en_batch in val_iter:
    break
de_batch.T.shape, en_batch.shape

(torch.Size([128, 29]), torch.Size([26, 128]))

## Embedding layer

In this step first we convert each word to input sequence and generate embedding vector.

In [20]:
class Embedding(nn.Module):
    def __init__(self,vocab_size,dmodel=512) -> None:
        # dmodel -> embedding model dimention
        super(Embedding,self).__init__()
        self.vocab_size = vocab_size
        self.dmodel = dmodel
        self.embed_layer = nn.Embedding(self.vocab_size,self.dmodel)
    def forward(self,x):
        embed_out = self.embed_layer(x)
        # In the embedding layers, we multiply those weights by sqrt(dmodel)  -> pange 5
        return embed_out * math.sqrt(self.dmodel)

## Positional Encoding

In [21]:
class PositionalEncoding(nn.Module):
    def __init__(self,max_seq_len,d_model=512) -> None:
         # dmodel -> embedding model dimention
        super(PositionalEncoding,self).__init__()
        self.d_model = d_model
        pos = torch.arange(0, max_seq_len,dtype = torch.float).unsqueeze(1)
        # we know a^-x  is equals to 1/a^x
        frequency = torch.pow(10000,-torch.arange(0,d_model,2,dtype = torch.float)/self.d_model)
        pe = torch.zeros((max_seq_len,d_model))
        pe[:,0::2] = torch.sin(pos * frequency)
        pe[:,1::2] = torch.cos(pos * frequency)
        self.register_buffer('pe', pe)
    def forward(self,embed_vect):
        pe = self.pe[:embed_vect.size()[1]]
        return embed_vect + pe

## Attention Layer

In [22]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model = 512, n_head = 8, dropout_rate = 0.2) -> None:
        super().__init__()
        self.d_model = d_model
        self.n_head = n_head
        self.dropout = nn.Dropout(p=dropout_rate)
        self.head_dim = int(d_model/n_head)
        self.softmax_layer = nn.Softmax(dim=-1)
        self.w_key = nn.Linear(d_model,d_model)
        self.w_query = nn.Linear(d_model,d_model)
        self.w_value = nn.Linear(d_model,d_model)
        self.output_project = nn.Linear(d_model,d_model)

    def attention(self,key,query,value,mask=None):
        # calculate attenction score
        # query = (BS,NH,S/T,HD) , key.transpose(-2,-1) = (BS,NH,HD,S/T)
        # attention score size for encoder attention = (BS,NH,S,S) , decoder attention = (BS,NH,T,T), encoder-decoder attention = (BS,NH,T,S)
        attenction_score = torch.matmul(query,key.transpose(-2,-1))/math.sqrt(self.head_dim)

        # apply masking
        if mask is not None:
            attenction_score.masked_fill(mask==torch.tensor(False),float("-inf"))

        # pass through softmax layer
        attention_weight = self.softmax_layer(attenction_score)

        # multiply with value
        # Final shape of score = (BS,NH,S/T,HD)
        score = torch.matmul(attention_weight,value)
        return score

    def forward(self,key,query,value,mask=None):
        batch_size = key.size()[0]

        # dot product with weight matrices
        # size of key/query/value = (BS,S/T,ED) ,
        # where BS = batch size,
        # S = Source Sequence length,
        # T = target sequence lenth,
        # ED = Embedding dimension,
        # NH = Number Of Head
        # HD = head dimension
        key, query, value = self.w_key(key), self.w_query(query), self.w_value(value)

        # split vector by number of head and transpose
        # size of key/query/value = (BS,NH,S/T,HD) , where BS = batch size, NH = Number Of Head, ED = Head dimension
        key = key.view(batch_size,-1,self.n_head,self.head_dim).transpose(1, 2)
        query = query.view(batch_size,-1,self.n_head,self.head_dim).transpose(1, 2)
        value = value.view(batch_size,-1,self.n_head,self.head_dim).transpose(1, 2)

        # size of attention_score = (BS,NH,S/T,HD)
        attention_score = self.attention(key,query,value,mask) # size - torch.Size([2, 4, 8, 64]) -> [batch_size, max_seq_len,n_head, head_dim]
        attention_score = self.dropout(attention_score)
        # concatenated output
        attention_score = attention_score.transpose(1,2) # size = (BS,S/T,NH,HD)
        attention_score = attention_score.reshape(batch_size,-1,self.head_dim*self.n_head) # size = (BS,S/T,ED)

        # Pass through linear layer
        attention_out = self.output_project(attention_score)
        return attention_out

def get_clone(module,num_copy):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(num_copy)])

## Position-wise Feed-Forward Networks

In [23]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self,d_model=512, dropout_rate = 0.2) -> None:
        super().__init__()
        self.d_model = d_model
        hidden_width = 4
        self.dropout = nn.Dropout(p=dropout_rate)
        self.linear1 = nn.Linear(d_model,d_model*hidden_width)
        self.linear2 = nn.Linear(d_model*hidden_width, d_model)
        self.relu = nn.ReLU()
    def forward(self,x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))


## SubLayer

In [24]:
class SubLayer(nn.Module):
    def __init__(self,d_model = 512) -> None:
        super(SubLayer,self).__init__()
        self.norm = nn.LayerNorm(d_model)
    def forward(self,x,sub_layer_x):
        return self.norm(x + sub_layer_x)


## Encoder Layer

Encoder Layer has two sub layer. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network.

In [25]:
class EnocderLayer(nn.Module):
    def __init__(self,d_model,multi_head_arttention_layer,position_wise_feedforward_layer, dropout_rate = 0.2) -> None:
        super().__init__()
        self.d_model = d_model

        self.multi_head_arttention_layer = multi_head_arttention_layer
        self.dropout1 = nn.Dropout(p=dropout_rate)
        self.sublayer1 = SubLayer(d_model)

        self.position_wise_feedforward_layer = position_wise_feedforward_layer
        self.dropout2 = nn.Dropout(p=dropout_rate)
        self.sublayer2 = SubLayer(d_model)
    def forward(self,vec_representation,src_mask=None):
        # compute self attention
        attention_score =self.multi_head_arttention_layer(key = vec_representation,query = vec_representation,value = vec_representation,mask = src_mask)
        attention_score = self.dropout1(attention_score)
        # Layer Norm
        attention_out = self.sublayer1(vec_representation,attention_score)

        # pass Position Wise Feedforward Network
        position_wise_feedforward_out = self.position_wise_feedforward_layer(attention_out)
        position_wise_feedforward_out = self.dropout2(position_wise_feedforward_out)
        # Layer Norm
        encoder_out = self.sublayer2(attention_out,position_wise_feedforward_out)
        return encoder_out

## Encoder

In [26]:
class EncoderBlock(nn.Module):
    def __init__(self,encoder_layer, num_layer = 6) -> None:
        super().__init__()
        self.encoder_layer = encoder_layer
        self.encoder_layer_list = get_clone(self.encoder_layer,num_layer)

    def forward(self,src_embedding,src_mask=None):
        encoder_out = src_embedding
        for encoder_layer in self.encoder_layer_list:
            encoder_out = encoder_layer(encoder_out,src_mask)
        return encoder_out


## Decoder Layer

In [27]:
class DnocderLayer(nn.Module):
    def __init__(self,d_model,multi_head_arttention_layer,position_wise_feedforward_layer,dropout_rate = 0.2) -> None:
        super().__init__()
        self.d_model = d_model
        self.decoder_attention_layer = copy.deepcopy(multi_head_arttention_layer)
        self.dropout1 = nn.Dropout(p=dropout_rate)
        self.sublayer1 = SubLayer(d_model)

        self.encoder_decoder_attention_layer = copy.deepcopy(multi_head_arttention_layer)
        self.dropout2 = nn.Dropout(p=dropout_rate)
        self.sublayer2 = SubLayer(d_model)

        self.position_wise_feedforward_layer = position_wise_feedforward_layer
        self.dropout3 = nn.Dropout(p=dropout_rate)
        self.sublayer3 = SubLayer(d_model)

    def forward(self,enc,dec,src_mask=None,target_mask=None):
        decoder_attention_out = self.decoder_attention_layer(key = dec,query = dec,value = dec,mask = target_mask)
        decoder_attention_out = self.dropout1(decoder_attention_out)
        decoder_attention_out = self.sublayer1(dec,decoder_attention_out)

        enc_dec_attention_out = self.encoder_decoder_attention_layer(key = enc,query = decoder_attention_out,value = enc,mask = src_mask)
        enc_dec_attention_out = self.dropout2(enc_dec_attention_out)
        enc_dec_attention_out = self.sublayer2(decoder_attention_out,enc_dec_attention_out)

        ffn_out = self.position_wise_feedforward_layer(enc_dec_attention_out)
        ffn_out = self.dropout2(ffn_out)
        ffn_out = self.sublayer2(enc_dec_attention_out,ffn_out)

        return ffn_out

## Decoder Block

In [28]:
class DecoderBlock(nn.Module):
    def __init__(self,decoder_layer,num_layer = 6) -> None:
        super().__init__()
        self.decoder_layer = decoder_layer
        self.decoder_layer_list = get_clone(self.decoder_layer,num_layer)
        self.layer_norm = nn.LayerNorm(self.decoder_layer.d_model)

    def forward(self,encoder_out_vec,decoder_embedding,src_mask=None,target_mask=None):
        dec_out = decoder_embedding
        for decoder_layer in self.decoder_layer_list:
            dec_out = decoder_layer(enc = encoder_out_vec,dec = dec_out,src_mask = src_mask,target_mask = target_mask)
        return dec_out

In [29]:
class DecoderGenerator(nn.Module):
    def __init__(self,d_model,target_vocab_size) -> None:
        super().__init__()
        self.linear = nn.Linear(d_model,target_vocab_size)
        self.softmax_layer = nn.LogSoftmax(dim=-1)
    def forward(self,target_vec_rep):
        return self.softmax_layer(self.linear(target_vec_rep))

## Tranformers Block

In [30]:
class Transformers(nn.Module):
    def __init__(self,src_seq_len,trg_seq_len,d_model,num_head,dropout_rate = 0.2) -> None:
        super().__init__()
        self.src_seq_len = src_seq_len
        self.trg_seq_len = trg_seq_len
        self.d_model = d_model
        self.num_head = num_head

        self.src_embedding = Embedding(self.src_seq_len,self.d_model)
        self.src_pe = PositionalEncoding(self.src_seq_len,self.d_model)

        self.trg_embedding = Embedding(self.trg_seq_len,self.d_model)
        self.trg_pe = PositionalEncoding(self.trg_seq_len,self.d_model)

        self.multi_head_attention = MultiHeadAttention(d_model,num_head,dropout_rate)
        self.position_wise_feedforward = PositionWiseFeedForward(self.d_model,dropout_rate)

        self.encoder_layer = EnocderLayer(d_model,self.multi_head_attention,self.position_wise_feedforward,dropout_rate)
        self.decoder_layer = DnocderLayer(d_model,self.multi_head_attention,self.position_wise_feedforward,dropout_rate)

        self.encoder_block = EncoderBlock(self.encoder_layer,num_layer=6)
        self.decoder_block = DecoderBlock(self.decoder_layer,num_layer=6)
        self.decoder_out_gen = DecoderGenerator(d_model,self.trg_seq_len)

    def forward(self,src_token_id,target_token_id,src_mask=None,target_mask=None):
        encode_out = self.encode(src_token_id,src_mask)
        decode_out = self.decode(encode_out,target_token_id,src_mask,target_mask)
        return decode_out

    def encode(self,src_token_id,src_mask):
        embed = self.src_embedding(src_token_id)
        pe_out = self.src_pe(embed)
        encoder_out = self.encoder_block(pe_out,src_mask)
        return encoder_out

    def decode(self,enc_out,trg_token_ids,src_mask=None,tagrget_mask=None):
        embed = self.src_embedding(trg_token_ids)
        pe_out = self.src_pe(embed)
        decoder_out = self.decoder_block(enc_out,pe_out,src_mask,tagrget_mask)
        decoder_out = self.decoder_out_gen(decoder_out)
        return decoder_out


In [84]:
def get_src_mask(src_token_ids_batch,pad_tok_id):
    batch_size = src_token_ids_batch.size()[0]
    src_mask = (src_token_ids_batch!=pad_tok_id).view(batch_size, 1, 1,-1) #SIZE = (BS,1,1,S)
    return src_mask
def get_trg_mask(trg_token_ids_batch,pad_tok_id):
    batch_size = trg_token_ids_batch.size()[0]
    seq_len = trg_token_ids_batch.size()[1]
    trg_pad_mask = (trg_token_ids_batch!=pad_tok_id).view(batch_size, 1, 1,-1) #SIZE = (BS,1,1,T)
    trg_look_forward = torch.triu(torch.ones((1,1,seq_len,seq_len),dtype=torch.int16)).transpose(2,3).type(torch.bool)

    trg_mask = trg_pad_mask & trg_look_forward
    return trg_mask

In [85]:
#de_batch, en_batch
src_voab_size = max(len(vocab_de.vocab),len(vocab_en.vocab))#30000#len(vocab_de.vocab)+10
trg_vocab_size = max(len(vocab_de.vocab),len(vocab_en.vocab))#30000#len(vocab_en.vocab)+10
model = Transformers(
    src_seq_len = src_voab_size,
    trg_seq_len = trg_vocab_size,
    d_model = 512,
    num_head = 8,
    dropout_rate = 0.05
)

In [86]:
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [87]:
len(vocab_de.vocab),len(vocab_en.vocab)

(25413, 19446)

In [88]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 88,456,005 trainable parameters


In [89]:
import warnings
warnings.filterwarnings("ignore")

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# DEVICE = 'cpu'
model = model.to(DEVICE)
loss_fn = torch.nn.CrossEntropyLoss()#ignore_index=PAD_IDX

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
def train(model,optimizer,train_iter):#src_batch,trg_batch
    epoch_loss = 0
    model.train()
    for ind, (src_batch,trg_batch) in enumerate(train_iter):
        src_batch = src_batch.T
        trg_batch = trg_batch.T
        trg_token_ids_batch_input = trg_batch[:,:-1]

        src_mask = get_src_mask(src_batch,PAD_IDX)
        trg_mask = get_trg_mask(trg_token_ids_batch_input,PAD_IDX)
        try:
            optimizer.zero_grad()
            output = model(src_batch.to(DEVICE), trg_token_ids_batch_input.to(DEVICE),src_mask.to(DEVICE),trg_mask.to(DEVICE))

            output_reshape = output.contiguous().view(-1, output.shape[-1])
            trg = trg_batch[:, 1:].contiguous().view(-1)

            loss = loss_fn(output_reshape, trg.to(DEVICE))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

            optimizer.step()
            epoch_loss += loss.item()
            if ind%100==0:
                print('step :', round((ind / len(train_iter)) * 100, 2), '% , loss :', loss.item())
        except:
            print(src_batch.size(), trg_token_ids_batch_input.size())
    return epoch_loss / len(train_iter)

def evaluate(model,val_iter):
    model.eval()
    losses = 0
    for ind, (src_batch,trg_batch) in enumerate(val_iter):
        src_batch = src_batch.T
        trg_batch = trg_batch.T
        trg_token_ids_batch_input = trg_batch[:,:-1]

        src_mask = get_src_mask(src_batch,PAD_IDX)
        trg_mask = get_trg_mask(trg_token_ids_batch_input,PAD_IDX)
        output = model(src_batch.to(DEVICE), trg_token_ids_batch_input.to(DEVICE),src_mask.to(DEVICE),trg_mask.to(DEVICE))

        output_reshape = output.contiguous().view(-1, output.shape[-1])
        trg = trg_batch[:, 1:].contiguous().view(-1)

        loss = loss_fn(output_reshape, trg.to(DEVICE))
        losses += loss.item()
    return losses/len(val_iter)
train_loss = []
val_loss = []
for ep in range(10):
    ep_loss = train(model,optimizer,train_iter)
    v_loss = evaluate(model,val_iter)
    print('-----------------',ep_loss,'---------',v_loss,'----')

step : 0.0 % , loss : 10.137059211730957
step : 15.27 % , loss : 2.527031660079956
step : 30.53 % , loss : 2.348515033721924
step : 45.8 % , loss : 1.375995397567749
step : 61.07 % , loss : 1.2328836917877197
step : 76.34 % , loss : 1.0846625566482544
step : 91.6 % , loss : 1.056598424911499
----------------- 2.0718758845147285 --------- 1.364669770410616 ----
step : 0.0 % , loss : 1.385873794555664
step : 15.27 % , loss : 1.823289394378662
step : 30.53 % , loss : 1.1141082048416138
step : 45.8 % , loss : 1.1335809230804443
step : 61.07 % , loss : 0.9676639437675476
step : 76.34 % , loss : 0.6266230940818787
step : 91.6 % , loss : 0.8079640865325928
----------------- 0.9954308292792954 --------- 0.653301819546582 ----
step : 0.0 % , loss : 0.7268496751785278
step : 15.27 % , loss : 0.6736799478530884
step : 30.53 % , loss : 0.6073668599128723
step : 45.8 % , loss : 0.3806842863559723
step : 61.07 % , loss : 0.38324010372161865
step : 76.34 % , loss : 0.4289551079273224
step : 91.6 % , 

In [None]:
predicted_log_distributions = baseline_transformer(src_token_ids_batch, trg_token_ids_batch_input, src_mask, trg_mask)


In [90]:
model.eval()
losses = 0
for ind, (src_batch,trg_batch) in enumerate(val_iter):
    src_batch = src_batch.T
    trg_batch = trg_batch.T
    trg_token_ids_batch_input = trg_batch[:,:-1]

    src_mask = get_src_mask(src_batch,PAD_IDX)
    trg_mask = get_trg_mask(trg_token_ids_batch_input,PAD_IDX)
    output = model(src_batch.to(DEVICE), trg_token_ids_batch_input.to(DEVICE),src_mask.to(DEVICE),trg_mask.to(DEVICE))

    output_reshape = output.contiguous().view(-1, output.shape[-1])
    trg = trg_batch[:, 1:].contiguous().view(-1)

    loss = loss_fn(output_reshape, trg.to(DEVICE))
    losses += loss.item()
    break

In [91]:
trg.shape,output_reshape.shape

(torch.Size([6784]), torch.Size([6784, 25413]))

In [92]:
_,tok_id = torch.max(output_reshape,dim=1)
tok_id

tensor([95,  5, 29,  ...,  1,  1,  1], device='cuda:0')

In [93]:
tok_id[:26],trg_token_ids_batch_input[0]

(tensor([  95,    5,   29,  194,   40,   12, 2304,    4,    3,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1], device='cuda:0'),
 tensor([   2,   95,    5,   29,  194,   40,   12, 2304,    4,    3,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1]))

In [61]:
trg_token_ids_batch_input.size(),output.size(),output_reshape.size(),trg.size()

(torch.Size([128, 25]),
 torch.Size([128, 25, 25413]),
 torch.Size([3200, 25413]),
 torch.Size([3200]))

In [78]:
torch.save(model, 'transformer.pt')

In [62]:
vocab_en['questions']

1031

In [190]:
text = 'Well, perhaps we could just talk for a couple of minutes'
en_t = torch.tensor([vocab_en[v] for v in toktok_tokennizer.tokenize(text)],dtype=torch.long)
src_ten = torch.cat([torch.tensor([BOS_IDX]), en_t, torch.tensor([EOS_IDX])], dim=0).view(1, -1)
num_tokens = src_ten.shape[0]
src_mask = get_src_mask(src_ten,PAD_IDX)
encode_out = model.encode(src_ten.to(DEVICE),src_mask.to(DEVICE))

In [191]:
# UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX

trg_st = torch.ones(1,1).fill_(BOS_IDX).type(torch.long)
trg_msk = get_trg_mask(trg_st,PAD_IDX)
trg_msk
out = model.decode(encode_out.to(DEVICE),trg_st.to(DEVICE),src_mask.to(DEVICE),trg_msk.to(DEVICE))

In [195]:
out.min()

tensor(-17.6853, device='cuda:0', grad_fn=<MinBackward1>)

In [103]:
print(out.size())
_, next_word = torch.max(out, dim = 2)
next_word.item()


torch.Size([1, 1, 25512])


5

In [107]:
torch.max(torch.tensor([1,26,3]), dim = 0)

torch.return_types.max(
values=tensor(26),
indices=tensor(1))

In [96]:
vocab_de.lookup_token(5)

'है'

In [104]:
vocab_de

3

In [95]:
zz=0
def greedy_decode(model, src_tensor, max_len, start_symbol):
    src_mask = get_src_mask(src_tensor,PAD_IDX)
    encode_out = model.encode(src_tensor.to(DEVICE),src_mask.to(DEVICE))
#     print('encode_out',encode_out)
    trg_tensor = torch.ones(1,1).fill_(BOS_IDX).type(torch.long)
    global zz
    ind_=0
    for _ in range(max_len):
        trg_mask = get_trg_mask(trg_tensor,PAD_IDX)
        print('all trg',trg_mask.size(),trg_tensor.size(),trg_tensor,trg_mask)
        out = model.decode(encode_out.to(DEVICE),trg_tensor.to(DEVICE),src_mask.to(DEVICE),trg_mask.to(DEVICE))
        print('out',out.size())
        if ind_==0:
            zz = out
        ind_+=1
        _, next_word_id = torch.max(out[:,-1,:], dim = 1)
        next_word_id = next_word_id.item()
        trg_tensor = torch.cat([trg_tensor,torch.ones(1,1).fill_(next_word_id)], dim=-1).type(torch.long)
        print(trg_tensor)
    print(trg_tensor)
    return trg_tensor
        
def translation(text,model):
    model.eval()
    en_t = torch.tensor([vocab_en[v] for v in toktok_tokennizer.tokenize(text)],dtype=torch.long)
    src_tensor = torch.cat([torch.tensor([BOS_IDX]), en_t, torch.tensor([EOS_IDX]),
                           torch.tensor([PAD_IDX]),
                           torch.tensor([PAD_IDX])], dim=0).view(1, -1)
    max_len = 5
    print('src tensor',src_tensor)
    trg_tensor = greedy_decode(model, src_tensor, max_len, BOS_IDX)
    return trg_tensor
text = 'hi'.lower()
z = translation(text,model)

src tensor tensor([[  2, 330,   3,   1,   1]])
all trg torch.Size([1, 1, 1, 1]) torch.Size([1, 1]) tensor([[2]]) tensor([[[[True]]]])
out torch.Size([1, 1, 25413])
tensor([[2, 3]])
all trg torch.Size([1, 1, 2, 2]) torch.Size([1, 2]) tensor([[2, 3]]) tensor([[[[ True, False],
          [ True,  True]]]])
out torch.Size([1, 2, 25413])
tensor([[2, 3, 1]])
all trg torch.Size([1, 1, 3, 3]) torch.Size([1, 3]) tensor([[2, 3, 1]]) tensor([[[[ True, False, False],
          [ True,  True, False],
          [ True,  True, False]]]])
out torch.Size([1, 3, 25413])
tensor([[2, 3, 1, 1]])
all trg torch.Size([1, 1, 4, 4]) torch.Size([1, 4]) tensor([[2, 3, 1, 1]]) tensor([[[[ True, False, False, False],
          [ True,  True, False, False],
          [ True,  True, False, False],
          [ True,  True, False, False]]]])
out torch.Size([1, 4, 25413])
tensor([[2, 3, 1, 1, 1]])
all trg torch.Size([1, 1, 5, 5]) torch.Size([1, 5]) tensor([[2, 3, 1, 1, 1]]) tensor([[[[ True, False, False, False, False],

In [82]:
torch.argsort(zz)

tensor([[[ 859,  435, 1078,  ..., 1739, 1545,    3]]], device='cuda:0')

In [83]:
vocab_de.lookup_tokens([1545])

['देखता']

In [None]:
num_of_trg_tokens = len(target_sentences_tokens[0])
predicted_log_distributions = predicted_log_distributions[num_of_trg_tokens-1::num_of_trg_tokens]

In [143]:
p = torch.rand([1, 1, 25512])
p

tensor([[[0.0123, 0.9426, 0.9559,  ..., 0.7820, 0.7218, 0.6256]]])

In [145]:
torch.max(p[:,-1,:],dim=1)

torch.return_types.max(
values=tensor([0.9998]),
indices=tensor([8649]))

In [120]:
torch.rand([1, 2, 25512])[1-1::num_of_trg_tokens]
 torch.tensor([[trg_field_processor.vocab.stoi[tokens[0]]] for tokens in target_sentences_tokens], device=device)

tensor([[[0.4540, 0.3935, 0.0110,  ..., 0.8049, 0.8889, 0.5214],
         [0.7958, 0.9438, 0.6087,  ..., 0.8676, 0.5913, 0.4120]]])

In [117]:
out.shape

torch.Size([1, 1, 25512])

In [None]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")


In [None]:
src_sentence = 'उस लड़की की ऑक्सीजन मास्क उतारो।'
src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)


In [None]:
vocab_de['उस']

In [105]:
model

Transformers(
  (src_embedding): Embedding(
    (embed_layer): Embedding(25512, 512)
  )
  (src_pe): PositionalEncoding()
  (trg_embedding): Embedding(
    (embed_layer): Embedding(25512, 512)
  )
  (trg_pe): PositionalEncoding()
  (multi_head_attention): MultiHeadAttention(
    (dropout): Dropout(p=0.2, inplace=False)
    (softmax_layer): Softmax(dim=-1)
    (w_key): Linear(in_features=512, out_features=512, bias=True)
    (w_query): Linear(in_features=512, out_features=512, bias=True)
    (w_value): Linear(in_features=512, out_features=512, bias=True)
    (output_project): Linear(in_features=512, out_features=512, bias=True)
  )
  (position_wise_feedforward): PositionWiseFeedForward(
    (dropout): Dropout(p=0.2, inplace=False)
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (relu): ReLU()
  )
  (encoder_layer): EnocderLayer(
    (multi_head_arttention_layer): MultiHeadAttention(
      (drop