In [2]:

import torch
import os
import torch.nn as nn

In [None]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over a padded batch of word-index sequences.

    Args:
        hidden_size: Number of features in the GRU hidden state. The GRU's
            input size is set to the same value, so ``embedding`` must
            produce vectors with ``hidden_size`` features.
        embedding: Module that maps word indexes to embedding vectors
            (e.g. ``nn.Embedding(vocab_size, hidden_size)``).
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied between GRU layers. It is
            ignored (forced to 0) when ``n_layers == 1`` because inter-layer
            dropout only applies when there is more than one layer.
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        # Initialize GRU; the input_size and hidden_size params are both set
        # to 'hidden_size' because our input is a word embedding whose number
        # of features == hidden_size.
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            n_layers,
            dropout=(0 if n_layers == 1 else dropout),
            bidirectional=True,
        )

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch of sentences.

        Args:
            input_seq: Batch of input sentences as word indexes,
                shape ``(max_length, batch_size)``.
            input_lengths: Sentence length for each sentence in the batch
                (list or CPU tensor). Because the default
                ``enforce_sorted=True`` is used when packing, the batch must
                be sorted by decreasing length.
            hidden: Optional initial hidden state of shape
                ``(n_layers * num_directions, batch_size, hidden_size)``.

        Returns:
            A tuple ``(outputs, hidden)``:

            * ``outputs``: output features h_t from the last GRU layer for
              each timestep, with the forward and backward directions summed;
              shape ``(max_length, batch_size, hidden_size)``.
            * ``hidden``: hidden state for the last timestep, shape
              ``(n_layers * num_directions, batch_size, hidden_size)``.
        """
        # Convert word indexes to embeddings.
        embedded = self.embedding(input_seq)
        # Pack the padded batch of sequences for the RNN module.
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through the GRU.
        outputs, hidden = self.gru(packed, hidden)
        # Unpack back to a zero-padded tensor.
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs: the first hidden_size features are
        # the forward direction, the remaining ones the backward direction.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return output and final hidden state.
        return outputs, hidden
        
        

In [None]:
### Luong attention layer
class Attn(nn.Module):
    """Luong-style attention layer using dot-product scoring.

    Args:
        method: Name of the scoring method. It is stored on the instance, but
            only dot-product scoring is implemented; ``forward`` always uses
            ``dot_score`` regardless of this value.
        hidden_size: Number of features in the decoder hidden state and in
            the encoder outputs.
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size

    def dot_score(self, hidden, encoder_output):
        """Return the dot product of ``hidden`` with every encoder output.

        Element-wise multiplies the current decoder state with the encoder
        outputs (broadcasting over the first dimension) and sums over the
        feature dimension (dim 2).
        """
        return torch.sum(hidden * encoder_output, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights over the encoder timesteps.

        Args:
            hidden: Current decoder state, shape ``(1, batch_size, hidden_size)``.
            encoder_outputs: Encoder outputs, shape
                ``(max_length, batch_size, hidden_size)``.

        Returns:
            Softmax-normalized attention weights of shape
            ``(batch_size, 1, max_length)``; each row sums to 1 over
            ``max_length``.
        """
        # (1, batch_size, hidden_size) * (max_length, batch_size, hidden_size)
        # broadcasts to (max_length, batch_size, hidden_size); summing over
        # the feature dim gives energies of shape (max_length, batch_size).
        attn_energies = self.dot_score(hidden, encoder_outputs)
        # Transpose max_length and batch_size dimensions so that the softmax
        # below normalizes over timesteps: (batch_size, max_length).
        attn_energies = attn_energies.t()
        # Return the softmax-normalized probability scores with an added
        # middle dimension: (batch_size, 1, max_length).
        return torch.softmax(attn_energies, dim=1).unsqueeze(1)
    
    
    