In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
class LuongAttention(nn.Module):
    """Luong-style attention with a learned projection of the encoder states.

    Each encoder output is passed through a linear layer that maps it into the
    decoder's hidden size; the score for a source position is the dot product
    of that projected vector with the decoder hidden state.

    NOTE: a later cell of this notebook declares another class that is also
    called ``LuongAttention`` (with a different constructor and argument
    order). Once that cell has run, the name refers to the later class.

    Args:
        encoder_dim: feature size of every encoder output vector.
        decoder_dim: feature size of the decoder hidden state.
    """

    def __init__(self, encoder_dim, decoder_dim):
        super().__init__()
        # Projects encoder features into the decoder's hidden space.
        self.linear_encoder = nn.Linear(encoder_dim, decoder_dim)

    def forward(self, encoder_outputs, decoder_hidden):
        """Attend over the encoder outputs for one decoder state.

        Args:
            encoder_outputs: tensor of shape [batch, seq_len, enc_dim].
            decoder_hidden: tensor of shape [batch, dec_dim].

        Returns:
            A ``(context_vector, attn_weights)`` pair where ``context_vector``
            is [batch, enc_dim] and ``attn_weights`` is [batch, seq_len] and
            sums to 1 along ``seq_len``.
        """
        keys = self.linear_encoder(encoder_outputs)            # [B, T, dec_dim]
        query = decoder_hidden.unsqueeze(2)                    # [B, dec_dim, 1]

        # One scalar score per source position.
        scores = torch.bmm(keys, query).squeeze(2)             # [B, T]
        attn_weights = F.softmax(scores, dim=1)                # [B, T]

        # Weighted sum of the *unprojected* encoder outputs.
        weights_row = attn_weights.unsqueeze(1)                # [B, 1, T]
        context_vector = torch.bmm(weights_row, encoder_outputs).squeeze(1)  # [B, enc_dim]

        return context_vector, attn_weights

1. Encoder

In [3]:
class Encoder(nn.Module):
    """Embeds source token ids and runs them through a batch-first LSTM.

    Args:
        input_dim: size of the source vocabulary (number of embedding rows).
        emb_dim: embedding size.
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: integer tensor of token ids, shape [batch_size, src_len].

        Returns:
            ``(outputs, hidden, cell)``: ``outputs`` is
            [batch_size, src_len, hidden_dim]; ``hidden`` and ``cell`` are the
            final LSTM states as returned by ``nn.LSTM``.
        """
        token_vectors = self.embedding(src)              # [batch_size, src_len, emb_dim]
        outputs, final_state = self.rnn(token_vectors)   # outputs: [batch_size, src_len, hidden_dim]
        hidden, cell = final_state
        return outputs, hidden, cell

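2. Luong Attention (dot-product scoring). Note: this cell redefines `LuongAttention`, replacing the projection-based version from the first cell; the model built later in this notebook uses this definition.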

In [4]:
class LuongAttention(nn.Module):
    """Luong dot-product attention over the encoder outputs.

    The score of each source position is the plain dot product between the
    decoder output and the encoder output at that position; encoder and
    decoder must therefore share the same hidden size.

    Args:
        hidden_dim: hidden size shared by the encoder and decoder.
    """

    def __init__(self, hidden_dim):
        super().__init__()

        # Not used by the dot-product score in forward(). Kept so that the
        # module's parameters / state_dict keys (attn.weight, attn.bias) stay
        # the same as before.
        self.attn = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, decoder_hidden, encoder_outputs):
        """Compute the attention context for the current decoder output.

        Args:
            decoder_hidden: tensor of shape [batch_size, 1, hidden_dim].
            encoder_outputs: tensor of shape [batch_size, src_len, hidden_dim].

        Returns:
            ``(context, attn_weights)``: ``context`` is
            [batch_size, 1, hidden_dim]; ``attn_weights`` is
            [batch_size, 1, src_len] and sums to 1 along ``src_len``.
        """
        # scores: [batch_size, 1, src_len]
        score = torch.bmm(decoder_hidden, encoder_outputs.transpose(1, 2))

        # Normalise across source positions (the last axis). Axis 1 is the
        # singleton query axis: a softmax over it makes every weight exactly
        # 1.0, which turns the context into an unweighted sum.
        attn_weights = F.softmax(score, dim=-1)

        context = torch.bmm(attn_weights, encoder_outputs)  # [batch_size, 1, hidden_dim]

        return context, attn_weights

3. Decoder with Luong Attention

In [5]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: embedding size.
        hidden_dim: LSTM hidden size.
        attention: module called as ``attention(rnn_output, encoder_outputs)``
            and returning ``(context, attn_weights)``.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hidden_dim, attention, num_layers=1):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.attention = attention
        # The classifier sees the LSTM output concatenated with the context.
        self.fc_out = nn.Linear(in_features=hidden_dim * 2, out_features=output_dim)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Decode one target time step.

        Args:
            input: token ids for the current step, shape [batch_size].
            hidden: previous LSTM hidden state.
            cell: previous LSTM cell state.
            encoder_outputs: [batch_size, src_len, hidden_dim].

        Returns:
            ``(prediction, hidden, cell, attn_weights)`` where ``prediction``
            holds unnormalised vocabulary scores of shape
            [batch_size, output_dim], ``hidden``/``cell`` are the updated LSTM
            states and ``attn_weights`` is whatever the attention module
            returned as its second value.
        """
        step_tokens = input.unsqueeze(1)                 # [batch_size, 1]
        step_embedded = self.embedding(step_tokens)      # [batch_size, 1, emb_dim]

        rnn_out, next_state = self.rnn(step_embedded, (hidden, cell))  # rnn_out: [batch, 1, hidden]
        hidden, cell = next_state

        context, attn_weights = self.attention(rnn_out, encoder_outputs)  # context: [batch, 1, hidden]

        features = torch.cat((rnn_out, context), dim=2)  # [batch, 1, hidden*2]
        prediction = self.fc_out(features.squeeze(1))    # [batch, output_dim]

        return prediction, hidden, cell, attn_weights

4. Seq2Seq Model

In [6]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: module called as ``encoder(src)`` and returning
            ``(encoder_outputs, hidden, cell)``.
        decoder: module called as
            ``decoder(input, hidden, cell, encoder_outputs)`` and returning
            ``(prediction, hidden, cell, attn_weights)``. It must expose
            ``fc_out.out_features`` (the target vocabulary size).
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Run the full target sequence through the decoder.

        Args:
            src: source token ids, [batch_size, src_len].
            trg: target token ids, [batch_size, trg_len]; column 0 is used as
                the first decoder input (<sos>).
            teacher_forcing_ratio: a fresh uniform draw from ``torch.rand`` is
                compared against this value at every step; when the draw is
                smaller, the ground-truth token is fed next, otherwise the
                decoder's own argmax prediction is.

        Returns:
            Tensor [batch_size, trg_len, trg_vocab_size] of per-step scores.
            Position 0 is never written and stays all zeros.
        """
        n_rows, n_steps = trg.size(0), trg.size(1)
        vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(n_rows, n_steps, vocab_size, device=src.device)

        encoder_outputs, hidden, cell = self.encoder(src)

        # The first input to the decoder is <sos>.
        decoder_input = trg[:, 0]

        for step in range(1, n_steps):
            step_scores, hidden, cell, _ = self.decoder(
                decoder_input, hidden, cell, encoder_outputs
            )
            outputs[:, step] = step_scores

            greedy_tokens = step_scores.argmax(1)

            # One random draw per step decides between ground truth and
            # the model's own prediction as the next input.
            if torch.rand(1).item() < teacher_forcing_ratio:
                decoder_input = trg[:, step]
            else:
                decoder_input = greedy_tokens

        return outputs

1. Toy Dataset (for sequence-to-sequence translation)

Input  (English):     ['i', 'like', 'deep', 'learning']
Target (French-like): ['j', 'aime', 'apprendre', 'profond']


2. Full Training Code with Sample Dataset

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Toy vocabularies: three special tokens first, then the words of each language.
SRC_vocab = ['<pad>', '<sos>', '<eos>', 'i', 'like', 'deep', 'learning']
TRG_vocab = ['<pad>', '<sos>', '<eos>', 'j', 'aime', 'apprendre', 'profond']

# Word -> index and index -> word lookups (an index is a position in the vocab list).
SRC_word2idx = dict(zip(SRC_vocab, range(len(SRC_vocab))))
TRG_word2idx = dict(zip(TRG_vocab, range(len(TRG_vocab))))
SRC_idx2word = dict(enumerate(SRC_vocab))
TRG_idx2word = dict(enumerate(TRG_vocab))

# A single tokenized sentence pair; only the target carries <sos>/<eos>.
src_sentences = [['i', 'like', 'deep', 'learning']]
trg_sentences = [['<sos>', 'j', 'aime', 'apprendre', 'profond', '<eos>']]


def encode(sent, word2idx):
    """Map every word of ``sent`` to its index in ``word2idx``.

    Args:
        sent: iterable of tokens.
        word2idx: mapping from token to integer index.

    Returns:
        List of indices, in the order of ``sent``.

    Raises:
        KeyError: if a token is missing from ``word2idx``.
    """
    indices = []
    for word in sent:
        indices.append(word2idx[word])
    return indices


# Integer tensors of shape [num_sentences, sentence_len].
src_ids = [encode(sent, SRC_word2idx) for sent in src_sentences]
trg_ids = [encode(sent, TRG_word2idx) for sent in trg_sentences]
src_tensor = torch.tensor(src_ids)
trg_tensor = torch.tensor(trg_ids)


 3. Model Setup and Training Loop

# Model parameters
INPUT_DIM = len(SRC_vocab)    # source vocabulary size
OUTPUT_DIM = len(TRG_vocab)   # target vocabulary size
EMB_DIM = 16
HIDDEN_DIM = 32
NUM_LAYERS = 1
SEED = 0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed the global RNG before any module is created so that weight
# initialisation and the teacher-forcing draws made during training are the
# same on every Restart & Run All.
torch.manual_seed(SEED)

# Build the model. Encoder and decoder share HIDDEN_DIM because the attention
# takes a dot product between decoder and encoder outputs.
attn = LuongAttention(HIDDEN_DIM)
enc = Encoder(INPUT_DIM, EMB_DIM, HIDDEN_DIM, NUM_LAYERS)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HIDDEN_DIM, attn, NUM_LAYERS)
model = Seq2Seq(enc, dec).to(device)

# Optimizer and loss; <pad> targets do not contribute to the loss.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=TRG_word2idx['<pad>'])

# Move tensors to the same device as the model
src_tensor, trg_tensor = src_tensor.to(device), trg_tensor.to(device)


In [None]:
# Training loop: full-batch gradient steps on the single toy sentence pair.
EPOCHS = 300
for epoch in range(1, EPOCHS + 1):
    model.train()
    optimizer.zero_grad()

    logits = model(src_tensor, trg_tensor)  # [batch, trg_len, output_dim]
    output_dim = logits.shape[-1]

    # Position 0 is dropped from both predictions and targets, then batch and
    # time are flattened together as the loss expects [N, classes] vs [N].
    flat_logits = logits[:, 1:].reshape(-1, output_dim)
    flat_targets = trg_tensor[:, 1:].reshape(-1)

    loss = criterion(flat_logits, flat_targets)
    loss.backward()
    optimizer.step()

    # Report every 50th epoch.
    if not epoch % 50:
        print(f"Epoch {epoch}/{EPOCHS}, Loss: {loss.item():.4f}")


 4. Inference Function

In [None]:
def translate_sentence(model, sentence, max_len=10):
    """Greedily decode a translation for one tokenized source sentence.

    Relies on the notebook-level ``SRC_word2idx``, ``TRG_word2idx``,
    ``TRG_idx2word`` and ``device``.

    Args:
        model: object exposing ``encoder`` and ``decoder`` with the call
            signatures used below; it is switched to eval mode and left there.
        sentence: list of source tokens.
        max_len: maximum number of decoding steps (default 10, the previously
            hard-coded limit).

    Returns:
        List of target tokens, without <sos> and without the terminating
        <eos>. Empty when ``sentence`` is empty.

    Raises:
        KeyError: if a source token is not in ``SRC_word2idx``.
    """
    model.eval()

    # An empty sentence would produce a float tensor of shape [1, 0] that the
    # embedding cannot index; there is nothing to translate anyway.
    if not sentence:
        return []

    sos_id = TRG_word2idx['<sos>']
    eos_id = TRG_word2idx['<eos>']
    translated = []

    with torch.no_grad():
        input_ids = torch.tensor(
            [[SRC_word2idx[word] for word in sentence]], device=device
        )  # [1, src_len]
        encoder_outputs, hidden, cell = model.encoder(input_ids)

        input_token = torch.tensor([sos_id], device=device)  # [1]

        for _ in range(max_len):
            output, hidden, cell, _ = model.decoder(input_token, hidden, cell, encoder_outputs)

            # The argmax tensor is fed back directly as the next input, so no
            # new tensor has to be built from a Python int at every step.
            input_token = output.argmax(1)
            token_id = input_token.item()
            if token_id == eos_id:
                break
            translated.append(TRG_idx2word[token_id])

    return translated


In [None]:
# Smoke test: translate the same sentence the model was trained on.
test_sentence = ['i', 'like', 'deep', 'learning']
print("Input:", test_sentence)
translation = translate_sentence(model, test_sentence)
print("Output:", translation)
