In [None]:
import torch 
import torch.nn as nn 
class BilstmEncoder(nn.Module):   #nn is class in pytorch  #use `PascalCase` for class names in Python.
  #class for bidirectional lstm encoder.
  def __init__(self,vocab_size,embedding_dim,hidden_dim,num_layers,dropout,output_dim):      # ??constructor to initialize the parameters
    super(BilstmEncoder,self).__init__()
    self.embedding = nn.Embedding(vocab_size,embedding_dim)
    self.lstm      = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers,
                            batch_first=True,#!by default, the input shape of lstm is sequence_length,batch_size,embedding_dim, but we want batch_size first as input shape is batch_size,sequence_length
                            bidirectional = True,
                            dropout       = dropout)
    #Pytorch is case sensetive. All module names uses upper case letter.
    #like nn.LSTM,nn.RNN
    #input dimension= embedding_dim
    #both ht and ct are of same size(num_layers*num_directions,batch_size,hidden_dim) 
    #number of direction is 2 for bidirectional lstm 
  
    #number of layers=num_layers is the number of stacked LSTM layers (vertical depth).
    #make sure layers are capitalized while calling, we will not use small letter like nn.LSTM or nn.Linear not nn.linear 
    self.dropout=nn.Dropout(dropout)
    self.fc=nn.Linear(hidden_dim *2,output_dim)
  def forward(self,x): # !size of x is (batch_Size,sequence_length)
    #forward pass where actual calculation/computation happens.
    embedded=self.embedding(x)                               #  self.lstm(embedded) returns a tuple: (output, (h_n, c_n))
    output,(h_n,c_n)=self.lstm(embedded)
    hidden=torch.cat((h_n[-2],h_n[-1]),dim=1) #hidden after concatentaion : (batch_size,hidden_dim *2)
    
    '''  for sequence labeling, we use output as input for next layer
         for sequence classification, we use the concatenated hidden state as input for next layer
         
         1.h_n[-2] → last forward layer

         2.h_n[-1] → last backward layer

         '''
    dropout = self.dropout(hidden) #after dropout, (batch_size,hidden_dim *2)
    out     = self.fc(dropout) #after linear layer, (batch_size,output_dim)
    return out ,hidden                                                                     
    
    


**Summary of Tensor Shapes in the Model:**

- **Input x:** Shape is (batch_size, sequence_length)
- **After Embedding Layer:** Shape is (batch_size, sequence_length, embedding_dim)
- **After LSTM Layer:**
  - Output: Shape is (batch_size, sequence_length, hidden_dim * 2) (since it’s bidirectional)
  - h_n: Shape is (num_layers * 2, batch_size, hidden_dim)
- **After Concatenating Last Layer’s Forward and Backward h_n:** Shape is (batch_size, hidden_dim * 2)
- **After Dropout:** Shape remains (batch_size, hidden_dim * 2)
- **After Linear Layer:** Shape is (batch_size, output_dim)

---




In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdditiveAttention(nn.Module):
    """
    Implements Bahdanau (additive) attention.
    """
    def __init__(self, encoder_hidden_dim, decoder_hidden_dim, attention_dim):
        super(AdditiveAttention, self).__init__()
        self.encoder_attn = nn.Linear(encoder_hidden_dim, attention_dim) #encoder_attn: shape(encoder_hidden_dim,)
        self.decoder_attn = nn.Linear(decoder_hidden_dim, attention_dim)
        self.v = nn.Parameter(torch.rand(attention_dim))  #self.v: shape (attn_dim,) or (attn_dim, 1)

    def forward(self, encoder_outputs, decoder_hidden):
        # encoder_outputs: (batch, src_len, encoder_hidden_dim)
        # decoder_hidden: (batch, decoder_hidden_dim)
        batch_size, src_len, _ = encoder_outputs.size()
        #get the batch size and source length from encoder outputs
        #- means we ignore the third dimension
        #batch_size is the number of samples in one batch (say 64 sentences)
        #src_len is the length of the source sequence (say 50 words)
        
        # Repeat decoder hidden state src_len times
        decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
        #repeat the decoder hidden state src_len times to match the source length
        #unsqueeze(1) adds a new dimension at index 1, making it  batch_size, 1, decoder_hidden_dim
        
        # Calculate energy
        energy = torch.tanh(self.encoder_attn(encoder_outputs) + self.decoder_attn(decoder_hidden)) # (batch, src_len, attn_dim)
        #!(batch, src_len, attn_dim) @ (attn_dim,) = batch,src_len 
        energy = energy @ self.v    # (batch, src_len)
        
        attn_weights = F.softmax(energy, dim=1) # (batch, src_len)
        
        # Compute context vector
        context              = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1) # (batch, encoder_hidden_dim)
        #dimension of attn_weights      = batch, src_len,
        #          #dimension of encoder_output = batch, src_len, encoder_hidden_dim
        context=F.softmax(context,dim=-1)
        return context, attn_weights

In [5]:
import torch
import torch.nn as nn
class KvAttention(nn.Module): 
  def __init__(self,encoder_hidden_dim,decoder_hidden_dim,attention_dim,):
    super(KvAttention,self).__init__()
    self.encoder_attn = nn.Linear(encoder_hidden_dim,attention_dim) #linear layer to make similar dimension of encoder hidden size and decoder hidden size
    
    self.decoder_attn = nn.Linear(decoder_hidden_dim,attention_dim)
    self.v            = nn.Parameter(torch.rand(attention_dim))
  def forward(self,encoder_outputs,decoder_hidden): 
      #encoder_outputs                            : (``batch,src_len,encoder_hidden_dim``)
      #decoder_hidden                             : shape(`batch_size ,decoder_hidden_dim`)
    batch_size,src_len, _ = encoder_outputs.size() #get the batch size and source length from encoder outputs
    #- means we ignore the third dimension
    #batch_size is the number of samples in one batch (say 64 sentences)
    #src_len is the length of the source sequence (say 50 words)
    decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1,src_len,1).repeat((1,src_len,1))
    #repeat the decoder hidden state src_len times to match the source length
    #unsqueeze(1) adds a new dimension at index 1, making it  batch_size, 1, decoder_hidden_dim
    
    
    
    
    
    
    

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DotProductAttention(nn.Module): 
    def   __init__(self, hidden_dim)    : 
        super(DotProductAttention, self).__init__()
        # Trainable skip connection (combine context and decoder_hidden)
        self.skip_fc = nn.Linear(hidden_dim * 2, hidden_dim) #this line definesa linear trainable layer to combine context vector and decoder hidden state. 

    def forward(self, encoder_outputs, decoder_hidden): 
        """
        encoder_outputs: (batch, src_len, hidden_dim)
        decoder_hidden : (batch, hidden_dim)
        """
        # Unsqueeze decoder_hidden for bmm
        decoder_hidden_exp = decoder_hidden.unsqueeze(1)  # (batch, 1, hidden_dim)

        # Compute dot product attention scores
        attn_scores = torch.bmm(decoder_hidden_exp, encoder_outputs.transpose(1, 2))  # (batch, 1, src_len)

        # Softmax over last dimension (src_lenSWW) to get attention weights
        attn_weights = F.softmax(attn_scores, dim=-1)  # (batch, 1, src_len)

        # Compute context vector as weighted sum of encoder outputs
        context = torch.bmm(attn_weights, encoder_outputs)  # (batch, 1, hidden_dim)
        context = context.squeeze(1)  # (batch, hidden_dim)

        # Optional: return context instead of output if you only need attention output
        return output, attn_weights.squeeze(1)  # (batch, hidden_dim), (batch, src_len)


: 

: 

: 

: 

In [None]:
#practice attention code

import torch
import torch.nn as nn
class KvAtten(nn.Moudule): 
  def __init__(self,):
    super(KvAtten,self).__init__()
    self.encoder_attn = nn.Linear(encoder_hidden_dim,attention_dim) #this will convert the encoder hidden dimenstion into size of attention_dim
    self.decoder_attn = nn.Linear(decoder_hidden_dim,attention_dim)  #this will convert the decoder hiddden dimension into size of attention_dim
    self.v            = nn.Parameter(torch.rand(attention_dim))    #this is a learnable parameter
  def forward(self,encoder_outputs,decoder_hidden): 
    """
    encoder_outputs: (batch,src_len,encoder_hidden_dim)
     decoder_hidden:output )batch size, decoder_hidden_dim"""
     batch_size,src_len,        _ = encoder_outputs.size()
     decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1,src_len,1)   # B * S* H 
     attention_scores=torch.tanh(self.encoder_attn(encoder_outputs) +self.decoder_attn(decoder_hidden)) 
     
     
     
    

: 

: 

: 

: 

In [None]:
import torch
import torch.nn.functional as F

# Step 0: Assume context vector from attention
# context shape: (batch, 1, hidden_dim)
context = context.squeeze(1)  # (batch, hidden_dim)

# Step 1: External memory (e.g., 1000 entries of hidden_dim size)
# You can learn this, or compute from external documents using a BERT encoder
external_memory = torch.randn(1000, hidden_dim)  # (memory_size, hidden_dim)

# Step 2: Normalize (optional but good for cosine-like similarity)
context_norm = F.normalize(context, p=2, dim=-1)            # (batch, hidden_dim)
memory_norm = F.normalize(external_memory, p=2, dim=-1)     # (memory_size, hidden_dim)

# Step 3: Compute similarity (dot product)
# Result shape: (batch, memory_size)
similarity_scores = torch.matmul(context_norm, memory_norm.T)

# Step 4: Retrieve Top-k similar entries
topk_vals, topk_indices = torch.topk(similarity_scores, k=5, dim=-1)

# Step 5 (optional): Retrieve memory entries
retrieved_memory = external_memory[topk_indices]  # shape: (batch, k, hidden_dim)


: 

: 

: 

: 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LuongAttentionLSTMDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, num_layers=1, dropout=0.1):
        super(LuongAttentionLSTMDecoder, self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)

        # Luong attention
        self.attn = nn.Linear(hidden_dim, hidden_dim)

        # Combine attention context with LSTM output
        self.concat = nn.Linear(hidden_dim * 2, hidden_dim)

        # Final output layer
        self.fc_out = nn.Linear(hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden, cell, encoder_outputs):
        """
        input_token: [batch_size]
        hidden: [num_layers, batch_size, hidden_dim]
        cell: [num_layers, batch_size, hidden_dim]
        encoder_outputs: [batch_size, src_len, hidden_dim]
        """

        batch_size = input_token.size(0)

        # Embed input token
        embedded = self.dropout(self.embedding(input_token))  # [batch_size, emb_dim]
        embedded = embedded.unsqueeze(1)  # [batch_size, 1, emb_dim]

        # LSTM forward
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))  # output: [B, 1, H]

        # Compute attention weights
        attn_energy = torch.bmm(self.attn(output), encoder_outputs.transpose(1, 2))  # [B, 1, src_len]
        attn_weights = F.softmax(attn_energy, dim=-1)  # [B, 1, src_len]

        # Compute context vector
        context = torch.bmm(attn_weights, encoder_outputs)  # [B, 1, H]

        # Combine LSTM output and context vector
        rnn_context = torch.cat((output, context), dim=2)  # [B, 1, 2H]
        concat_output = torch.tanh(self.concat(rnn_context))  # [B, 1, H]

        # Predict output token
        prediction = self.fc_out(concat_output.squeeze(1))  # [B, output_dim]

        return prediction, hidden, cell, attn_weights
#decoder code

: 

: 

: 

: 

In [26]:
# assignment 1
class AttentionKv(nn.Module): 
  def   __init__(self,encoder_hidden_dim,decoder_hidden_dim,attention_dim): 
    super(AttentionKv,self).__init__()
    self.encoder_attn = nn.Linear(encoder_hidden_dim,attention_dim)
    self.decoder_attn = nn.Linear(decoder_hidden_dim,attention_dim)
    self.v            = nn.Parameter(torch.rand(attention_dim))
  def forward(self,encoder_outputs,decoder_hidden): 
    batch_size,src_len, _ = encoder_outputs.size()
    decoder_hidden=decoder_hidden.unsqueeze(1).repeat(1,src_len,1)
    attention_scores=torch.tanh(self.encoder_attn(encoder_outputs)+self.decoder_attn(decoder_hidden))
    attention_scores=attention_scores @self.v
    attn_weights=F.softmax(attention_scores,dim=1)
    return  attn_weights  
  

In [27]:
batch_size         = 2
src_len            = 3
encoder_hidden_dim = 4
decoder_hidden_dim = 4
attention_dim      = 2

In [30]:
#define size
batch_size               = 3
encoder_hidden_dim = 60
decoder_hidden_dim = 60
attention_dim      = 30
src_len=100

In [35]:
#sample data
encoder_outputs = torch.randn(batch_size,src_len,encoder_hidden_dim)
decoder_hidden  = torch.randn(batch_size,decoder_hidden_dim)
encoder_outputs
decoder_hidden

tensor([[-0.1982, -0.5656, -0.0344, -0.4023, -1.0217,  0.6845, -1.0943, -0.1195,
          0.2003,  0.6870, -0.3685, -0.2558,  1.8566, -1.0952,  1.5763,  1.2893,
         -0.7791,  0.8882, -1.9549, -0.3953,  0.4467, -0.2103, -0.3323, -0.8470,
         -1.5153, -0.0453,  1.5179,  0.9221, -0.1601, -1.2935,  1.5567, -0.1089,
         -0.5952, -1.1361,  0.6588,  0.6821, -1.0666, -0.8660, -0.9400, -1.2721,
         -1.4257, -1.0858,  0.2832, -0.0877,  0.5734, -0.0793,  0.0876, -0.4869,
         -1.3830,  0.2053,  0.9705, -0.2412,  0.1817,  0.8301, -1.7060, -0.5976,
         -0.1194, -1.3846, -0.3416,  0.1388],
        [-1.9080, -1.7609,  0.8781, -0.7118,  1.1466,  2.2397,  0.6078, -1.3261,
         -0.7604,  0.1885, -0.9655,  1.2914,  0.7544, -0.2592,  2.5983, -0.8389,
          0.4845,  0.3244,  1.2643,  0.6301, -0.3412,  0.3711,  0.2503, -0.9494,
         -0.6602,  1.0392, -0.2214,  0.5862,  0.8444,  0.9143, -0.6322,  0.0389,
          1.1701, -0.6585,  1.4844, -0.2719,  0.0957, -0.7449, 

In [36]:
model = AttentionKv(encoder_hidden_dim,decoder_hidden_dim,attention_dim)
model

AttentionKv(
  (encoder_attn): Linear(in_features=60, out_features=30, bias=True)
  (decoder_attn): Linear(in_features=60, out_features=30, bias=True)
)

In [37]:
attn_weights = model(encoder_outputs,decoder_hidden)
attn_weights 

tensor([[0.0023, 0.0051, 0.0007, 0.0009, 0.0070, 0.0055, 0.0025, 0.0032, 0.0009,
         0.0010, 0.0053, 0.0064, 0.0015, 0.0081, 0.0011, 0.0350, 0.0134, 0.0128,
         0.0056, 0.0014, 0.0002, 0.0038, 0.0065, 0.0211, 0.0027, 0.0023, 0.0351,
         0.0149, 0.0019, 0.0076, 0.0012, 0.0185, 0.0021, 0.0080, 0.0056, 0.0436,
         0.0101, 0.0002, 0.0420, 0.0069, 0.0031, 0.0044, 0.0081, 0.0144, 0.0016,
         0.0377, 0.0046, 0.0081, 0.0005, 0.0015, 0.0022, 0.0093, 0.0075, 0.0196,
         0.0004, 0.0080, 0.0136, 0.0080, 0.0002, 0.0010, 0.0031, 0.0027, 0.0056,
         0.0007, 0.0422, 0.0060, 0.0372, 0.0054, 0.0022, 0.0960, 0.0045, 0.0036,
         0.0008, 0.0067, 0.0272, 0.0107, 0.0079, 0.0012, 0.0007, 0.0011, 0.0779,
         0.0011, 0.0725, 0.0065, 0.0029, 0.0038, 0.0112, 0.0031, 0.0063, 0.0045,
         0.0029, 0.0011, 0.0021, 0.0046, 0.0043, 0.0017, 0.0097, 0.0022, 0.0035,
         0.0050],
        [0.0038, 0.0907, 0.0198, 0.0041, 0.0003, 0.0002, 0.0023, 0.0056, 0.0347,
         0

In [None]:
#Assignment 2
