In [1]:
import numpy as np
import pandas as pd
# Record the library versions this notebook was executed with.
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")

NumPy version: 1.23.5
Pandas version: 2.3.1


In [2]:
# Scratch cell: prints a fixed string (presumably a quick check that the kernel executes cells).
print("hellow")


hellow


In [3]:
import torch

In [4]:
# Bare expression so the notebook displays whether PyTorch can use a CUDA device.
torch.cuda.is_available()

False

In [5]:
# IPython shell escape: run the `nvidia-smi` command-line tool and show its output in the cell.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [6]:
import torch

# Summarise GPU visibility: either the "not available" notice, or the device
# count followed by the name of device 0.
if not torch.cuda.is_available():
    print("CUDA is not available. No GPU detected or accessible.")
else:
    print(f"CUDA is available. GPU count: {torch.cuda.device_count()}")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")



CUDA is not available. No GPU detected or accessible.


In [7]:
import torch
import torch.nn as nn
from typing import Tuple

class BiLSTMEncoder(nn.Module):  # nn.Module is PyTorch's base class for all neural-network modules (class names use PascalCase).
    """Bidirectional LSTM encoder that summarises a token sequence.

    Token IDs are embedded and fed through a (possibly stacked) bidirectional
    LSTM. The final forward and backward hidden states of the last layer are
    concatenated, passed through dropout, and projected to ``output_dim`` by a
    linear layer.

    Note:
        ``forward`` returns the projected summary, not the per-time-step LSTM
        outputs. A consumer that needs a tensor of shape
        ``(batch_size, sequence_length, hidden_dim * 2)`` (e.g. an attention
        mechanism over source positions) cannot use the first return value.

    Args:
        vocab_size (int): Size of the input vocabulary.
        embedding_dim (int): Dimension of token embeddings.
        hidden_dim (int): Dimension of LSTM hidden states per direction.
        num_layers (int): Number of stacked LSTM layers.
        dropout (float): Dropout probability. Used between LSTM layers (only
            when ``num_layers > 1``) and before the final linear projection.
        output_dim (int): Dimension of the projected output
            (e.g., number of classes or a target vocabulary size).

    Attributes:
        embedding (nn.Embedding): Token embedding layer.
        lstm (nn.LSTM): Bidirectional LSTM layer (``batch_first=True``).
        dropout (nn.Dropout): Dropout layer applied before ``fc``.
        fc (nn.Linear): Projects the concatenated final hidden states
            (``hidden_dim * 2``) to ``output_dim``.
    """
    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int,
                 num_layers: int, dropout: float, output_dim: int):
        super().__init__()
        # Keep the hyper-parameters around for validation and introspection.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.output_dim = output_dim
        self.dropout_rate = dropout

        # Initialize layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            # Inputs are (batch_size, sequence_length, embedding_dim); without
            # batch_first=True nn.LSTM expects the sequence dimension first.
            batch_first=True,
            bidirectional=True,
            # nn.LSTM applies dropout only between stacked layers, so it is
            # disabled for a single layer.
            dropout=dropout if num_layers > 1 else 0.0
        )
        # Both h_n and c_n have shape
        # (num_layers * num_directions, batch_size, hidden_dim), and
        # num_directions is 2 for a bidirectional LSTM.
        # PyTorch module names are case-sensitive: nn.LSTM, nn.Linear (not nn.linear).
        self.dropout = nn.Dropout(dropout)
        # Forward and backward states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        # Initialize weights for stability: Xavier-uniform for every weight
        # matrix, zeros for every bias.
        nn.init.xavier_uniform_(self.embedding.weight)
        for name, param in self.lstm.named_parameters():
            # named_parameters() yields the weight and bias tensors of every
            # layer and direction of the LSTM.
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.zeros_(param)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Encodes a batch of token sequences.

        Args:
            x (torch.Tensor): Input token IDs, shape (batch_size, sequence_length).

        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
                - out: Projection of the last layer's concatenated final
                  forward/backward hidden states, shape (batch_size, output_dim).
                - hidden: Final hidden states, shape (num_layers * 2, batch_size, hidden_dim).
                - cell: Final cell states, shape (num_layers * 2, batch_size, hidden_dim).

        Raises:
            ValueError: If ``x`` is not 2-D or contains IDs outside
                ``[0, vocab_size)``.
        """
        # Validate input
        if x.dim() != 2:
            raise ValueError(f"Expected input shape (batch_size, sequence_length), got {x.shape}")
        if not torch.all(x >= 0) or not torch.all(x < self.vocab_size):
            raise ValueError(f"Input token IDs must be in [0, {self.vocab_size}), got min {x.min()}, max {x.max()}")

        # Embed input: (batch_size, sequence_length) -> (batch_size, sequence_length, embedding_dim)
        embedded = self.embedding(x)

        # Run the LSTM. The per-time-step outputs
        # (batch_size, sequence_length, hidden_dim * 2) are not used below, so
        # only the final states (num_layers * 2, batch_size, hidden_dim) are kept.
        _, (hidden, cell) = self.lstm(embedded)

        # hidden[-2] / hidden[-1] are the last layer's forward / backward final
        # states; concatenating them gives (batch_size, hidden_dim * 2).
        final_hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)

        # Dropout, then project: (batch_size, hidden_dim * 2) -> (batch_size, output_dim)
        out = self.fc(self.dropout(final_hidden))

        return out, hidden, cell

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple

class AdditiveAttention(nn.Module):
    """Implements Bahdanau-style additive attention for sequence-to-sequence models.

    Scores every source position against the current decoder hidden state as
    ``v . tanh(W_enc * h_enc + W_dec * h_dec)``, normalises the scores with a
    softmax over source positions, and returns the weighted sum of the encoder
    outputs (the context vector) together with the weights.

    Args:
        encoder_hidden_dim (int): Dimension of the encoder's hidden states.
        decoder_hidden_dim (int): Dimension of the decoder's hidden states.
        attention_dim (int): Dimension of the attention mechanism's hidden layer.

    Attributes:
        encoder_attn (nn.Linear): Bias-free projection of encoder outputs to ``attention_dim``.
        decoder_attn (nn.Linear): Bias-free projection of the decoder hidden state to ``attention_dim``.
        v (nn.Parameter): Vector of size ``attention_dim`` that reduces the energy to a scalar score.
    """
    def __init__(self, encoder_hidden_dim: int, decoder_hidden_dim: int, attention_dim: int):
        super().__init__()
        self.encoder_hidden_dim = encoder_hidden_dim
        self.decoder_hidden_dim = decoder_hidden_dim
        self.attention_dim = attention_dim

        # Linear layers to project encoder and decoder states
        self.encoder_attn = nn.Linear(encoder_hidden_dim, attention_dim, bias=False)
        self.decoder_attn = nn.Linear(decoder_hidden_dim, attention_dim, bias=False)

        # Attention score parameter with Glorot (Xavier) initialization.
        # xavier_uniform_ needs at least 2 dimensions, so it is applied to a
        # (1, attention_dim) view of v; the fill writes through to v itself.
        # The view is built under no_grad so it is not tracked by autograd.
        self.v = nn.Parameter(torch.empty(attention_dim))
        with torch.no_grad():
            nn.init.xavier_uniform_(self.v.unsqueeze(0))

    def forward(self, encoder_outputs: torch.Tensor, decoder_hidden: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Computes the attention context vector and weights.

        Args:
            encoder_outputs (torch.Tensor): Encoder hidden states, shape (batch_size, src_len, encoder_hidden_dim).
            decoder_hidden (torch.Tensor): Decoder hidden state, shape (batch_size, decoder_hidden_dim).

        Returns:
            Tuple[torch.Tensor, torch.Tensor]:
                - context: Context vector, shape (batch_size, encoder_hidden_dim).
                - attn_weights: Attention weights, shape (batch_size, src_len).

        Raises:
            ValueError: If input tensor shapes or dimensions do not match expected values.
        """
        # Validate input shapes
        if encoder_outputs.dim() != 3:
            raise ValueError(
                f"Expected encoder_outputs to be 3D (batch_size, src_len, encoder_hidden_dim), got {encoder_outputs.shape}"
            )
        if decoder_hidden.dim() != 2:
            raise ValueError(
                f"Expected decoder_hidden to be 2D (batch_size, decoder_hidden_dim), got {decoder_hidden.shape}"
            )

        batch_size, src_len, enc_dim = encoder_outputs.size()
        if enc_dim != self.encoder_hidden_dim:
            raise ValueError(f"Encoder hidden dimension mismatch: expected {self.encoder_hidden_dim}, got {enc_dim}")
        if decoder_hidden.size(1) != self.decoder_hidden_dim:
            raise ValueError(f"Decoder hidden dimension mismatch: expected {self.decoder_hidden_dim}, got {decoder_hidden.size(1)}")
        if batch_size != decoder_hidden.size(0):
            raise ValueError(f"Batch size mismatch: encoder_outputs {batch_size}, decoder_hidden {decoder_hidden.size(0)}")

        # Project encoder outputs: (batch_size, src_len, encoder_hidden_dim) -> (batch_size, src_len, attention_dim)
        encoder_proj = self.encoder_attn(encoder_outputs)

        # Project the decoder state once and add a length-1 source axis:
        # (batch_size, decoder_hidden_dim) -> (batch_size, 1, attention_dim).
        # Broadcasting in the sum below applies it to every source position,
        # which avoids materialising src_len copies of the decoder state and
        # projecting each copy separately.
        decoder_proj = self.decoder_attn(decoder_hidden).unsqueeze(1)

        # Energy: (batch_size, src_len, attention_dim)
        energy = torch.tanh(encoder_proj + decoder_proj)

        # Compute attention scores: (batch_size, src_len, attention_dim) @ (attention_dim,) -> (batch_size, src_len)
        attention_scores = torch.matmul(energy, self.v)

        # Apply softmax over source positions to get attention weights: (batch_size, src_len)
        attn_weights = F.softmax(attention_scores, dim=1)

        # Compute context vector: (batch_size, 1, src_len) @ (batch_size, src_len, encoder_hidden_dim)
        # -> (batch_size, 1, encoder_hidden_dim) -> (batch_size, encoder_hidden_dim)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)

        return context, attn_weights

In [9]:
class DecoderWithAttention(nn.Module):
    """Decoder with additive attention for sequence-to-sequence models.

    Processes one token at a time: the embedded input token is concatenated
    with an attention context vector computed over the encoder outputs, fed
    through a single-layer LSTM, and projected to logits over the target
    vocabulary.

    Args:
        output_dim (int): Size of the target vocabulary.
        embed_dim (int): Dimension of token embeddings.
        encoder_hidden_dim (int): Dimension of encoder hidden states.
        decoder_hidden_dim (int): Dimension of decoder hidden states.
        attention_dim (int): Dimension of the attention mechanism's hidden layer.
        dropout (float, optional): Dropout probability applied to the token
            embedding. Defaults to 0.1.

    Attributes:
        embedding (nn.Embedding): Token embedding layer.
        attention (AdditiveAttention): Attention mechanism.
        rnn (nn.LSTM): LSTM layer for decoding (``batch_first=True``).
        fc_out (nn.Linear): Output projection layer.
        dropout (nn.Dropout): Dropout layer.
    """
    def __init__(self, output_dim: int, embed_dim: int, encoder_hidden_dim: int,
                 decoder_hidden_dim: int, attention_dim: int, dropout: float = 0.1):
        super().__init__()
        self.output_dim = output_dim
        self.embed_dim = embed_dim
        self.encoder_hidden_dim = encoder_hidden_dim
        self.decoder_hidden_dim = decoder_hidden_dim
        self.attention_dim = attention_dim

        # Initialize layers
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.attention = AdditiveAttention(encoder_hidden_dim, decoder_hidden_dim, attention_dim)
        # The LSTM input is the token embedding concatenated with the context vector.
        self.rnn = nn.LSTM(embed_dim + encoder_hidden_dim, decoder_hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(decoder_hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token: torch.Tensor, decoder_hidden: torch.Tensor,
                decoder_cell: torch.Tensor, encoder_outputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Performs one decoding step.

        Args:
            input_token (torch.Tensor): Input token IDs, shape (batch_size, 1).
            decoder_hidden (torch.Tensor): Previous hidden state, shape (1, batch_size, decoder_hidden_dim).
            decoder_cell (torch.Tensor): Previous cell state, shape (1, batch_size, decoder_hidden_dim).
            encoder_outputs (torch.Tensor): Encoder outputs, shape (batch_size, src_len, encoder_hidden_dim).

        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
                - prediction: Logits for next token, shape (batch_size, output_dim).
                - hidden: Updated hidden state, shape (1, batch_size, decoder_hidden_dim).
                - cell: Updated cell state, shape (1, batch_size, decoder_hidden_dim).
                - attn_weights: Attention weights, shape (batch_size, src_len).

        Raises:
            ValueError: If input shapes do not match expected dimensions, or
                if the batch sizes of the inputs disagree.
        """
        # Validate input shapes
        if input_token.dim() != 2 or input_token.size(1) != 1:
            raise ValueError(f"Expected input_token shape (batch_size, 1), got {input_token.shape}")
        if decoder_hidden.dim() != 3 or decoder_hidden.size(0) != 1:
            raise ValueError(f"Expected decoder_hidden shape (1, batch_size, decoder_hidden_dim), got {decoder_hidden.shape}")
        if decoder_cell.shape != decoder_hidden.shape:
            raise ValueError(f"Expected decoder_cell shape to match decoder_hidden, got {decoder_cell.shape}")
        if encoder_outputs.dim() != 3:
            raise ValueError(f"Expected encoder_outputs shape (batch_size, src_len, encoder_hidden_dim), got {encoder_outputs.shape}")

        # All inputs must describe the same batch. Checking here reports a
        # mismatch as the documented ValueError instead of a shape error from
        # the concatenation further down.
        batch_size = input_token.size(0)
        if decoder_hidden.size(1) != batch_size or encoder_outputs.size(0) != batch_size:
            raise ValueError(
                f"Batch size mismatch: input_token {batch_size}, "
                f"decoder_hidden {decoder_hidden.size(1)}, encoder_outputs {encoder_outputs.size(0)}"
            )

        # Embed input token: (batch_size, 1) -> (batch_size, 1, embed_dim)
        embedded = self.dropout(self.embedding(input_token))

        # Compute attention: (batch_size, src_len, encoder_hidden_dim), (batch_size, decoder_hidden_dim)
        # -> (batch_size, encoder_hidden_dim), (batch_size, src_len)
        # squeeze(0) drops the leading layer axis, which was checked to be 1 above.
        context, attn_weights = self.attention(encoder_outputs, decoder_hidden.squeeze(0))
        context = context.unsqueeze(1)  # (batch_size, 1, encoder_hidden_dim)

        # Concatenate embedding and context: (batch_size, 1, embed_dim + encoder_hidden_dim)
        rnn_input = torch.cat((embedded, context), dim=2)

        # LSTM: (batch_size, 1, embed_dim + encoder_hidden_dim) -> (batch_size, 1, decoder_hidden_dim)
        output, (hidden, cell) = self.rnn(rnn_input, (decoder_hidden, decoder_cell))

        # Predict next token: (batch_size, 1, decoder_hidden_dim) -> (batch_size, output_dim)
        prediction = self.fc_out(output.squeeze(1))

        return prediction, hidden, cell, attn_weights