In [85]:
import torch # for core funtionality , gives easy simple structure for NN , uses tensors as DATA STRUCTURES 
import torch.nn as nn # for neural networks
import torch.optim # for training netwrokssss
import math  # maths ops 
import copy # copying complex objects 

In [86]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Ensure that the model dimension (d_model) is divisible by the number of heads
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        # Initialize dimensions
        self.d_model = d_model # Model's dimension
        self.num_heads = num_heads # Number of attention heads
        self.d_k = d_model // num_heads # Dimension of each head's key, query, and value
        
        # Linear layers for transforming inputs
        self.W_q = nn.Linear(d_model, d_model) # Query transformation
        self.W_k = nn.Linear(d_model, d_model) # Key transformation
        self.W_v = nn.Linear(d_model, d_model) # Value transformation
        self.W_o = nn.Linear(d_model, d_model) # Output transformation
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Calculate attention scores
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        # Apply mask if provided (useful for preventing attention to certain parts like padding)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        
        # Softmax is applied to obtain attention probabilities
        attn_probs = torch.softmax(attn_scores, dim=-1)
        
        # Multiply by values to obtain the final output
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        # Reshape the input to have num_heads for multi-head attention
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        # Combine the multiple heads back to original shape
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        # Perform scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        
        # Combine heads and apply output transformation
        output = self.W_o(self.combine_heads(attn_output))
        return output

In summary, the MultiHeadAttention class encapsulates the multi-head attention mechanism commonly used in transformer models. It takes care of splitting the input into multiple attention heads, applying attention to each head, and then combining the results. By doing so, the model can capture various relationships in the input data at different scales, improving the expressive ability of the model.

In [87]:
#position wise feed forward network

class PositonWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff): #d_ff for dimension of feed forwrd layers
        super(PositonWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu= nn.ReLU() # RELU ACTIVATION FUNCTION FOR NON L;INEARITY
    def forward(self,x): # x for input to feed forward
        return self.fc2(self.relu(self.fc1(x))) # firstly passed through first linear layer with input x and then passed through RELU activation function and then passed through 2nd linear layer

## Encoding 
is a broader term for transforming data into a numerical representation, often for basic processing, while 
## Embedding 
is a more sophisticated technique that maps data into a vector space, capturing semantic relationships and similarities between data points. 

In [88]:
# positional encoding for injecting position information of each token in the input sequence using sine and cosine funtions to generate positional encoding

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe= torch.zeros(max_seq_length, d_model) # A tensor filled with zeros, which will be populated with positional encodings
        position= torch.arange(0,max_seq_length, dtype=torch.float).unsqueeze(1) #: A tensor containing the position indices for each position in the sequence.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)) # used to scale the position indices in a specific way.

        pe[:, 0::2]= torch.sin(position * div_term)
        pe[:, 1::2]= torch.cos(position * div_term)

        self.register_buffer('pe',pe.unsqueeze(0)) #he sine function is applied to the even indices and the cosine function to the odd indices of pe.
#Finally, pe is registered as a buffer, which means it will be part of the module's state but will not be considered a trainable parameter.

    def forward(self, x):
        return x + self.pe[:, :x.size(1)] #uses the first x.size(1) elements of pe to ensure that the positional encodings match the actual sequence length of x.

## The PositionalEncoding 
class adds information about the position of tokens within the sequence. Since the transformer model lacks inherent knowledge of the order of tokens (due to its self-attention mechanism), this class helps the model to consider the position of tokens in the sequence. The sinusoidal functions used are chosen to allow the model to easily learn to attend to relative positions, as they produce a unique and smooth encoding for each position in the sequence.

In [89]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn= MultiHeadAttention( d_model, num_heads)
        self.feed_forward= PositonWiseFeedForward(d_model, d_ff)
        self.norm1= nn.LayerNorm(d_model) # layer normalisation to smooth the input
        self.norm2= nn. LayerNorm(d_model)
        self.dropout= nn.Dropout(dropout) #Dropout layer, used to prevent overfitting by randomly setting some activations to zero during training.

    def forward(self, x, mask):
        attn_output= self.self_attn(x,x,x, mask) # Optional mask to ignore certain parts of the input.
        x= self.norm1(x + self.dropout(attn_output))   
        ff_output= self.feed_forward(x)
        x= self.norm2(x + self.dropout(ff_output))
        return x

## The EncoderLayer 
class defines a single layer of the transformer's encoder. It encapsulates a multi-head self-attention mechanism followed by the position-wise feed-forward neural network, with residual connections, layer normalization, and dropout applied as appropriate. Together, these components allow the encoder to capture complex relationships in the input data and transform them into a useful representation for downstream tasks. Typically, multiple such encoder layers are stacked to form the complete encoder part of a transformer model.

In [90]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout): # dropout for regularization
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads) #Multi-head self-attention mechanism for the target sequence.
        self.cross_attn = MultiHeadAttention(d_model, num_heads) # Multi-head attention mechanism that attends to the encoder's output.
        self.feed_forward = PositonWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask): # x is input to the decoder layer
        attn_output = self.self_attn(x, x, x, tgt_mask)    
        x= self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

enc_output: The output from the corresponding encoder (used in the cross-attention step).
src_mask: Source mask to ignore certain parts of the encoder's output.
tgt_mask: Target mask to ignore certain parts of the decoder's input.

### The DecoderLayer 
class defines a single layer of the transformer's decoder. It consists of a multi-head self-attention mechanism, a multi-head cross-attention mechanism (that attends to the encoder's output), a position-wise feed-forward neural network, and the corresponding residual connections, layer normalization, and dropout layers. This combination enables the decoder to generate meaningful outputs based on the encoder's representations, taking into account both the target sequence and the source sequence. As with the encoder, multiple decoder layers are typically stacked to form the complete decoder part of a transformer model.

In [91]:
# Combining the encoder and decoder layers to create the complete Transformer network


class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length,  dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model) # Embedding layer for the source sequenc
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model) # Embedding layer for the target sequence.
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size) # final fully connected linear layer mapping to the target vocab
        self.dropout = nn.Dropout(dropout)

#This method is used to create masks for the source and target sequences, ensuring that padding tokens are ignored and that future tokens are not visible during training for the target sequence.
    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask
    
    def forward(self, src, tgt): # taking source and target sequences and producing the output predictions.
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded # The source sequence is passed through the encoder layers, with the final encoder output representing the processed source sequence.
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded # The target sequence and the encoder's output are passed through the decoder layers, resulting in the decoder's output.
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

            output = self.fc(dec_output) # final fully connected output 
            return output        


### the Transformer
 class brings together the various components of a Transformer model, including the embeddings, positional encoding, encoder layers, and decoder layers. It provides a convenient interface for training and inference, encapsulating the complexities of multi-head attention, feed-forward networks, and layer normalization.

This implementation follows the standard Transformer architecture, making it suitable for sequence-to-sequence tasks like machine translation, text summarization, etc. Including masking ensures that the model adheres to the causal dependencies within sequences, ignoring padding tokens and preventing information leakage from future tokens.

These sequential steps empower the Transformer model to efficiently process input sequences and produce corresponding output sequences.

### why ignore padding tokens ?
1st ans--> The model must learn to ignore `<pad>` tokens, which can add complexity to its training, as it must differentiate between two types of non-content-bearing tokens: one that signifies the end of content and one that is purely for maintaining consistent input sizes



### Embedding means
Embeddings are dense numeric vectors that capture semantic meaning for each token. They bring discrete language into a continuous vector space that transformers can analyze. The embeddings serve as the mathematical language and data representation that transformers operate on.

In [92]:
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Generate random sample data
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

### Hyperparameter	Typical values	Impact on performance
d_model	256, 512, 1024	Higher values increase model capacity but require more computation

num_heads	8, 12, 16	More heads can capture diverse aspects of data, but are computationally intensive

num_layers	6, 12, 24	More layers improve representation power, but can lead to overfitting

d_ff	2048, 4096	Larger feed-forward networks increase model robustness

dropout	0.1, 0.3	Regularizes the model to prevent overfitting

learning rate	0.0001 - 0.001	Impacts convergence speed and stability

batch size	32, 64, 128	Larger batch sizes improve learning stability but require more memory

In [93]:
import torch.optim as optim  # ✅ Add this line

criterion = nn.CrossEntropyLoss(ignore_index=0)  # Defines the loss function as cross-entropy loss. The ignore_index argument is set to 0, meaning the loss will not consider targets with an index of 0 (typically reserved for padding tokens).
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9) # Defines the optimizer as Adam with a learning rate of 0.0001 and specific beta values.

transformer.train()

for epoch in range(100): # iterate opver 100 epochs
    optimizer.zero_grad() #  Clears the gradients from the previous iteration.
    output = transformer(src_data, tgt_data[:, :-1]) # Passes the source data and the target data (excluding the last token in each sequence) through the transformer. This is common in sequence-to-sequence tasks where the target is shifted by one token.
    loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1)) # Computes the loss between the model's predictions and the target data (excluding the first token in each sequence). The loss is calculated by reshaping the data into one-dimensional tensors and using the cross-entropy loss function.
    loss.backward() #Computes the gradients of the loss with respect to the model's parameters.
    optimizer.step() #Updates the model's parameters using the computed gradients.
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}") # Prints the current epoch number and the loss value for that epoch.

Epoch: 1, Loss: 8.674612045288086
Epoch: 2, Loss: 8.612031936645508
Epoch: 3, Loss: 8.556350708007812
Epoch: 4, Loss: 8.508870124816895
Epoch: 5, Loss: 8.462989807128906
Epoch: 6, Loss: 8.427068710327148
Epoch: 7, Loss: 8.38417911529541
Epoch: 8, Loss: 8.34548282623291
Epoch: 9, Loss: 8.307433128356934
Epoch: 10, Loss: 8.275105476379395
Epoch: 11, Loss: 8.23989200592041
Epoch: 12, Loss: 8.201516151428223
Epoch: 13, Loss: 8.165194511413574
Epoch: 14, Loss: 8.131704330444336
Epoch: 15, Loss: 8.088874816894531
Epoch: 16, Loss: 8.056140899658203
Epoch: 17, Loss: 8.014360427856445
Epoch: 18, Loss: 7.971382141113281
Epoch: 19, Loss: 7.929688930511475
Epoch: 20, Loss: 7.900956153869629
Epoch: 21, Loss: 7.857017993927002
Epoch: 22, Loss: 7.815174579620361
Epoch: 23, Loss: 7.776070594787598
Epoch: 24, Loss: 7.732661247253418
Epoch: 25, Loss: 7.693478584289551
Epoch: 26, Loss: 7.6540446281433105
Epoch: 27, Loss: 7.608273983001709
Epoch: 28, Loss: 7.566090106964111
Epoch: 29, Loss: 7.523313999176

### a loss function
 is a crucial component that measures how well the model's predictions match the actual target values during training

### an "optimizer" 
refers to an algorithm that adjusts the model's parameters during training to minimize the difference between its predictions and the actual target values

### gradient
refers to the gradient of the loss function with respect to the model's parameters, which are the weights and biases within the neural network. These gradients guide the optimization process during training, helping the model adjust its parameters to minimize the difference between its predictions and the actual target values. 

This code snippet trains the transformer model on randomly generated source and target sequences for 100 epochs. It uses the Adam optimizer and the cross-entropy loss function. The loss is printed for each epoch, allowing you to monitor the training progress. In a real-world scenario, you would replace the random source and target sequences with actual data from your task, such as machine translation.

In [94]:
transformer.eval() # Puts the transformer model in evaluation mode. This is important because it turns off certain behaviors like dropout that are only used during training.

# Generate random sample validation data
val_src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
val_tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

with torch.no_grad():

    val_output = transformer(val_src_data, val_tgt_data[:, :-1])
    val_loss = criterion(val_output.contiguous().view(-1, tgt_vocab_size), val_tgt_data[:, 1:].contiguous().view(-1))
    print(f"Validation Loss: {val_loss.item()}")

Validation Loss: 8.830320358276367


### Generate random validation data:

## val_src_data: 
Random integers between 1 and src_vocab_size, representing a batch of validation source sequences with shape (64, max_seq_length).
## val_tgt_data:
Random integers between 1 and tgt_vocab_size, representing a batch of validation target sequences with shape (64, max_seq_length).

### Validation loop:

# with torch.no_grad():
 Disables gradient computation, as we don't need to compute gradients during validation. This can reduce memory consumption and speed up computations.
# val_output = transformer(val_src_data, val_tgt_data[:, :-1]): 
Passes the validation source data and the validation target data (excluding the last token in each sequence) through the transformer.
# val_loss = criterion(...):
 Computes the loss between the model's predictions and the validation target data (excluding the first token in each sequence). The loss is calculated by reshaping the data into one-dimensional tensors and using the previously defined cross-entropy loss function.
# print(f"Validation Loss: {val_loss.item()}"):
 Prints the validation loss value.

This code snippet evaluates the transformer model on a randomly generated validation dataset, computes the validation loss, and prints it. In a real-world scenario, the random validation data should be replaced with actual validation data from the task you are working on. The validation loss can give you an indication of how well your model is performing on unseen data, which is a critical measure of the model's generalization ability.