In [2]:
import torch # for core funtionality , gives easy simple structure for NN , uses tensors as DATA STRUCTURES 
import torch.nn as nn # for neural networks
import torch.optim # for training netwrokssss
import math  # maths ops 
import copy # copying complex objects 

In [3]:
class MultiHeadAttention(nn.Module):  # nn.Module is the base class for all models
    """Multi-head scaled dot-product attention.

    The query, key and value inputs are each projected by a learned linear
    layer, split into ``num_heads`` heads of width ``d_k = d_model // num_heads``,
    attended per head, concatenated back together and passed through a final
    output projection.

    Args:
        d_model: Width of the model (size of the last dimension of the inputs).
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()

        # Every head receives an equal slice of the model width.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Store the dimensions.
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # width of each head's query, key and value

        self.W_q = nn.Linear(d_model, d_model)  # query projection
        self.W_k = nn.Linear(d_model, d_model)  # key projection
        self.W_v = nn.Linear(d_model, d_model)  # value projection
        self.W_o = nn.Linear(d_model, d_model)  # final output projection

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Compute scaled dot-product attention, relating every query to every key.

        Args:
            Q, K, V: Per-head tensors whose last dimension is ``d_k`` and whose
                second-to-last dimension is the sequence position.
            mask: Optional tensor broadcastable to the score tensor; positions
                where ``mask == 0`` are excluded from attention (e.g. padding
                or future tokens).

        Returns:
            The attention-weighted sum of ``V`` for every query position.
        """
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # A large negative score becomes ~0 probability after the softmax.
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        # Normalise over the key axis (the last one) so that each query's
        # attention weights sum to 1.
        attn_probs = torch.softmax(attn_scores, dim=-1)

        return torch.matmul(attn_probs, V)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, num_heads, seq, d_k)``."""
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        """Concatenate heads: ``(batch, num_heads, seq, d_k)`` to ``(batch, seq, d_model)``."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() returns a non-contiguous view; view() needs contiguous memory.
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, merge the heads and project the result.

        Args:
            Q, K, V: Tensors of shape ``(batch, seq, d_model)``.
            mask: Optional attention mask forwarded to
                ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape ``(batch, query_seq, d_model)``.
        """
        # Apply the learned projections, then split into heads.
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        return self.W_o(self.combine_heads(attn_output))



In summary, the `MultiHeadAttention` class encapsulates the multi-head attention mechanism commonly used in transformer models. It splits the projected input into multiple attention heads, applies scaled dot-product attention to each head independently, and then combines the results. Because each head can attend to different parts of the sequence, the model can capture a variety of relationships in the input data at different scales, which improves its expressive ability.

In [None]:
#position wise feed forward network

class PositonWiseFeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear.

    The same two-layer network is applied to the last dimension of the input,
    expanding it from ``d_model`` to ``d_ff`` and projecting it back again.

    Args:
        d_model: Input and output width.
        d_ff: Width of the hidden feed-forward layer.
    """

    def __init__(self, d_model, d_ff):
        super(PositonWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)  # expand to the hidden width
        self.fc2 = nn.Linear(d_ff, d_model)  # project back to the model width
        self.relu = nn.ReLU()  # ReLU activation for non-linearity

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))`` for an input whose last dim is ``d_model``."""
        return self.fc2(self.relu(self.fc1(x)))

## Encoding
Encoding is a broad term for transforming data into a numerical representation, often for basic processing.
## Embedding
Embedding is a more sophisticated technique that maps data into a vector space, capturing semantic relationships and similarities between data points.

In [None]:
# Positional encoding: injects the position of each token in the input sequence, using sine and cosine functions to generate the encoding.

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe= torch.zeros(max_seq_length, d_model) # A tensor filled with zeros, which will be populated with positional encodings
        position= torch.arange(0,max_seq_length, dtype=torch.float).unsqueeze(1) #: A tensor containing the position indices for each position in the sequence.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)) # used to scale the position indices in a specific way.

        pe[:, 0::2]= torch.sin(position * div_term)
        pe[:, 1::2]= torch.cos(position * div_term)

        self.register_buffer('pe',pe.unsqueeze(0)) #he sine function is applied to the even indices and the cosine function to the odd indices of pe.
#Finally, pe is registered as a buffer, which means it will be part of the module's state but will not be considered a trainable parameter.

        def forward(self, x):
            return x + self.pe[:, :x.sin(1)] #uses the first x.size(1) elements of pe to ensure that the positional encodings match the actual sequence length of x.

## PositionalEncoding
The `PositionalEncoding` class adds information about the position of each token within the sequence. Because the transformer's self-attention mechanism has no inherent knowledge of token order, this class lets the model take token positions into account. Sinusoidal functions are used because they produce a unique, smoothly varying encoding for each position, which makes it easy for the model to learn to attend to relative positions.

In [7]:
class EncoderLayer(nn.Module):
    """One transformer encoder layer.

    Runs self-attention and then a position-wise feed-forward network. Each
    sub-layer's output goes through dropout, is added back to its input
    (residual connection) and is layer-normalised.

    Args:
        d_model: Width of the model.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        # Sub-layers.
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositonWiseFeedForward(d_model, d_ff)
        # One layer normalisation per sub-layer.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # Randomly zeroes activations during training to reduce overfitting.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the layer to ``x``; ``mask`` is passed through to self-attention."""
        # Self-attention sub-layer: query, key and value are all ``x``.
        attended = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))
        # Feed-forward sub-layer.
        return self.norm2(x + self.dropout(self.feed_forward(x)))

## EncoderLayer
The `EncoderLayer` class defines a single layer of the transformer's encoder. It applies a multi-head self-attention mechanism followed by a position-wise feed-forward network, with dropout, a residual connection and layer normalization around each of the two sub-layers. Together, these components allow the encoder to capture complex relationships in the input data and transform them into a useful representation for downstream tasks. Typically, multiple such encoder layers are stacked to form the complete encoder part of a transformer model.