In [73]:
import pip
import importlib
import subprocess
import sys

# Install Lightning on demand. pip is driven through `python -m pip` in a
# subprocess because pip does not support being called in-process
# (`pip.main` is an internal entry point). Using sys.executable targets the
# interpreter that is running this kernel.
try:
    importlib.import_module("lightning")
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lightning"])
import os

In [48]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import lightning as L
import torch.nn.functional as F

In [74]:
# Expose no CUDA devices to this process (intended to keep the run on CPU).
os.environ.update(CUDA_VISIBLE_DEVICES="-1")

In [75]:
# Token -> id lookup tables; ids follow the order in which the tokens are listed.
input_vocab = {token: idx for idx, token in enumerate(('<SOS>', 'lets', 'to', 'go'))}
output_vocab = {
    token: idx for idx, token in enumerate(('<SOS>', 'ir', 'vamos', 'y', '<EOS>'))
}

# Two training pairs written as token ids:
#   [lets, go] -> [vamos]
#   [to, go]   -> [ir]
inputs = torch.tensor([[1, 3], [2, 3]])
labels = torch.tensor([[2], [1]])

# Pair each input row with its label row and serve them through a DataLoader
# built with default settings.
dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset)

In [76]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encodings that are added to token embeddings.

    For a position ``pos`` and a pair index ``i`` the table stores

        PE(pos, 2i)     = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model))

    so every frequency is shared by one even (sine) column and the following
    odd (cosine) column. The table has shape ``(max_len, d_model)`` and is
    registered as the buffer ``pe`` (saved with the module, not trained).

    Args:
        d_model: Size of each embedding vector.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # Column vector of positions 0 .. max_len-1, shape (max_len, 1), so it
        # broadcasts against the row of frequencies below.
        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1)

        # arange(0, d_model, 2) already enumerates the even indices 2i, so it is
        # used directly as the numerator of the exponent (no extra factor of 2).
        even_index = torch.arange(start=0, end=d_model, step=2).float()
        # 1 / 10000^(2i / d_model); the power of a tensor is already a tensor,
        # so no torch.tensor(...) wrapper is needed.
        div_term = 1.0 / (10000.0 ** (even_index / d_model))

        pe = torch.zeros(max_len, d_model)
        # Even columns hold the sine terms, odd columns the cosine terms.
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one odd column fewer than even columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` with the encodings of its first ``seq_len`` positions added.

        Args:
            x: Embeddings of shape ``(seq_len, d_model)`` or
                ``(batch_size, seq_len, d_model)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The sequence axis is the second-to-last one in both layouts; the
        # (seq_len, d_model) slice broadcasts over a leading batch axis.
        return x + self.pe[: x.size(-2), :]

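Take a step back to rethink why we use 0::2 and 1::2 above.
`position` is the position within the input sequence; a sequence of length 3 has indices 0, 1, 2.
In this notebook d_model is 2 (column indices 0 and 1), which gives a single div_term. The example below uses d_model = 6 to make the pattern visible.
All rows of the even columns (0::2) get sin.
All rows of the odd columns (1::2) get cos.
With d_model = 6 this gives us 3 div_terms, which we use for: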

```
dim 0 → sin(pos * div_term[0])

dim 1 → cos(pos * div_term[0])

dim 2 → sin(pos * div_term[1])

dim 3 → cos(pos * div_term[1])

dim 4 → sin(pos * div_term[2])

dim 5 → cos(pos * div_term[2])
```
See how div_term[i] is used for both dim 2i and 2i+1?

In [77]:
# Worked example of the frequency term for the toy sizes used in this notebook.
max_len = 3
d_model = 2
# 10000 ** <tensor> is already a tensor; wrapping it in torch.tensor(...) would
# copy-construct it and emit a UserWarning.
div_term = 1 / (10000 ** (torch.arange(start=0, end=d_model, step=2).float() / d_model))
div_term

  div_term=1/torch.tensor(10000**(torch.arange(start=0, end=d_model, step=2).float()/d_model))


tensor([1.])

### Now we will code attention
A recap of attention is available in the seq2seq-with-attention code.

We will code:
* Self Attention
* Masked Self Attention
* Encoder Decoder Attention



## Self Attention

Self-attention lets us find the relationship of every word with every other word in the sequence/phrase, including the relationship of a word with itself.


PE_values.Q_w=Q

PE_values.K_w=K

PE_values.V_w=V

Then we multiply Q by the transpose of K to get the similarity of each word's query with the keys of all of the words.

Then we scale the similarity matrix by dividing it by sqrt(d_model), which is sqrt(2) here.

$\text{Similarities} = QK^T, \qquad \text{Scaled Similarities} = \dfrac{QK^T}{\sqrt{d_{model}}} = \dfrac{\text{Similarities}}{\sqrt{2}}$

Apply softmax to the scaled similarities to turn each row into attention percentages.

Then multiply this matrix of percentages by V to get the attention scores, i.e. each output row is a percentage-weighted sum of the rows of V.


In [78]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    The same module serves three roles, depending on what is passed to
    ``forward``:

    * self-attention: query, key and value encodings are the same tensor;
    * masked self-attention: as above, plus a boolean ``mask``;
    * encoder-decoder attention: queries come from the decoder, keys and
      values from the encoder.

    Args:
        d_model: Size of each input encoding and of each projected vector.
    """

    def __init__(self,d_model):
        super().__init__()
        self.q_w = nn.Linear(d_model, d_model, bias=False)
        self.k_w = nn.Linear(d_model, d_model, bias=False)
        self.v_w = nn.Linear(d_model, d_model, bias=False)
        # Rows (tokens) and columns (features) are addressed from the end of
        # the shape. For 2-D input (seq_len, d_model) this is the same as dims
        # 0 and 1; for batched input (batch, seq_len, d_model) it skips the
        # leading batch axis.
        self.row_dim = -2
        self.col_dim = -1

    def forward(self, encodings_for_q,encodings_for_k, encodings_for_v,mask=None):
        """Compute attention values.

        Args:
            encodings_for_q: Encodings projected to queries,
                shape ``(..., q_len, d_model)``.
            encodings_for_k: Encodings projected to keys,
                shape ``(..., k_len, d_model)``.
            encodings_for_v: Encodings projected to values,
                shape ``(..., k_len, d_model)``.
            mask: Optional boolean tensor broadcastable to
                ``(..., q_len, k_len)``; ``True`` marks key positions a query
                must not attend to.

        Returns:
            Tensor of shape ``(..., q_len, d_model)``.
        """
        q = self.q_w(encodings_for_q)
        k = self.k_w(encodings_for_k)
        v = self.v_w(encodings_for_v)

        # Similarity of every query with every key: Q K^T.
        sims = torch.matmul(q, k.transpose(self.row_dim, self.col_dim))
        # Scale by sqrt(d_model), read from the feature axis of the keys.
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)
        if mask is not None:
            # A large negative score becomes a weight of ~0 after softmax.
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)
        # Each row becomes a distribution over the keys.
        attention_percents = F.softmax(scaled_sims, dim=self.col_dim)
        # Percentage-weighted sum of the value vectors.
        attention_scores = torch.matmul(attention_percents, v)
        return attention_scores

Encoder
We need to combine
* PE
* Self Attention
* Residual Connections

How:
Attention = SelfAttention(PE values)
Output = Attention + PE values (the residual connection)

In [79]:
class Encoder(nn.Module):
    """Single encoder layer: embedding -> positional encoding -> self-attention -> residual.

    To stack encoders, the output of one encoder would be fed as the input of
    the next.

    Multi-head attention (not implemented here) could be added by creating
    several attention modules, e.g. ``self.self_attention_2`` and
    ``self.self_attention_3``, running each one in ``forward``, concatenating
    their outputs with ``torch.concat()`` and reducing the result back to
    ``d_model`` features with
    ``nn.Linear(in_features=num_attention_heads * d_model, out_features=d_model)``.
    With ``d_model=2`` and three heads that is 2 * 3 = 6 attention values per
    token, reduced back to 2.

    Args:
        num_tokens: Number of rows in the embedding table.
        d_model: Size of each embedding vector.
        max_len: Number of positions given to the positional encoding.
    """

    def __init__(self, num_tokens=4,d_model=2,max_len=3):
        super().__init__()
        # Seed before any weights are created so initialisation is repeatable.
        L.seed_everything(42)
        self.we = nn.Embedding(num_tokens, embedding_dim=d_model)
        self.pe = PositionalEncoding(d_model, max_len)
        self.self_attention = SelfAttention(d_model)

    def forward(self,token_ids):
        """Encode ``token_ids`` and return attention values plus the residual."""
        encoded = self.pe(self.we(token_ids))
        # Plain self-attention: queries, keys and values all come from `encoded`.
        attended = self.self_attention(encoded, encoded, encoded)
        # Residual connection around the attention block.
        return attended + encoded

### Decoder
For a decoder to work we have to implement the following in order:
* Positional Encoding
* Self-Attention (Masked)
* Residual Connection
* Encoder-Decoder Attention
* Fully Connected Layer
* Softmax- (CrossEntropyLoss as crossentropyloss implements the softmax for us)


In [80]:
class Decoder(nn.Module):
    """Single decoder layer producing one row of vocabulary logits per token.

    Pipeline: embedding -> positional encoding -> masked self-attention ->
    residual -> encoder-decoder attention -> residual -> fully connected layer.
    The output is pre-softmax.

    Args:
        num_tokens: Number of rows in the embedding table and number of logits
            per position.
        d_model: Size of each embedding vector.
        max_len: Number of positions given to the positional encoding.
    """

    def __init__(self, num_tokens=4,d_model=2,max_len=3):
        super().__init__()
        self.we=nn.Embedding(num_embeddings=num_tokens,embedding_dim=d_model)
        self.pe=PositionalEncoding(d_model=d_model,max_len=max_len)
        self.self_attention=SelfAttention(d_model=d_model)
        self.enc_dec_attention=SelfAttention(d_model=d_model)
        self.fc_layer=nn.Linear(in_features=d_model,out_features=num_tokens)
        self.row_dim=0
        self.col_dim=1

    def forward(self,token_ids,encoder_values):
        """Decode ``token_ids`` while attending to ``encoder_values``.

        Args:
            token_ids: Token ids decoded so far; the sequence length is read
                from dimension ``self.row_dim``.
            encoder_values: Encoder output used as keys and values of the
                encoder-decoder attention.

        Returns:
            Output of the fully connected layer, one row per input token.
        """
        word_embeddings=self.we(token_ids)
        positional_encoding=self.pe(word_embeddings)

        # Causal mask so a position cannot look ahead at tokens it has yet to
        # predict: the lower triangle (including the diagonal) is kept (False)
        # and everything above the diagonal is masked out (True). It is built
        # on the same device as token_ids so it can be applied to the scores.
        seq_len=token_ids.size(dim=self.row_dim)
        mask=torch.tril(torch.ones((seq_len, seq_len), device=token_ids.device))
        mask=mask==0

        self_attention_values=self.self_attention(
            positional_encoding,positional_encoding,positional_encoding,mask=mask
        )
        residual_connection_values=self_attention_values+positional_encoding

        # Queries come from the decoder; keys and values come from the encoder.
        enc_dec_attention=self.enc_dec_attention(
            residual_connection_values,encoder_values,encoder_values
        )
        residual_connection_values=enc_dec_attention+residual_connection_values

        fc_layer_output=self.fc_layer(residual_connection_values)
        return fc_layer_output
        

Now that we have coded up the Encoder() and Decoder() classes, all that's left is to code up a Transformer() that connects the two.


The Transformer Class
The Transformer() class simply connects the outputs from the Encoder to the Decoder, as seen in the figure below.

In [81]:
from torch.optim import Adam
class Transformer(L.LightningModule):
    """Encoder-decoder transformer wrapped as a LightningModule.

    Args:
        input_size: Number of tokens in the input vocabulary.
        output_size: Number of tokens in the output vocabulary.
        d_model: Size of each embedding vector.
        max_len: Number of positions given to the positional encodings.
    """

    # Token ids used by training_step to frame sequences.
    SOS_ID = 0
    EOS_ID = 4

    def __init__(self, input_size, output_size, d_model=2, max_len=3):

        super().__init__()

        # Vocabulary sizes come from the constructor arguments, not from
        # notebook-level globals, so the module is self-contained.
        self.encoder = Encoder(
            num_tokens=input_size, d_model=d_model, max_len=max_len
        )
        self.decoder = Decoder(
            num_tokens=output_size, d_model=d_model, max_len=max_len
        )

        self.loss = nn.CrossEntropyLoss()

    def forward(self, inputs, labels):
        """Encode ``inputs`` and decode ``labels`` against them.

        Returns:
            The decoder output before softmax.
        """
        encoder_values = self.encoder(inputs)
        output_presoftmax = self.decoder(labels, encoder_values)

        return output_presoftmax

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=0.1."""
        return Adam(self.parameters(), lr=0.1)

    def training_step(self, batch, batch_idx):
        """Compute the teacher-forced cross-entropy loss for one batch.

        Only the first example of the batch (``[0]``) is used.
        """
        input_i, label_i = batch  # collect input

        # new_tensor keeps the dtype and device of the batch tensors, so the
        # concatenations below also work when the batch is not on the CPU.
        input_sos = input_i.new_tensor([self.SOS_ID])
        label_sos = label_i.new_tensor([self.SOS_ID])
        label_eos = label_i.new_tensor([self.EOS_ID])

        ## First, prepend the <SOS> token to the tokens fed to the Encoder...
        input_tokens = torch.cat((input_sos, input_i[0]))

        ## ...and to the tokens fed to the Decoder (teacher forcing).
        teacher_forcing = torch.cat((label_sos, label_i[0]))

        ## The expected output is the known output followed by <EOS>.
        expected_output = torch.cat((label_i[0], label_eos))

        output_i = self.forward(input_tokens, teacher_forcing)
        loss = self.loss(output_i, expected_output)

        return loss

Test Transformer Block

In [82]:
max_length=3
# Use the max_length defined in this cell so the cell does not depend on a
# `max_len` variable left over from an earlier cell.
transformer=Transformer(len(input_vocab),output_size=len(output_vocab),max_len=max_length)
encoder_values=transformer.encoder(torch.tensor([0,1,3])) # <SOS> lets go
# Seed the decoder with the <SOS> token (id 0)
predicted_ids=torch.tensor([0])
for i in range(max_length):
    prediction=transformer.decoder(predicted_ids,encoder_values)
    # The last row holds the logits for the next token
    predicted_id=torch.tensor([torch.argmax(prediction[-1,:])])
    predicted_ids=torch.cat([predicted_ids,predicted_id])
    if (predicted_id==4): # if EOS token we are done
        break
print("\n Predicted ids",predicted_ids)

Seed set to 42



 Predicted ids tensor([0, 2, 4])


  div_term=1/torch.tensor(10000**(2*torch.arange(start=0, end=d_model, step=2).float()/d_model))


Train the Transformer!!!


In [83]:
# Fresh, untrained model sized for the toy vocabularies.
transformer = Transformer(
    input_size=len(input_vocab),
    output_size=len(output_vocab),
    d_model=2,
    max_len=3,
)

Seed set to 42
  div_term=1/torch.tensor(10000**(2*torch.arange(start=0, end=d_model, step=2).float()/d_model))


In [84]:
# Fit the model for 30 epochs on the training dataloader.
trainer = L.Trainer(max_epochs=30)
trainer.fit(model=transformer, train_dataloaders=dataloader)

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | encoder | Encoder          | 20     | train
1 | decoder | Decoder          | 49     | train
2 | loss    | CrossEntropyLoss | 0      | train
-----------------------------------------------------
69        Trainable params
0         Non-trainable params
69        Total params
0.000     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode
/home/hadi/Documents/statquest/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of t

Training: |          | 0/? [00:00<?, ?it/s]

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
`Trainer.fit` stopped: `max_epochs=30` reached.


In [85]:
## Vocabulary reference for reading the ids below:
#   input_vocab  : <SOS>=0 (start), lets=1, to=2, go=3
#   output_vocab : <SOS>=0 (start), ir=1, vamos=2, y=3, <EOS>=4 (end)

max_length = 3
row_dim = 0
col_dim = 1

## Encode the user input "<SOS> lets go"; the expected decode is
## 0, 2, 4 = <SOS> vamos <EOS>.
## (For "<SOS> to go" use torch.tensor([0, 2, 3]); expected 0, 1, 4 = <SOS> ir <EOS>.)
encoder_values = transformer.encoder(torch.tensor([0, 1, 3]))

## The decoder is started from <SOS>, so <SOS> counts as the first
## "predicted" token.
predicted_ids = torch.tensor([0])
for i in range(max_length):
    ## Predict the next token from the tokens produced so far and the encoded
    ## input. `prediction` is the raw fully-connected output, not a softmax;
    ## since only the largest entry matters, argmax is enough.
    prediction = transformer.decoder(predicted_ids, encoder_values)

    ## The last row scores the next token; keep its best id as a 1-element tensor.
    predicted_id = torch.argmax(prediction[-1, :]).reshape(1)
    ## Append it to the running list of predicted ids.
    predicted_ids = torch.cat((predicted_ids, predicted_id))

    ## Stop as soon as <EOS> is produced.
    if predicted_id.item() == 4:
        break

print("\npredicted_ids:", predicted_ids)


predicted_ids: tensor([0, 2, 4])
