In [44]:
%%capture 
# %%capture prevents this cell from printing a ton of STDERR stuff to the screen

## First, check to see if lightning is installed, if not, install it.
##
## NOTE: If you **do** need to install something, just know that you may need to
##       restart your session for python to find the new module(s).
##
##       To restart your session:
##       - In Google Colab, click on the "Runtime" menu and select
##         "Restart Session" from the pulldown menu
##       - In a local jupyter notebook, click on the "Kernel" menu and select
##         "Restart Kernel" from the pulldown menu
import importlib
import os
import subprocess
import sys

## Hide every CUDA device from this process. The variable is set before
## lightning (and therefore torch) is imported for the first time below.
os.environ['CUDA_VISIBLE_DEVICES']="-1"

try:
  importlib.import_module("lightning")
except ImportError:
  ## Run pip as a subprocess of the interpreter that backs this kernel.
  ## Calling pip.main() in-process is not a supported pip API, and importing
  ## pip unconditionally would make this cell fail in environments that ship
  ## without pip even when lightning is already installed.
  ## check_call() raises CalledProcessError if the installation fails.
  subprocess.check_call([sys.executable, "-m", "pip", "install", "lightning"])

In [45]:
import torch  ## torch let's us create tensors and also provides helper functions
import torch.nn as nn  ## torch.nn gives us nn.Module, nn.Embedding() and nn.Linear()
import torch.nn.functional as F  # This gives us the softmax() and argmax()
from torch.optim import Adam  # This is the optimizer we will use

import lightning as L  # Lightning makes it easier to write, optimize and scale our code
from torch.utils.data import (
    TensorDataset,
    DataLoader,
)  # We'll store our data in DataLoaders

In [46]:
## Vocabulary: each token the model knows, paired with its integer id.
## <EOS> = end of sequence.
token_to_id = {"what": 0, "is": 1, "statquest": 2, "awesome": 3, "<EOS>": 4}

## Reverse lookup (id -> token), used to turn predicted ids back into words.
id_to_token = {token_id: token for token, token_id in token_to_id.items()}

## Training inputs for the Decoder-Only Transformer: each row is a question,
## then <EOS>, then the response "awesome". Every one of these tokens is fed
## to the model during training (this is called "teacher forcing").
inputs = torch.tensor(
    [
        [token_to_id[token] for token in phrase]
        for phrase in (
            ("what", "is", "statquest", "<EOS>", "awesome"),  ## input #1
            ("statquest", "is", "what", "<EOS>", "awesome"),  ## input #2
        )
    ]
)

## Training labels: the matching input row shifted left by one token, with a
## final <EOS> appended. The first <EOS> marks the end of the question and the
## second <EOS> marks the end of the generated response.
labels = torch.tensor(
    [
        [token_to_id[token] for token in phrase]
        for phrase in (
            ("is", "statquest", "<EOS>", "awesome", "<EOS>"),  ## label #1
            ("is", "what", "<EOS>", "awesome", "<EOS>"),  ## label #2
        )
    ]
)

## Package the (input, label) pairs up into a DataLoader with its default settings.
dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset=dataset)

In [47]:
class PositionalEncoding(nn.Module):
    """Adds fixed sine/cosine position encodings to token embeddings.

    The table is computed once in ``__init__`` and stored as the buffer ``pe``
    with shape ``(max_len, d_token)``:

        PE(pos, 2i)   = sin(pos / 10000^(2i/d_token))
        PE(pos, 2i+1) = cos(pos / 10000^(2i/d_token))

    Args:
        max_len: number of positions (rows) to precompute.
        d_token: number of values per token embedding (columns).
    """

    def __init__(self, max_len,d_token):
        super().__init__()
        pe = torch.zeros(max_len, d_token)

        ## Column vector of positions 0 .. max_len-1, shape (max_len, 1).
        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1)

        ## arange(..., step=2) already yields the even column indices
        ## 2i = 0, 2, 4, ..., so it is divided by d_token directly. Multiplying
        ## it by 2 again would turn the exponent into 4i/d_token.
        even_dims = torch.arange(start=0, end=d_token, step=2).float()
        div_term_values = 1 / 10000.0 ** (even_dims / d_token)

        ## Broadcasts to shape (max_len, ceil(d_token / 2)).
        angles = position * div_term_values

        ## Even columns get the sine values and odd columns the cosine values.
        ## The odd slice must be 1::2 (every odd column), not 1:2 (only column 1).
        ## When d_token is odd there is one fewer odd column than even columns,
        ## so the cosine input is trimmed to match.
        pe[:, 0::2] = torch.sin(angles)
        pe[:, 1::2] = torch.cos(angles[:, : d_token // 2])

        ## A buffer is saved with the module but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self,x):
        """Return ``x`` plus the encodings for its first ``seq_len`` positions.

        Args:
            x: embeddings of shape ``(seq_len, d_token)`` or, with leading
                batch dimensions, ``(..., seq_len, d_token)``. ``seq_len``
                must not exceed ``max_len``.
        """
        ## The sequence axis is second-to-last; for 2-D input this is axis 0.
        return x + self.pe[:x.size(-2), :]
        
        

In [48]:
## Scratch check of the frequency term for a 2-value encoding: the only even
## column index is 0, so the term comes out as a single value.
div_term_values = 1 / 10000 ** (2 * torch.arange(0, 2, 2).float() / 2)

## Positions 0, 1 and 2 divided by that term, one result per line...
print(*(position / div_term_values for position in range(3)), sep="\n")
## ...and an empty 3 x 2 table (3 positions, 2 values each).
print(torch.zeros(3, 2))

tensor([0.])
tensor([1.])
tensor([2.])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])


In [49]:
## The same term, with the even column indices divided by 2 directly
## (no extra factor of 2 in the exponent).
div_term = 1 / torch.pow(torch.tensor(10000.0), torch.arange(0, 2, 2).float() / 2)
div_term

tensor([1.])

In [50]:
class Attention(nn.Module):
    """Single-head scaled dot-product attention with learned Q, K and V projections.

    Args:
        max_len: accepted for interface compatibility; not used by this class.
        d_model: number of values per token, used as both the input and the
            output size of the three linear projections.
    """

    def __init__(self, max_len,d_model):
        super().__init__()
        ## Keep the creation order q, k, v so that seeded initialisation
        ## gives the same weights as before.
        self.q_w = nn.Linear(d_model, d_model)
        self.k_w = nn.Linear(d_model, d_model)
        self.v_w = nn.Linear(d_model, d_model)
        ## Token (row) and feature (column) axes, counted from the end. For
        ## 2-D (seq_len, d_model) input these are the same axes as 0 and 1,
        ## and inputs with leading batch dimensions also work.
        self.row_dim = -2
        self.col_dim = -1

    def forward(self,encodings_q,encodings_k,encodings_v,mask=None):
        """Compute attention values for every query token.

        Args:
            encodings_q, encodings_k, encodings_v: tensors of shape
                ``(..., seq_len, d_model)`` that are projected into queries,
                keys and values respectively.
            mask: optional boolean tensor broadcastable to the
                ``(..., q_len, k_len)`` similarity matrix; positions where it
                is True are excluded from attention.

        Returns:
            Tensor of shape ``(..., q_len, d_model)``.
        """
        q = self.q_w(encodings_q)
        k = self.k_w(encodings_k)
        v = self.v_w(encodings_v)

        ## Similarity of every query with every key, scaled by sqrt(d_model).
        sims = torch.matmul(q, k.transpose(self.row_dim, self.col_dim))
        scaled_sims = sims / (q.size(self.col_dim) ** 0.5)

        if mask is not None:
            ## A very negative score becomes (effectively) 0 after the softmax.
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

        ## Each row of attention_percents sums to 1 across the keys.
        attention_percents = F.softmax(scaled_sims, dim=self.col_dim)
        attention_scores = torch.matmul(attention_percents, v)
        return attention_scores

In [51]:
class Decoder(L.LightningModule):
    """Decoder-only transformer with one masked self-attention layer.

    Pipeline: token embedding -> positional encoding -> masked self-attention
    -> residual connection -> fully connected layer that scores every token
    in the vocabulary.

    Args:
        num_tokens: vocabulary size (embedding rows and output scores per position).
        d_model: number of values used to represent each token.
        max_len: number of positions the positional encoding table is built for.
    """

    def __init__(self, num_tokens,d_model,max_len):
        super().__init__()
        ## Seed before any layer is created so the initial weights are reproducible.
        L.seed_everything(seed=42)
        self.embeddings = nn.Embedding(num_tokens, d_model)
        self.positional_encodings = PositionalEncoding(max_len=max_len, d_token=d_model)
        self.masked_attention = Attention(max_len, d_model)
        self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens)

        self.loss = nn.CrossEntropyLoss()

    def forward(self,token_ids):
        """Return one row of unnormalised vocabulary scores per input token.

        Args:
            token_ids: 1-D tensor holding the token ids of a single sequence.
        """
        embeddings = self.embeddings(token_ids)
        pe = self.positional_encodings(embeddings)

        ## Causal mask: True above the diagonal, so a token cannot attend to
        ## the tokens that come after it. It is created on the same device as
        ## the input so the model also works when it is not on the CPU.
        seq_len = token_ids.size(dim=0)
        mask = torch.tril(torch.ones((seq_len, seq_len), device=token_ids.device))
        mask = mask == 0

        attention_outputs = self.masked_attention(pe, pe, pe, mask)
        ## Residual connection around the attention layer.
        residual_values = attention_outputs + pe
        outputs = self.fc_layer(residual_values)
        return outputs

    def configure_optimizers(self):
        ## configure_optimizers() simply passes the parameters we want to
        ## optimize to the optimizer and sets the learning rate
        return Adam(self.parameters(), lr=0.1)

    def training_step(self, batch, batch_idx):
        """Called by the Lightning trainer; returns the loss for one batch.

        forward() handles one sequence at a time, so each sequence in the
        batch is scored separately and the losses are averaged. For a batch
        holding a single sequence this is exactly that sequence's loss;
        larger batches no longer have every sequence but the first ignored.
        """
        input_tokens, labels = batch # collect input
        per_sequence_losses = [
            self.loss(self.forward(tokens), targets)
            for tokens, targets in zip(input_tokens, labels)
        ]
        loss = torch.stack(per_sequence_losses).mean()

        return loss

In [52]:
## Scratch check: lower-triangular matrix of ones for a 4-token sequence.
## Row i holds a 1 in every column j <= i and a 0 everywhere else.
mask = torch.ones(4, 4).tril()
mask

tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.],
        [1., 1., 1., 1.]])

In [53]:
## First, create a model from Decoder()
model = Decoder(num_tokens=len(token_to_id), d_model=2, max_len=6)

## Now create the input for the transformer...
model_input = torch.tensor(
    [
        token_to_id["what"],
        token_to_id["is"],
        token_to_id["statquest"],
        token_to_id["<EOS>"],
    ]
)
input_length = model_input.size(dim=0)

## The longest sequence (prompt + generated tokens) we will feed to the model.
## This matches the max_len the model was created with above.
max_length = 6

## We are only predicting here, not training, so gradients are not needed.
with torch.no_grad():
    ## Now get predictions from the model
    predictions = model(model_input)
    ## NOTE: "predictions" is the output from the fully connected layer,
    ##      not a softmax() function. We could, if we wanted to,
    ##      run "predictions" through a softmax() function, but
    ##      since we're going to select the item with the largest value
    ##      we can just use argmax instead...
    ## ALSO NOTE: "predictions" is a matrix, with one row of predicted values
    ##      per input token. Since we only want the prediction from the
    ##      last row (the most recent prediction) we use reverse index for the
    ##      row, -1.
    predicted_id = torch.tensor([torch.argmax(predictions[-1, :])])
    ## We'll store predicted_id in a tensor, predicted_ids, that
    ## we'll add to each time we predict a new output token.
    predicted_ids = predicted_id

    ## Now use a loop to predict output tokens until we get an
    ## <EOS> token or the sequence reaches max_length.
    for step in range(input_length, max_length):
        if predicted_id == token_to_id["<EOS>"]:
            # if the prediction is <EOS>, then we are done
            break

        model_input = torch.cat((model_input, predicted_id))

        predictions = model(model_input)
        predicted_id = torch.tensor([torch.argmax(predictions[-1, :])])
        predicted_ids = torch.cat((predicted_ids, predicted_id))

## Now print out the predicted output phrase.
## (The loop variable is not called "id", so the builtin id() stays usable.)
print("Predicted Tokens:\n")
for token_id in predicted_ids:
    print("\t", id_to_token[token_id.item()])

Predicted Tokens:

	 what
	 what
	 what


In [54]:
## Train the model for 30 epochs on the (input, label) pairs in the dataloader.
trainer = L.Trainer(max_epochs=30)
trainer.fit(model=model, train_dataloaders=dataloader)

/home/hadi/Documents/statquest/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/home/hadi/Documents/statquest/.venv/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


In [55]:
## Ask the trained model the first question: "what is statquest <EOS>"
model_input = torch.tensor(
    [
        token_to_id["what"],
        token_to_id["is"],
        token_to_id["statquest"],
        token_to_id["<EOS>"],
    ]
)
input_length = model_input.size(dim=0)

## We are only predicting here, not training, so gradients are not needed.
with torch.no_grad():
    ## The last row of "predictions" scores the token that follows the input.
    predictions = model(model_input)
    predicted_id = torch.tensor([torch.argmax(predictions[-1, :])])
    predicted_ids = predicted_id

    ## Keep predicting until we get <EOS> or reach max_length
    ## (max_length was defined in an earlier cell).
    for step in range(input_length, max_length):
        if predicted_id == token_to_id["<EOS>"]:
            # if the prediction is <EOS>, then we are done
            break

        model_input = torch.cat((model_input, predicted_id))

        predictions = model(model_input)
        predicted_id = torch.tensor([torch.argmax(predictions[-1, :])])
        predicted_ids = torch.cat((predicted_ids, predicted_id))

## (The loop variable is not called "id", so the builtin id() stays usable.)
print("Predicted Tokens:\n")
for token_id in predicted_ids:
    print("\t", id_to_token[token_id.item()])

Predicted Tokens:

	 awesome
	 <EOS>


In [56]:
## Now let's ask the other question: "statquest is what <EOS>"
model_input = torch.tensor(
    [
        token_to_id["statquest"],
        token_to_id["is"],
        token_to_id["what"],
        token_to_id["<EOS>"],
    ]
)
input_length = model_input.size(dim=0)

## We are only predicting here, not training, so gradients are not needed.
with torch.no_grad():
    ## The last row of "predictions" scores the token that follows the input.
    predictions = model(model_input)
    predicted_id = torch.tensor([torch.argmax(predictions[-1, :])])
    predicted_ids = predicted_id

    ## Keep predicting until we get <EOS> or reach max_length
    ## (max_length was defined in an earlier cell).
    for step in range(input_length, max_length):
        if predicted_id == token_to_id["<EOS>"]:
            # if the prediction is <EOS>, then we are done
            break

        model_input = torch.cat((model_input, predicted_id))

        predictions = model(model_input)
        predicted_id = torch.tensor([torch.argmax(predictions[-1, :])])
        predicted_ids = torch.cat((predicted_ids, predicted_id))

## (The loop variable is not called "id", so the builtin id() stays usable.)
print("Predicted Tokens:\n")
for token_id in predicted_ids:
    print("\t", id_to_token[token_id.item()])

Predicted Tokens:

	 awesome
	 <EOS>
