In [225]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

import lightning as L

In [226]:
token_to_id= {
    "what": 0,
    "is": 1,
    "statquest":
    2, "awesome": 3,
    "<EOS>": 4
    }

In [227]:
id_to_token = dict(map(reversed, token_to_id.items()))
id_to_token

{0: 'what', 1: 'is', 2: 'statquest', 3: 'awesome', 4: '<EOS>'}

In [228]:
inputs = torch.tensor([
    [token_to_id["what"], token_to_id["is"], token_to_id["statquest"], token_to_id["<EOS>"], token_to_id["awesome"]],
    [token_to_id["statquest"], token_to_id["is"], token_to_id["what"], token_to_id["<EOS>"], token_to_id["awesome"]]
])

In [229]:
labels = torch.tensor([
    [token_to_id["is"], token_to_id["statquest"], token_to_id["<EOS>"], token_to_id["awesome"], token_to_id["<EOS>"]],
    [token_to_id["is"], token_to_id["what"], token_to_id["<EOS>"], token_to_id["awesome"], token_to_id["<EOS>"]]
])

In [230]:
dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset)

In [231]:
class PositionEncoding(nn.Module):
    def __init__(self, d_model=2, max_len=6): 
        # d_model is short for dimension of the model is the number of world embedding values per token
        # max_len is the max number of input tokens our transformer can process (input and output combined)
        super().__init__()

        pe = torch.zeros(max_len, d_model) # wCreate a matrix of Position Encoding values
        # we start by creating a matrix full of zero's - this will have lax_len rows and d_model columns
        # so if max_len = 6 and d_model is 2 our pe would look like this:
        # pe = torch([[0., 0.],
        #             [0., 0.],
        #             [0., 0.]]) 
        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1) # Create a column matrix, position, that reps the positions, pos, for each token
        # we use torch.arange to create a sequence of numbers between start=0 and end=max_len
        # .float() ensures that the numbers are floats
        # .unsqueeze(1) turns the sequence of numbers into a column matrix
        # For example, is max_len=3, we would get this column matrix:
        # tensor{[[0.],
        #         [1.],
        #         [2]]}
        embedding_index = torch.arange(start=0, end=d_model, step=2).float() # Create a row matrix that reps 'i', times 2, for each word embedding
        # we again use torch.arange to create a aequence of numbers, but this time between 0 and d_model
        # by putting step=2 would result in the same sequence of numbers if we multiplied i by 2, saving a little math

        div_term = 1/torch.tensor(10000.0)**(embedding_index / d_model)
        # each value in position, pos is divided by 10000^2i / d_model so we create div_term to represent the divisor

        # Now we do the math:
        pe[:, 0::2] = torch.sin(position * div_term)
        # the first term assigns values from the sine function to the matrix 'pe' from above, starting with the first column, 0 and then the
        # ::2 means every other column after that.
        pe[:, 1::2] = torch.cos(position * div_term)
        # the second term assigns values from the cosine function to the matrix 'e' starting witht eh second column, column 1, and then the
        # :: means every other column after that.
        self.register_buffer('pe', pe) # Moves 'pe' to a GPU if there is one.

    def forward(self, word_embeddings):
        # create a forard methen that takes in word embedding values and adds the position encoding values to the world_embedding values
        return word_embeddings + self.pe[:word_embeddings.size(0), :]

# Masked Self Attension
1. Calculate the Query, Key's and Values for each token
2. To code this math, we will use matrix notation

In [232]:
class Attention(nn.Module):
    def __init__(self, d_model=2):
        # We're passing in d_model, the dimension of the model, or the number of word embedding values per token.
        # We need to know the number of word embedding values per token because that defines how large the weight matrices are
        # that we use to create the Query, Key, Value.
        # So, if the dimenion is 2, that mens each weight matrix needs 2 rows and 2 columns,
        super().__init__()

        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        # We use Nn.Linear to create teh weight matrix and do the math for us
        # in_features defines how many rows are in the weight matrix - set to d_model
        # out_features defines how many columsn are in the weight matrix - set to d_model
        # W_q will give us the untrained weights to need to calculate the Query values
        # Since this is a Linear object, it not only does the weights but will do the math when the time comes.
        # Same for Keys and Values
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

        self.row_dim = 0
        self.col_dim = 1
        # To give us flexibility to input training data in sequenctially or batches, we create some variables to keep track of which indeces are for row and columsn

    def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask=None):
    # Forwad method to valvulate the masked self attension values for each token

        q = self.W_q(encodings_for_q)
        k = self.W_k(encodings_for_k)
        v = self.W_v(encodings_for_v)
        # Now we calculate the Queary, Key and Value for each token by passing the encoding to each Lenear() object.
        # And now we are able to calulate Attention.

        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))
        # We start my using torch matmul to multiply q by the transpose of k.
        # This calculates te similiaries between the Queries and the Keys which we save into 'sims'.  Softmax(QK^t/sqrt(dk) + M)V 
        
        scaled_sims = sims / torch.tensor(k.size(self.col_dim)**0.5)
        # Then we scale the similarities by the square root of the number values used in each key. Which 

        if mask is not None:
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)
            # The next thing we do is add the mask, if we're using one, to the scale the similarities
            # Masking is used by used to prevent early tokens from cheating and looking ahead at later tokens.
            # To nderstand how we add a mask using the masked_fill() method:
            # Let's imagine the mask is a matrix of True's and Falses
            # tensor([[False, True, True],
            #         [False, False, True],
            #         [False, False, False]])
            # And the True values above represent the attention values we want to ignore.  
            # So, the masked_fill() method replaces teh Trues with -1e9, which represents -1,000,000,000, an approx of -infinity and replaces the Falses with zero to create the final mask
            # that is added to the scaled similaries in scaled_sims.
            # tensor([[0, -1e9, -1e9],
            #         [0, 0, -1e9],
            #         [0, 0, 0]])

        attention_percents = F.softmax(scaled_sims, dim=self.col_dim)
        # The next thing we to do calculate the Attention is run the scaled similarities through a softmax() function.
        # Applly the Softmax() function to the scaled similaries determines the percentages of influcence that each token should have on the others which is why
        # we store the results in a variable called attention_percents.
        attention_scores = torch.matmul(attention_percents, v)
        # Lastly, we used torch.matmul() to multiply the attention percentages by the Values in V and that gives us the final attention scores, stored in attention_scores which we turn.
        return attention_scores

Let's create a class that puts the first 3 steps together (word embedding, positional embedding, attention mechenism) and then we can add the residual connections. Then we'll run those values through a fully connected layer, and then runs them through a softmax to get the outputs.

In [233]:
class DecoderOnlyTransformer(L.LightningModule):
    # We'll use LightningModule 
    def __init__(self, num_tokens=4, d_model=2, max_len=6):
        # We create a init() method which allots use to speficiy
        # num_tokens = the number of tokens in the vocab
        # d_model = the nuber of values we want to represent each token
        # max_len = the max length of the input plus output.
        super().__init__()
        self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)
        # We create a Embedding() object and name it we for Word Embedding.
        # Embedding needs to know how many tokens are in the vocab, and the number of values we want to represent each token.
        self.pe = PositionEncoding(d_model=d_model, max_len=max_len)
        # Then we create a Position Encoding object using the class we created earlier and name it pe.
        self.self_attention = Attention(d_model=d_model)
        # Then we create our Attention object
        self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens)
        # Then we create the fully connected layer with nn.Linear().  nn.Linear needs to know how many inputs there are and how many outputs there are. 
        self.loss =  nn.CrossEntropyLoss()
        # Then we create a Loss Function to quantify how well the model performs. In this case, were using CrossEntropyLoss(), because our model has multiple outputs
        # and CrossEntroyLoss will apply the softmax function for us.
    def forward(self, token_ids):
        # Now we put all the pieces together in a forward method. Forward takes an array of token id numbers that will be used as inputs to the transformer.
        word_embeddings = self.we(token_ids)
        #First, we convert the tokens into Word Embedding values.
        position_encoding = self.pe(word_embeddings)
        # Then we add the position encoding 
        mask = torch.tril(torch.ones((token_ids.size(dim=0), token_ids.size(dim=0))))
        #Then we create the mask that will prevent early tokens from looking at late tokens when we calculate Attention.
        # We start by creating a matrix of 1s with torch.ones().  For example, if we are passing 4 token_ids in this forward method then the call to 
        # torch.ones() will make a matrix with 4 rows and 4 columns full of 1s.
        # That matrix of ones is then passed to troch.tril(), where it means Lower Triangle, because torch.tril() leaves the values in the Lower Triangle as they are and turns everything else into zeros.
        # Ultimatly, we save a matrix with 1s in the lower triangle and 0s in the upper triangle in a variable called mask. 
        mask = mask == 0
        # Then we use 'mask == 0' to convert the 0's into True's and the 1's into Falses. This will be the mask we will use for masked-self-attention.
        self_attention_values = self.self_attention(position_encoding,
                                                    position_encoding,
                                                    position_encoding,
                                                    mask=mask)
        # Once we have the max, we calculate the attention.  Because the Qeury, Key and Value matrices will all be calculated from the same token encoding, we pass in the same set of
        # of position encoded values 3 times for the Queries, Keys and Values. We also pass in the mask, so early tokens cant cheat and look ahead at later tokens.
        residual_connection_values = position_encoding + self_attention_values
        # Ten we add the resitual connections
        fc_layer_output = self.fc_layer(residual_connection_values)
        # Lastly, run everything through a fully connected layer.
        # Remmber, the loss fucntion we are using, Cross EntropyLoss(), does the Softmax() for us. So all we have to do is return the output of the fully connected layer.
        return fc_layer_output
    
    # Let's create a method to configure the optimizer we are using
    def configure_optimizer(self):
        return Adam(self.parameters(), lr=0.1)
        # In this case, we're using Adam, which is like Stochastic Gradient Descent, but a little less stochastic
        # and we are passing all of the Weights and Biases in the model that we want to train, which is all of them, to Adam.
        # We're setting the learning rate to 0.1 ebcasue it makes training this specific model very fast.  The default value of 0.001 is commonly used.

    def training_step(self, batch, batch_idx):
        # Then we create a training_stop() method which takes a batch of training data and an index for that batch.
        input_tokens, labels = batch
        # We then split the training data into inputs and labels
        output = self.forward(input_tokens[0])
        # Then we pass the input tokens into the forward() method that we just wrote to comput the output
        loss = self.loss(output, labels[0])
        # Then we compare the output from the Transformer to the known labele using the loss function
        return loss
        # And remember, the loss function does the softmax for us.

Let's run the model before training it, just to see what it does

In [234]:
model = DecoderOnlyTransformer(num_tokens=len(token_to_id), d_model=2, max_len=6)
# The first thing we do is create amodel from the DcoderOnly Transformer() class that we just created
model_input = torch.tensor([token_to_id["what"],
                            token_to_id["is"],
                            token_to_id["statquest"],
                            token_to_id["<EOS>"]])
# Then we create an input prompt, in this case, we're using 'What is StateQuest <EOS>
input_length = model_input.size(dim=0)
# Then we figure ou how many tokens we are using as input
# We do this because our super simple model can only handle a total of 6 tokens, aka max_len = 6.
# So, keeping track of how many tokens are in the input will tell us how many we can create as output.
predictions = model(model_input)
# Then we run that through the Transformer, which generates predictions for each token in the input.
# This mean, that the model generates a prediction for what should come after the first token, 'What' and for all of the other input tokens.
predicted_id = torch.tensor([torch.argmax(predictions[-1, :])])
# However, we're really just interestd in what the model predicts will come after the <EOS> token so we use -1 to index the outputs generated by the <EOS token.
# The outputs generated by the <EOS> token are an array of output values, one per possible output token. so we use the argmax function to identify the 
# output with the largest value. Thus, the token with the largest output value will be the first token generated as a response to the input.
predicted_ids = predicted_id
# Save that token so we can print it out later
max_length = 6
for i in range(input_length, max_length):
    # we then loop to keep generated output tokens until we reach the maxinum number of tokens that our model can generated
    if (predicted_ids == token_to_id["<EOS>"]):
        break
    # or the model generates the <EOS> token
    model_input = torch.cat((model_input, predicted_id))
    # Each time we generate a new output token, we add it to the input, so that each prediction is made with the full context.
    predictions = model(model_input)
    predicted_id = torch.tensor([torch.argmax(predictions[-1:])])
    # Then the model predicts the next output token using the full context, which is the input plus the output tokens so far.
    predicted_ids = torch.cat((predicted_ids, predicted_id))

    print("Predicted Tokens: \n")
    for id in predicted_ids:
        print("\t", id_to_token[id.item()])
    # Lastly, we print out the generated tokens after converting them from id numbers to text

Now, let's train the model

In [235]:
trainer = L.Trainer(max_epochs=60)
# First, we created a Lightning Tainer and tell it to only do 30 epochs, which is enough for our simple model and dataset
trainer.fit(model, train_dataloaders=dataloader)
# Then we pass our model and the dataloader we created earlier to the trainer using the fit() method.

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


MisconfigurationException: No `configure_optimizers()` method defined. Lightning `Trainer` expects as minimum a `training_step()`, `train_dataloader()` and `configure_optimizers()` to be defined.

In [236]:
model = DecoderOnlyTransformer(num_tokens=len(token_to_id), d_model=2, max_len=6)
# The first thing we do is create amodel from the DcoderOnly Transformer() class that we just created
model_input = torch.tensor([token_to_id["what"],
                            token_to_id["is"],
                            token_to_id["statquest"],
                            token_to_id["<EOS>"]])
# Then we create an input prompt, in this case, we're using 'What is StateQuest <EOS>
input_length = model_input.size(dim=0)
# Then we figure ou how many tokens we are using as input
# We do this because our super simple model can only handle a total of 6 tokens, aka max_len = 6.
# So, keeping track of how many tokens are in the input will tell us how many we can create as output.
predictions = model(model_input)
# Then we run that through the Transformer, which generates predictions for each token in the input.
# This mean, that the model generates a prediction for what should come after the first token, 'What' and for all of the other input tokens.
predicted_id = torch.tensor([torch.argmax(predictions[-1, :])])
# However, we're really just interestd in what the model predicts will come after the <EOS> token so we use -1 to index the outputs generated by the <EOS token.
# The outputs generated by the <EOS> token are an array of output values, one per possible output token. so we use the argmax function to identify the 
# output with the largest value. Thus, the token with the largest output value will be the first token generated as a response to the input.
predicted_ids = predicted_id
# Save that token so we can print it out later
max_length = 6
for i in range(input_length, max_length):
    # we then loop to keep generated output tokens until we reach the maxinum number of tokens that our model can generated
    if (predicted_ids == token_to_id["<EOS>"]):
        break
    # or the model generates the <EOS> token
    model_input = torch.cat((model_input, predicted_id))
    # Each time we generate a new output token, we add it to the input, so that each prediction is made with the full context.
    predictions = model(model_input)
    predicted_id = torch.tensor([torch.argmax(predictions[-1:])])
    # Then the model predicts the next output token using the full context, which is the input plus the output tokens so far.
    predicted_ids = torch.cat((predicted_ids, predicted_id))

    print("Predicted Tokens: \n")
    for id in predicted_ids:
        print("\t", id_to_token[id.item()])

Predicted Tokens: 

	 is
	 statquest


RuntimeError: Boolean value of Tensor with more than one value is ambiguous