In [1]:
import torch
from utils import GPTModel


# Hyperparameters handed to GPTModel below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # rows of the token-embedding table
    context_length=256,   # rows of the positional-embedding table
    emb_dim=768,          # width of every embedding / hidden vector
    n_heads=12,           # presumably attention heads per block
    n_layers=12,          # presumably number of transformer blocks
    drop_rate=0.1,        # probability used by the Dropout layers
    qkv_bias=False,       # whether the query/key/value projections get a bias term
)

# Seed PyTorch's RNG before building the model so the randomly initialised
# weights (and therefore every number printed below) are repeatable.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Switch to evaluation mode (for a standard nn.Module this turns Dropout into
# a no-op). eval() returns the module itself, so leaving it as the last
# expression of the cell is what displays the architecture summary.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

- We will adopt the `generate_text_simple` function from the previous chapter and introduce 2 new functions: `text_to_token_ids` and `token_ids_to_text`.
- These functions convert between text and token-ID representations.

In [2]:
import tiktoken
from utils import generate_text_simple

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into a single-row batch of token ids.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` and
            returning a list of integer token ids (e.g. a tiktoken encoding).

    Returns:
        A ``torch.long`` tensor of shape ``(1, num_tokens)``; the leading
        axis is the batch dimension the model expects.
    """
    # Allow the literal end-of-text marker to be encoded as its special token.
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Pin the dtype: torch.tensor([]) would otherwise be float32, and an
    # embedding lookup only accepts integer indices.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into a string.

    Args:
        token_ids: Tensor of token ids, either a single-row batch of shape
            ``(1, num_tokens)`` or an already flat ``(num_tokens,)`` tensor.
        tokenizer: Object exposing ``decode(list_of_ids)`` (e.g. a tiktoken
            encoding).

    Returns:
        The decoded text.
    """
    # Drop the batch axis. When the input is a one-element 1-D tensor,
    # squeeze(0) collapses it to 0-D and tolist() would return a bare int,
    # so restore one axis to always hand the tokenizer a list.
    flat = torch.atleast_1d(token_ids.squeeze(0))
    return tokenizer.decode(flat.tolist())

In [3]:
# Example
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

# Encode the prompt first, then ask the model for 10 additional tokens.
prompt_ids = text_to_token_ids(start_context, tokenizer)
token_ids = generate_text_simple(
    model=model,
    idx=prompt_ids,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M['context_length'],
)

generated_text = token_ids_to_text(token_ids, tokenizer)
print(f"Output Text: {generated_text}")

Output Text: Every effort moves you rentingetic wasnم refres RexAngel infieldcigans


- The output is gibberish
- We will implement a numerical method to evaluate the generated content. This will allow us to monitor and enhance the model's performance throughout the training process.

# Calculating the text generation loss

In [10]:
# INPUTS
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

# Both prompts must tokenize to the same length, otherwise torch.stack
# cannot combine them into one (batch, tokens) tensor.
txt1 = "every effort moves"
txt2 = "I really like"
inputs = torch.stack(
    [torch.tensor(tokenizer.encode(txt)) for txt in (txt1, txt2)],
    dim=0,
)
print("INPUTS:\n", inputs)


#TARGETS
# The targets are the inputs shifted one token to the left: each position
# holds the token the model should predict next.
txt1 = " effort moves you"
txt2 = " really like chocolate"
targets = torch.stack(
    [torch.tensor(tokenizer.encode(txt)) for txt in (txt1, txt2)],
    dim=0,
)
print("OUTPUTS:\n", targets)


INPUTS:
 tensor([[16833,  3626,  6100],
        [   40,  1107,   588]])
OUTPUTS:
 tensor([[ 3626,  6100,   345],
        [ 1107,   588, 11311]])


In [11]:
# Forward pass only: no gradients are needed just to inspect predictions.
with torch.no_grad():
    logits = model(inputs)

# Normalise the last (vocabulary) axis into a probability distribution.
probas = logits.softmax(dim=-1)
print(probas.shape)

torch.Size([2, 3, 50257])


In [12]:
# Most likely token id at every position; keepdim=True keeps a trailing
# axis of size 1 so the result stays 3-D.
token_ids = probas.argmax(dim=-1, keepdim=True)
print("TOKEN IDS:\n", token_ids)

TOKEN IDS:
 tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


In [13]:
# Decode the desired continuation and the model's greedy prediction for the
# first text so they can be compared side by side.
target_text = token_ids_to_text(targets[0], tokenizer)
predicted_text = token_ids_to_text(token_ids[0].flatten(), tokenizer)
print(f"Targets batch 1: {target_text}")
print(f"Outputs batch 1: {predicted_text}")

Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix


In [14]:
# Intermediate step: for the first text, select the full vocabulary
# distribution at token positions 0, 1 and 2. The following cell narrows the
# selection to the target-token entries and rebinds `target_probas_1`.
# NOTE(review): an assignment displays nothing, yet the saved output below
# shows a (2, 3, vocab)-shaped tensor; the output looks stale. Confirm by
# re-running from a fresh kernel.
text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2]]

tensor([[[1.9582e-05, 1.5537e-05, 1.1597e-05,  ..., 2.2041e-05,
          7.0134e-06, 1.8575e-05],
         [9.3378e-06, 1.0149e-05, 7.7960e-06,  ..., 2.8831e-05,
          6.1058e-06, 1.2983e-05],
         [2.8943e-05, 8.6889e-06, 1.5495e-05,  ..., 3.6617e-05,
          1.3867e-05, 1.2969e-05]],

        [[1.2452e-05, 2.0276e-05, 1.3734e-05,  ..., 1.0147e-05,
          3.4725e-05, 1.3873e-05],
         [7.3622e-06, 1.7367e-05, 1.0464e-05,  ..., 2.1105e-05,
          1.1502e-05, 1.4995e-05],
         [2.8950e-05, 3.3374e-05, 4.1859e-05,  ..., 6.5971e-06,
          5.7726e-05, 1.3385e-05]]])

In [31]:
# For each text, pair token position i with the id of its target token to
# read off the probability the model assigned to the correct next token.
positions = list(range(3))  # token positions 0, 1, 2

text_idx = 0
target_probas_1 = probas[text_idx, positions, targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, positions, targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([7.2671e-05, 3.1046e-05, 1.1696e-05])
Text 2: tensor([1.0426e-05, 5.4604e-05, 4.7716e-06])


- Above are the probabilities the model assigns to each target (correct next) token in the two text examples above.

# Back propagation
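- Backpropagation is used to update the model weights; it needs a single scalar loss to differentiate.
- As that loss, we compute the negative average log-likelihood of the target tokens.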

In [32]:
# Join the target probabilities of both texts into one vector, then take the
# natural log of every entry.
log_probas = torch.cat([target_probas_1, target_probas_2]).log()
print(log_probas)

tensor([ -9.5296, -10.3800, -11.3563, -11.4712,  -9.8154, -12.2528])


In [33]:
# Average log-probability over all target tokens in the batch
avg_log_probas = log_probas.mean()
print(avg_log_probas)

tensor(-10.8009)


In [34]:
# Flip the sign so the quantity is a positive loss to minimise
neg_avg_log_probas = -avg_log_probas
print(neg_avg_log_probas)

tensor(10.8009)


- The process performed above computes what is known as the cross-entropy loss.
- PyTorch has a built-in function to do that.

In [35]:
# Inspect the shapes before handing both tensors to cross_entropy
print("Logits shape:", logits.size())
print("Targets shape:", targets.size())

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])


- The logits tensor is 3-D: (batch_size, number of tokens, vocab_size)
- The targets tensor is 2-D: (batch_size, number of tokens)
- For `cross_entropy` we have to flatten both tensors by merging the batch and token dimensions


In [37]:
# Merge the batch and token axes: one row of vocabulary scores per token,
# and one target id per token.
logits_flat = logits.reshape(-1, logits.size(-1))
targets_flat = targets.reshape(-1)
print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [40]:
# Let PyTorch compute the cross-entropy directly from the raw logits; the
# result matches the value derived by hand above.
loss = torch.nn.functional.cross_entropy(input=logits_flat, target=targets_flat)
print(loss)

tensor(10.8009)


# Perplexity
- It's a measure that is often used to evaluate the performance of models in tasks like language modeling.
- It provides a more interpretable way to understand the uncertainty of a model in predicting the next token in a sequence.
- Perplexity measures how well the probability distribution predicted by the model matches the actual distribution of words in the dataset. Similar to loss, a lower perplexity indicates that the model's predictions are closer to the actual distribution.
- It signifies the effective vocabulary size about which the model is uncertain at each step.
- This would translate to the model being unsure about which among all the tokens in the vocabulary to generate as the next token.

In [42]:
# Perplexity is the exponential of the cross-entropy loss
perplexity = loss.exp()
print(perplexity)

tensor(49064.1641)


So in the above exercise we have computed the loss and perplexity for 2 inputs. We will now extend this to the entire training and validation sets.