In [88]:
from importlib.metadata import version

# Packages this notebook relies on; the installed version of each is printed below.
pkgs = [
    "matplotlib",
    "numpy",
    "tiktoken",
    "torch",
    # "tensorflow",  # only for the OpenAI pretrained weights
]

for pkg_name in pkgs:
    installed = version(pkg_name)
    print(f"{pkg_name}: {installed}")

matplotlib: 3.10.5
numpy: 2.3.2
tiktoken: 0.9.0
torch: 2.2.2


In [89]:
from gpt_helpers import GPTModel

# Settings passed to GPTModel when the model is built below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,    # number of entries in the token vocabulary
    context_length=256,  # context length, in tokens
    emb_dim=768,         # embedding dimension
    n_layers=12,         # number of transformer blocks
    n_heads=12,          # number of attention heads
    drop_rate=0.1,       # dropout rate
    qkv_bias=False,      # whether the query, key and value weights use a bias
)

In [90]:
import torch

# Seed PyTorch's RNG before building the model so the run is reproducible.
torch.manual_seed(456)

model = GPTModel(GPT_CONFIG_124M)
# A bare `model.eval()` as the last expression would display the model
# architecture; the trailing semicolon below suppresses that output.
# NOTE(review): GPTModel comes from gpt_helpers — presumably a torch.nn.Module,
# in which case eval() also switches dropout off for inference; confirm.
model.eval();


In [91]:
import tiktoken
from gpt_helpers import generate_text_simple


def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into a batch of token IDs.

    Args:
        text: String to tokenize. ``<|endoftext|>`` is accepted as a special token.
        tokenizer: Object with a tiktoken-style ``encode(text, allowed_special=...)``
            method that returns a list of integer token IDs.

    Returns:
        An integer (``torch.long``) tensor of shape ``(1, num_tokens)``; the
        leading batch dimension is added for the current model architecture.
    """
    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    # Pin the dtype: torch.tensor([]) would otherwise default to float32, so an
    # empty encoding would yield a tensor that cannot be used as token indices.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long)
    return encoded_tensor.unsqueeze(0)  # add the batch dimension




In [92]:
# GPT-2 encoding from tiktoken, plus the prompt reused by the generation cells.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

# Encode the prompt into a (1, num_tokens) batch and show the IDs.
token_ids = text_to_token_ids(start_context, tokenizer)
print(token_ids)

tensor([[6109, 3626, 6100,  345]])


In [93]:
def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token IDs back into text.

    Args:
        token_ids: Tensor of token IDs, either with a leading batch dimension
            of size 1 (shape ``(1, num_tokens)``) or already flat
            (shape ``(num_tokens,)``).
        tokenizer: Object with a ``decode(list_of_ids)`` method.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flat list of IDs.
    """
    # Only strip the batch dimension when there is one. Squeezing a flat tensor
    # that holds a single token would collapse it to a scalar, and .tolist()
    # would then give a bare int instead of a list.
    flat = token_ids.squeeze(0) if token_ids.dim() > 1 else token_ids
    return tokenizer.decode(flat.tolist())


# Round-trip check: decoding the prompt's token IDs should print the prompt back.
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you


In [94]:
# Encode the prompt, then let the freshly initialised model append 10 tokens.
prompt_ids = text_to_token_ids(start_context, tokenizer)
token_ids = generate_text_simple(
    model=model,
    idx=prompt_ids,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)

# Drop the batch dimension for a compact printout.
print(token_ids.squeeze(0))

tensor([ 6109,  3626,  6100,   345, 13443, 37191, 25420, 23390, 35735, 37542,
        27518, 24287, 31523,  8074])


In [95]:
# Decode the prompt plus the 10 generated tokens back into text.
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you vague trousers deepest cyclistsurnal rpm discourage northeast breaches Hom


In [96]:
# First five IDs of the generated sequence: the four prompt tokens plus the first new one.
print(token_ids[0, :5])

tensor([ 6109,  3626,  6100,   345, 13443])


# 5.1.2 Calculating the text generation loss: cross entropy and perplexity

In [97]:
# Token IDs for two 3-token inputs, followed by the same texts shifted one
# token to the right (the next-token targets).
for phrase in (
    "every effort moves",
    "I really like",
    " effort moves you",
    " really like chocolate",
):
    print(text_to_token_ids(phrase, tokenizer))

tensor([[16833,  3626,  6100]])
tensor([[  40, 1107,  588]])
tensor([[3626, 6100,  345]])
tensor([[ 1107,   588, 11311]])


In [98]:
# Hand-built example batch for inspecting logits: two 3-token inputs and, for
# every position, the token ID that should come next.
inputs = torch.tensor([
    [16833, 3626, 6100],  # "every effort moves"
    [40, 1107, 588],      # "I really like"
])

targets = torch.tensor([
    [3626, 6100, 345],    # " effort moves you"
    [1107, 588, 11311],   # " really like chocolate"
])



In [99]:
# Forward pass only: no_grad() turns off gradient tracking, which is not needed
# when we just inspect the model's predictions.
with torch.no_grad():
    logits = model(inputs)

# One row of vocabulary-sized scores for each token position of each input.
print(logits.shape)


torch.Size([2, 3, 50257])


In [100]:
# Turn the logits into a probability distribution over the vocabulary (last dimension).
probas = logits.softmax(dim=-1)
probas.shape

torch.Size([2, 3, 50257])

In [101]:
# Show the probability tensor (PyTorch abbreviates the large vocabulary axis with "...").
probas

tensor([[[1.5637e-05, 5.7691e-05, 6.2597e-05,  ..., 1.7882e-05,
          1.6705e-05, 1.5272e-05],
         [1.5244e-05, 1.2207e-05, 2.5086e-05,  ..., 1.1783e-05,
          1.5645e-05, 1.5306e-05],
         [1.7312e-05, 6.8765e-05, 1.0261e-05,  ..., 1.9064e-05,
          2.3313e-05, 1.9704e-05]],

        [[1.0964e-05, 2.4615e-05, 3.3088e-05,  ..., 9.7370e-06,
          4.9542e-06, 4.2242e-05],
         [7.6122e-06, 1.5710e-05, 5.3961e-05,  ..., 1.1088e-05,
          1.7431e-05, 1.1257e-05],
         [1.2937e-05, 6.1834e-06, 6.7389e-06,  ..., 1.4765e-05,
          8.9612e-06, 2.8780e-05]]])

In [102]:
# Most likely next token at every position; keepdim=True keeps a trailing axis of size 1.
# Note: this rebinds `token_ids`, which earlier held the generated sequence.
token_ids = probas.argmax(dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

Token IDs:
 tensor([[[ 5365],
         [13774],
         [30197]],

        [[41229],
         [31634],
         [ 7710]]])


In [103]:
# Compare the expected continuation of the first text with the model's most
# likely tokens for it.
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
ouputs batch 1:  relatively crying Northwestern


In [104]:
# Target IDs of the second example text (batch row 1). The row is indexed
# explicitly so this cell also runs on a fresh kernel; `txt_idx` is only
# assigned in a later cell.
targets[1]

tensor([ 1107,   588, 11311])

In [105]:
# Probability the model assigned to each correct next token of the first text.
# Ideally, training increases these probabilities.
txt_idx = 0
positions = [0, 1, 2]  # the three token positions of each example
target_probas1 = probas[txt_idx, positions, targets[txt_idx]]
print("text 1:", target_probas1)

text 1: tensor([2.7467e-05, 6.1473e-05, 1.5830e-05])


In [106]:
# Same lookup for the second text: probability assigned to each correct next token.
txt_idx = 1
positions = [0, 1, 2]  # the three token positions of each example
target_probas2 = probas[txt_idx, positions, targets[txt_idx]]
print("text 2:", target_probas2)

text 2: tensor([3.0948e-05, 2.3746e-05, 8.7063e-06])


In [107]:
# How large is the error? Collect the target probabilities of both texts and
# move them to log space.
all_target_probas = torch.cat([target_probas1, target_probas2])
log_probas = all_target_probas.log()
print(log_probas)

tensor([-10.5025,  -9.6969, -11.0536, -10.3832, -10.6481, -11.6515])


In [109]:
# Negative average log-probability (negative log-likelihood).
# The idea is to bring this value close to zero.
-log_probas.mean()

tensor(10.6560)

In [110]:
# log(1) is 0, while the log of a near-zero probability is a large negative number.
print(torch.tensor([1.0]).log())
print(torch.tensor([3e-14]).log())

tensor([0.])
tensor([-31.1376])


In [111]:
# Reminder of the logits layout before the first two dimensions are flattened.
logits.shape

torch.Size([2, 3, 50257])

In [112]:
# Merge the first two dimensions: one row of vocabulary scores per token.
logits_flat = torch.flatten(logits, start_dim=0, end_dim=1)
logits_flat.shape

torch.Size([6, 50257])

In [113]:
# Flatten the targets the same way: one target ID per token.
targets_flat = torch.flatten(targets, start_dim=0, end_dim=1)
targets_flat.shape

torch.Size([6])

In [None]:
# Sanity check before computing the loss: after flattening there must be
# exactly one target ID for every row of logits.
assert logits_flat.shape[0] == targets_flat.shape[0]


In [114]:
# PyTorch shortcut: cross_entropy takes the raw logits and the target IDs and
# returns the same value as the manual negative average log-probability.
torch.nn.functional.cross_entropy(input=logits_flat, target=targets_flat)

tensor(10.6560)