In [4]:
import torch
import torch.nn as nn
import tiktoken

from codes.gpt_model import GPTModel
from codes.utils import generate_text_simple

In [5]:
# Hyperparameters handed to GPTModel; the same dict also supplies the
# window length used when building the data loaders.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # rows of the token-embedding table (tok_emb in the model printout)
    "context_length": 256,   # rows of the positional-embedding table (pos_emb in the model printout)
    "emb_dim": 768,          # embedding width; in/out features of the attention projections
    "num_heads": 12,         # presumably attention heads per block -- confirm in codes.gpt_model
    "n_layers": 12,          # presumably the number of TransformerBlock modules -- printout is truncated, confirm
    "drop_rate": 0.1,        # p of the Dropout layers shown in the model printout
    "qkv_bias": False        # W_query / W_key / W_value are printed with bias=False
}

In [6]:
# Seed the global RNG *before* building the model so that the randomly
# initialised weights (and therefore every number printed below) are reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Put the module in evaluation mode (this turns the Dropout layers off).
# eval() returns the module itself, so as the cell's last expression its
# repr is echoed as the cell output.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trans_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiheadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffn): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiheadAttention(
        (W_query): Linear(in_fea

In [7]:
# Total number of scalar weights: add up the element count of every
# parameter tensor registered on the model.
total_params = sum(map(torch.numel, model.parameters()))
print(total_params)

162419712


In [8]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch axis.

    Args:
        text: String to tokenise.
        tokenizer: Object whose ``encode(text, allowed_special=...)`` returns
            the token ids for ``text`` (this notebook passes a tiktoken
            encoding).

    Returns:
        A tensor built from the encoded ids with one extra leading dimension
        of size 1.
    """
    # "<|endoftext|>" is passed to encode() as an allowed special token.
    ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    return torch.unsqueeze(torch.tensor(ids), dim=0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into text.

    Args:
        token_ids: Tensor of token ids, either ``(1, num_tokens)`` (a batch
            holding a single sequence) or an unbatched 1-D ``(num_tokens,)``
            sequence.
        tokenizer: Object whose ``decode(list_of_ids)`` returns the text.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flat list of ids.
    """
    if token_ids.dim() >= 2:
        # Drop the leading batch axis (a no-op unless that axis has size 1).
        flat = token_ids.squeeze(0)
    else:
        # Already an unbatched sequence. Squeezing here would collapse a
        # one-token tensor into a 0-d scalar, and tolist() would then hand
        # decode() a bare int instead of a list.
        flat = token_ids.reshape(-1)
    return tokenizer.decode(flat.tolist())

# GPT-2 encoding from tiktoken, shared by the encode/decode helpers.
tokenizer = tiktoken.get_encoding("gpt2")
# Prompt that the generation cell continues.
start_text = "Every effort moves you"

In [9]:
# Encode the prompt, then request 10 new tokens from the model.
prompt_ids = text_to_token_ids(start_text, tokenizer)
token_ids = generate_text_simple(
    model,
    idx=prompt_ids,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)
# Decode prompt plus continuation back into text.
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [10]:
# Two toy input sequences of three GPT-2 token ids each, one per row.
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

# Targets are the inputs shifted left by one position: each target id is the
# token that follows the input token at the same position.
targets = torch.tensor([[3626, 6100, 345],    # ["effort moves you",
                        [1107, 588,  11311]]) #  "really like chocolate"]

In [11]:
# Forward pass only: gradients are not needed to inspect the predictions.
with torch.no_grad():
    logits = model(inputs)
# Normalise the last (vocabulary) axis so each position sums to 1.
probs = logits.softmax(dim=-1)
print(probs.shape)

torch.Size([2, 3, 50257])


In [12]:
# Most probable token id at every position; keepdim=True leaves a trailing
# axis of size 1 in place of the vocabulary axis.
token_ids = probs.argmax(dim=-1, keepdim=True)
print(token_ids, token_ids.shape, sep="\n")

tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])
torch.Size([2, 3, 1])


In [13]:
# First sequence: the desired continuation ...
print(token_ids_to_text(targets[0], tokenizer))
# ... versus the model's highest-probability picks (trailing size-1 axis removed).
print(token_ids_to_text(token_ids[0].reshape(-1), tokenizer))

 effort moves you
 Armed heNetflix


In [14]:
# Probability the model assigned to each correct next token of the first
# sequence. The position index is derived from the targets' sequence length
# instead of a hard-coded [0, 1, 2], so the cell keeps working if the example
# sequences are changed to a different length.
text_idx = 0
target_prob_1 = probs[text_idx, torch.arange(targets.shape[1]), targets[text_idx]]
print(target_prob_1)

tensor([7.4541e-05, 3.1061e-05, 1.1563e-05])


In [15]:
# Probability the model assigned to each correct next token of the second
# sequence. The position index is derived from the targets' sequence length
# instead of a hard-coded [0, 1, 2], so the cell keeps working if the example
# sequences are changed to a different length.
text_idx = 1
target_prob_2 = probs[text_idx, torch.arange(targets.shape[1]), targets[text_idx]]
print(target_prob_2)

tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])


In [16]:
# Pool the target probabilities of both sequences and move to log space.
log_probs = torch.cat((target_prob_1, target_prob_2)).log()

print(log_probs)

tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])


In [17]:
# Average log-probability over all six target tokens.
avg_log_probs = log_probs.mean()
print(avg_log_probs)

tensor(-10.7940)


In [18]:
# Flip the sign to get the negative average log-probability.
neg_avg_log_probs = -avg_log_probs
print(neg_avg_log_probs)

tensor(10.7940)


In [19]:
# Shapes before flattening: logits carry a vocabulary axis, targets do not.
print(logits.shape, targets.shape, sep="\n")

torch.Size([2, 3, 50257])
torch.Size([2, 3])


In [20]:
# Merge the first two axes of the logits and fully flatten the targets so
# that row i of logits_flat lines up with entry i of targets_flat.
logits_flat = torch.flatten(logits, start_dim=0, end_dim=1)
targets_flat = torch.flatten(targets)
print(logits_flat.shape, targets_flat.shape, sep="\n")

torch.Size([6, 50257])
torch.Size([6])


In [21]:
# Single-call equivalent of the manual steps above; it prints the same value
# as the negative average log-probability.
loss = nn.functional.cross_entropy(input=logits_flat, target=targets_flat)
print(loss)

tensor(10.7940)


In [22]:
# Perplexity is the exponential of the cross-entropy loss.
perplexity = loss.exp()
print(perplexity)

tensor(48725.8203)


In [23]:
# Read the whole training text into memory as a single string.
with open("ch02/the-verdict.txt", "r", encoding="utf-8") as fin:
    raw_data = fin.read()

# Corpus size in characters and in tokens under the notebook's tokenizer.
total_characters, total_tokens = len(raw_data), len(tokenizer.encode(raw_data))
print(total_characters, total_tokens, sep="\n")

20479
5145


In [24]:
# Split on a character offset: the first 90% of the text is used for
# training and the remainder for validation.
train_ratio = 0.9
split_idx = int(train_ratio * len(raw_data))
train_data, val_data = raw_data[:split_idx], raw_data[split_idx:]

In [26]:
from codes.data import create_dataloader_v1

In [27]:
# Re-seed so that anything random inside the loaders is reproducible.
torch.manual_seed(123)

# Window length and stride are both set to the model's context length.
ctx_len = GPT_CONFIG_124M['context_length']

# Training loader: incomplete final batches are dropped. `shuffle` is left at
# create_dataloader_v1's default here -- NOTE(review): confirm that default
# is the intended setting for training.
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=ctx_len,
    stride=ctx_len,
    drop_last=True,
)

# Validation loader: keeps the final batch and does not shuffle.
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=ctx_len,
    stride=ctx_len,
    drop_last=False,
    shuffle=False,
)

In [28]:
print("Train loader:")
# Each batch unpacks into an (x, y) pair; print both shapes as a sanity check
# on batch size and window length.
# Note: x and y stay defined in the notebook namespace after the loop ends.
for x, y in train_loader:
    print(x.shape, y.shape)

Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [29]:
print("Validation loader:")
# Each batch unpacks into an (x, y) pair; print both shapes as a sanity check
# on batch size and window length.
# Note: x and y stay defined in the notebook namespace after the loop ends.
for x, y in val_loader:
    print(x.shape, y.shape)

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


In [30]:
from codes.losses import calc_loss_batch, calc_loss_loader

In [31]:
# Computation is pinned to the CPU.
# NOTE(review): before switching this to a GPU, confirm that the helpers in
# codes.losses move each batch to `device`.
device = torch.device("cpu")
# Module.to() returns the module, so as the cell's last expression its repr
# is echoed as the cell output.
model.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trans_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiheadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffn): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiheadAttention(
        (W_query): Linear(in_fea

In [32]:
# Baseline losses of the freshly initialised model; this is evaluation only,
# so gradient tracking is switched off.
with torch.no_grad():
    train_loss, val_loss = (
        calc_loss_loader(train_loader, model, device),
        calc_loss_loader(val_loader, model, device),
    )

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 10.987583372328016
Validation loss: 10.98110580444336
