In [1]:

%load_ext autoreload
%autoreload 2

In [2]:
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim

# Evaluating text model

In [3]:
from previous_chapters import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256, 
    "emb_dim": 768,
    "n_heads": 12, 
    "n_layers": 12,
    "drop_rate": 0.1, 
    "qkv_bias": False,
}

mx.random.seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(50257, 768)
  (drop_emb): Dropout(p=0.09999999999999998)
  (trf_blocks): Sequential(
    (layers.0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(input_dims=768, output_dims=768, bias=False)
        (W_key): Linear(input_dims=768, output_dims=768, bias=False)
        (W_value): Linear(input_dims=768, output_dims=768, bias=False)
        (out_proj): Linear(input_dims=768, output_dims=768, bias=True)
        (dropout): Dropout(p=0.09999999999999998)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (layers.0): Linear(input_dims=768, output_dims=3072, bias=True)
          (layers.1): GELU()
          (layers.2): Linear(input_dims=3072, output_dims=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.09999999999999998)
    )
    (layers.1): TransformerBlock(
      (att): MultiHeadAttention(
        (

In [4]:
import tiktoken
from previous_chapters import generate_text_simple

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    encoded_tensor = mx.array(encoded)[None, :]
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze().tolist()
    return tokenizer.decode(flat)

In [5]:
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)
token_ids_to_text(token_ids, tokenizer)

'Every effort moves you rational ErraterasuOregonOregon Stevenson bur StevensonOregonShot'

## Calculating the text generation loss: cross-entropy and perplexity

In [6]:
inputs = mx.array([[16833, 3626, 6100],   # ["every effort moves",
                   [40, 1107, 588]])      #  "I really like"]
targets = mx.array([[3626, 6100, 345],    # [" effort moves you"
                    [1107, 588, 11311]])  #  " really like chocolate"]

In [7]:
logits = model(mx.stop_gradient(inputs))
probas = nn.softmax(logits, axis=-1)
probas, probas.shape

(array([[[2.4659e-05, 7.09139e-06, 5.94067e-05, ..., 8.50546e-06, 3.1108e-05, 3.1253e-05],
         [2.19308e-05, 7.08788e-06, 4.74957e-05, ..., 9.22388e-06, 1.9442e-05, 1.9591e-05],
         [1.83429e-05, 8.42821e-06, 4.95861e-05, ..., 9.82874e-06, 2.17547e-05, 1.95116e-05]],
        [[2.73172e-05, 9.37263e-06, 1.38779e-05, ..., 2.61956e-05, 4.36025e-05, 7.52438e-06],
         [2.4238e-05, 8.87468e-06, 1.3109e-05, ..., 3.36475e-05, 3.15629e-05, 7.75805e-06],
         [2.17332e-05, 8.64109e-06, 1.32396e-05, ..., 3.29413e-05, 2.869e-05, 8.55843e-06]]], dtype=float32),
 (2, 3, 50257))

In [8]:
token_ids = mx.argmax(probas, axis=-1, keepdims=True)
token_ids

array([[[1158],
        [40697],
        [17149]],
       [[30526],
        [29643],
        [46678]]], dtype=uint32)

In [9]:
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")


Targets batch 1:  effort moves you
Outputs batch 1: ves paed icons


In [10]:
text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: array([1.26521e-05, 6.10094e-05, 9.74481e-06], dtype=float32)
Text 2: array([8.53397e-06, 1.31845e-05, 1.62757e-05], dtype=float32)


In [11]:
# we want to maximize the log probability of the targets
log_probas = mx.log(mx.concat([target_probas_1, target_probas_2], axis=0))
log_probas

array([-11.2777, -9.70448, -11.5388, -11.6715, -11.2365, -11.0258], dtype=float32)

In [12]:
avg_log_probas = mx.mean(log_probas)
avg_log_probas

array(-11.0758, dtype=float32)

In [13]:
# in training, we minimize the negative log likelihood
neg_avg_log_probas = -avg_log_probas
neg_avg_log_probas

array(11.0758, dtype=float32)

In [14]:
logits.flatten(0, 1).shape, targets.flatten().shape

((6, 50257), (6,))

In [15]:
loss = nn.losses.cross_entropy(logits.flatten(0, 1), targets.flatten(),
                               reduction="mean")
loss

array(11.0758, dtype=float32)

In [16]:
# perplexity is exp of the cross-entropy loss
perplexity = mx.exp(loss)
perplexity

array(64588.1, dtype=float32)

## Calculating the training and validation set losses

In [17]:
with open("../ch02/the-verdict.txt", "r") as f:
    text_data = f.read()

In [18]:
text_data[:99], text_data[-99:]

('I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no ',
 'it for me! The Strouds stand alone, and happen once--but there\'s no exterminating our kind of art."')

In [19]:
total_characters, total_tokens = len(text_data), len(tokenizer.encode(text_data))
total_characters, total_tokens

(20479, 5145)

In [20]:
from previous_chapters import gpt_dataset_v1
tokenizer = tiktoken.get_encoding("gpt2")

train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

mx.random.seed(123)
train_loader = gpt_dataset_v1(
    train_data,
    tokenizer,
    GPT_CONFIG_124M["context_length"],
    GPT_CONFIG_124M["context_length"],
    batch_size=2,
    shuffle=True,
)
val_loader = gpt_dataset_v1(
    val_data,
    tokenizer,
    GPT_CONFIG_124M["context_length"],
    GPT_CONFIG_124M["context_length"],
    batch_size=2,
    shuffle=True,
)

if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the trainig loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `training_ratio`")
    
if total_tokens * (1-train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `training_ratio`")
print("Train loader:")

train_tokens = 0
for ex in iter(train_loader[0]):
    print(ex['input_ids'].shape, ex['target_ids'].shape)
    train_tokens += ex['input_ids'].size

val_tokens = 0
print("\nValidation loader:")
for ex in iter(val_loader[0]):
    print(ex['input_ids'].shape, ex['target_ids'].shape)
    val_tokens += ex['input_ids'].size

train_tokens, val_tokens

Train loader:
(2, 256) (2, 256)
(2, 256) (2, 256)
(2, 256) (2, 256)
(2, 256) (2, 256)
(2, 256) (2, 256)
(2, 256) (2, 256)
(2, 256) (2, 256)
(2, 256) (2, 256)
(2, 256) (2, 256)

Validation loader:
(2, 256) (2, 256)


(4608, 512)

In [21]:
def compute_ce_loss(logits, target):
    loss = nn.losses.cross_entropy(logits.flatten(0, 1), target.flatten(), 
                                   reduction="mean")
    return loss

def calc_loss_batch(input_batch, target_batch, model, no_grad, device=None):
    input_batch, target_batch = mx.array(input_batch), mx.array(target_batch, dtype=mx.int32)

    if no_grad:
        input_batch = mx.stop_gradient(input_batch)
    logits = model(input_batch)
    loss, grad = nn.value_and_grad(model, compute_ce_loss)(logits, target_batch)
    return loss, grad

def calc_loss_loader(data_loader, model, no_grad, device=None, num_batches=None):
    total_loss = 0.
    if data_loader[1] == 0:
        return float('nan')
    elif num_batches is None:
        num_batches = data_loader[1]
    else:
        # reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, data_loader[1])
    for i, ex in enumerate(iter(data_loader[0])):
        if i < num_batches:
            loss, _ = calc_loss_batch(ex['input_ids'], ex['target_ids'], model, no_grad, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches    

In [22]:
mx.random.seed(123)
# TODO: shouldn't require to start the dataloader again
train_loader = gpt_dataset_v1(
    train_data,
    tokenizer,
    GPT_CONFIG_124M["context_length"],
    GPT_CONFIG_124M["context_length"],
    batch_size=2,
    shuffle=True,
)
val_loader = gpt_dataset_v1(
    val_data,
    tokenizer,
    GPT_CONFIG_124M["context_length"],
    GPT_CONFIG_124M["context_length"],
    batch_size=2,
    shuffle=True,
)

train_loss = calc_loss_loader(train_loader, model, True)
val_loss = calc_loss_loader(val_loader, model, True)
train_loss, val_loss

(5.531623946295844, 5.520528793334961)

# Training an LLM

In [23]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs, 
                       eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, 0

    for epoch in range(num_epochs):
        # TODO: shouldn't require to start the dataloader again
        train_loader = gpt_dataset_v1(
            train_data,
            tokenizer,
            GPT_CONFIG_124M["context_length"],
            GPT_CONFIG_124M["context_length"],
            batch_size=2,
            shuffle=True,
        )        
        model.train()
        for ex in iter(train_loader[0]):
            input_batch, target_batch = ex['input_ids'], ex['target_ids']
            loss, grad = calc_loss_batch(input_batch, target_batch, model, False)
            optimizer.update(model, grad)
            # Force a graph evaluation
            mx.eval(model.parameters(), optimizer.state)
            train_loss = loss.item()
            train_losses.append(train_loss)

            tokens_seen += input_batch.size
            global_step += 1

            if global_step % eval_freq == 0:
                val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                # train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Epoch {epoch+1}, step {global_step:06d}: "
                      f"train loss {train_loss:.3f}, val loss {val_loss:.3f}")
        generate_and_print_sample(model, tokenizer, device, start_context)
    return train_losses, val_losses, track_tokens_seen

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    val_loader = gpt_dataset_v1(
        val_data,
        tokenizer,
        GPT_CONFIG_124M["context_length"],
        GPT_CONFIG_124M["context_length"],
        batch_size=2,
        shuffle=True,
    )
    # train_loss = calc_loss_loader(train_loader, model, True, device, eval_iter)
    val_loss = calc_loss_loader(val_loader, model, True, device, eval_iter)
    model.train()
    return val_loss

def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer)
    token_ids = generate_text_simple(
        model=model,
        idx=mx.stop_gradient(encoded),
        max_new_tokens=50,
        context_size=context_size,
    )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))
    model.train()

In [24]:
mx.random.seed(123)
model = GPTModel(GPT_CONFIG_124M)
optimizer = optim.AdamW(learning_rate=4e-4, weight_decay=1e-1)

# TODO: shouldn't require to start the dataloader again
train_loader = gpt_dataset_v1(
    train_data,
    tokenizer,
    GPT_CONFIG_124M["context_length"],
    GPT_CONFIG_124M["context_length"],
    batch_size=2,
    shuffle=True,
)
val_loader = gpt_dataset_v1(
    val_data,
    tokenizer,
    GPT_CONFIG_124M["context_length"],
    GPT_CONFIG_124M["context_length"],
    batch_size=2,
    shuffle=True,
)


num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, None, 
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer
)

Epoch 1, step 000005: train loss 11.018, val loss 5.520
Every effort moves you rational ErraterasuOregonOregon Stevenson bur StevensonOregonShotaterasuShotaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasu BRE hrs hrsaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasu BRE
Epoch 2, step 000010: train loss 11.047, val loss 5.520
Epoch 2, step 000015: train loss 11.034, val loss 5.520
Every effort moves you rational ErraterasuOregonOregon Stevenson bur StevensonOregonShotaterasuShotaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasu BRE hrs hrsaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasuaterasu BRE
Epoch 3, step 000020: train loss 11.038, val loss 5.520
Epoch 3, step 000025: train loss 11.043, val loss 5.520
Every effort