In [1]:
import mlx.core as mx
import mlx.nn as nn

# Evaluating the text model

In [2]:
from previous_chapters import GPTModel

# Hyperparameters handed to GPTModel; every key is a plain string so the
# mapping is identical to the equivalent dict literal.
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=256,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

# Fix the random seed before constructing the model so the initial weights
# (and therefore every number printed below) are reproducible.
mx.random.seed(123)
model = GPTModel(GPT_CONFIG_124M)

# NOTE(review): the repr printed by this cell shows
# `(pos_emb): Embedding(50257, 768)` although context_length is 256 --
# confirm how GPTModel in previous_chapters sizes its positional embedding.
# Switch to evaluation mode; as the cell's last expression this also
# displays the module structure.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(50257, 768)
  (drop_emb): Dropout(p=0.09999999999999998)
  (trf_blocks): Sequential(
    (layers.0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(input_dims=768, output_dims=768, bias=False)
        (W_key): Linear(input_dims=768, output_dims=768, bias=False)
        (W_value): Linear(input_dims=768, output_dims=768, bias=False)
        (out_proj): Linear(input_dims=768, output_dims=768, bias=True)
        (dropout): Dropout(p=0.09999999999999998)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (layers.0): Linear(input_dims=768, output_dims=3072, bias=True)
          (layers.1): GELU()
          (layers.2): Linear(input_dims=3072, output_dims=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.09999999999999998)
    )
    (layers.1): TransformerBlock(
      (att): MultiHeadAttention(
        (

In [9]:
import tiktoken
from previous_chapters import generate_text_simple

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into token ids and return them with a leading axis of size 1.

    Args:
        text: The string to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``.

    Returns:
        An ``mx.array`` built from the encoded ids with a new first axis
        prepended. ``<|endoftext|>`` is accepted as a special token.
    """
    ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    return mx.expand_dims(mx.array(ids), axis=0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode an array of token ids back into a string.

    Args:
        token_ids: Array-like exposing ``squeeze()`` and ``tolist()``; size-1
            axes are dropped before decoding.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flattened id list.
    """
    flat = token_ids.squeeze().tolist()
    # A single-token input squeezes down to a 0-d array, for which tolist()
    # yields a bare Python scalar instead of a list; wrap it so decode()
    # always receives a list of ids.
    if not isinstance(flat, list):
        flat = [flat]
    return tokenizer.decode(flat)

In [4]:
# GPT-2 byte-pair tokenizer plus a short prompt for the model to continue.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

# Extend the prompt by 10 tokens. The model was only just constructed above,
# so the continuation is not expected to be meaningful text.
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)

# Last expression of the cell: show the decoded continuation.
token_ids_to_text(token_ids, tokenizer)

'Every effort moves you rational ErraterasuOregonOregon Stevenson bur StevensonOregonShot'

In [5]:
# Two toy sequences of three token ids each.
inputs = mx.array(
    [
        [16833, 3626, 6100],  # "every effort moves"
        [40, 1107, 588],      # "I really like"
    ]
)

# The same sequences shifted left by one position: each target is the token
# that follows the corresponding input token.
targets = mx.array(
    [
        [3626, 6100, 345],    # " effort moves you"
        [1107, 588, 11311],   # " really like chocolate"
    ]
)

In [6]:
# Forward pass over both sequences. No mx.stop_gradient is needed: MLX only
# computes gradients inside function transforms such as mx.grad or
# nn.value_and_grad, and none is active here.
logits = model(inputs)

# Normalise over the last (vocabulary) axis so each position holds a
# probability distribution over next tokens.
probas = nn.softmax(logits, axis=-1)

# Display the probabilities together with their shape.
probas, probas.shape

(array([[[2.4659e-05, 7.09139e-06, 5.94067e-05, ..., 8.50546e-06, 3.1108e-05, 3.1253e-05],
         [2.19308e-05, 7.08788e-06, 4.74957e-05, ..., 9.22388e-06, 1.9442e-05, 1.9591e-05],
         [1.83429e-05, 8.42821e-06, 4.95861e-05, ..., 9.82874e-06, 2.17547e-05, 1.95116e-05]],
        [[2.73172e-05, 9.37263e-06, 1.38779e-05, ..., 2.61956e-05, 4.36025e-05, 7.52438e-06],
         [2.4238e-05, 8.87468e-06, 1.3109e-05, ..., 3.36475e-05, 3.15629e-05, 7.75805e-06],
         [2.17332e-05, 8.64109e-06, 1.32396e-05, ..., 3.29413e-05, 2.869e-05, 8.55843e-06]]], dtype=float32),
 (2, 3, 50257))

In [7]:
# Most likely next-token id at every position; keepdims retains the reduced
# last axis with size 1.
token_ids = probas.argmax(axis=-1, keepdims=True)
token_ids

array([[[1158],
        [40697],
        [17149]],
       [[30526],
        [29643],
        [46678]]], dtype=uint32)

In [10]:
# Compare the expected continuation of the first sequence with the model's
# highest-probability prediction at each position.
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].reshape(-1), tokenizer)}")


Targets batch 1:  effort moves you
Outputs batch 1: ves paed icons


In [12]:
# Probability the model assigns to the *correct* next token at each position.
# The position index is derived from the targets' sequence length instead of
# being hard-coded, so the cell keeps working if the toy sequences change size.
positions = mx.arange(targets.shape[1])

text_idx = 0
target_probas_1 = probas[text_idx, positions, targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, positions, targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: array([1.26521e-05, 6.10094e-05, 9.74481e-06], dtype=float32)
Text 2: array([8.53397e-06, 1.31845e-05, 1.62757e-05], dtype=float32)


In [None]:
# Training aims to maximise the log-probability of the target tokens, so
# collect the log of every target probability into one flat array.
log_probas = mx.concatenate(
    [mx.log(target_probas_1), mx.log(target_probas_2)],
    axis=0,
)
log_probas

array([-11.2777, -9.70448, -11.5388, -11.6715, -11.2365, -11.0258], dtype=float32)

In [14]:
# Average log-probability over all target tokens.
avg_log_probas = log_probas.mean()
avg_log_probas

array(-11.0758, dtype=float32)

In [None]:
# Optimisers minimise, so training uses the negated average log-probability
# (the negative log-likelihood) as its objective.
neg_avg_log_probas = mx.negative(avg_log_probas)
neg_avg_log_probas

array(11.0758, dtype=float32)

In [20]:
# Shapes after merging the batch and position axes: one row of vocabulary
# logits per token, and one target id per token.
logits.reshape(-1, logits.shape[-1]).shape, targets.reshape(-1).shape

((6, 50257), (6,))

In [25]:
# Built-in cross-entropy on the flattened logits/targets; with mean reduction
# it matches the negative average log-probability computed by hand above.
loss = nn.losses.cross_entropy(
    logits.reshape(-1, logits.shape[-1]),
    targets.reshape(-1),
    reduction="mean",
)
loss

array(11.0758, dtype=float32)

In [27]:
# Perplexity is the exponential of the cross-entropy loss.
perplexity = loss.exp()
perplexity

array(64588.1, dtype=float32)