# Pretraining the model and text generation

Using unlabeled data to train a new model

In [10]:
import torch
import myllm.util
import myllm.gpt as gpt
import tiktoken

In [11]:
# Start from the 124M preset but work on a private copy, so the shared
# module-level dictionary in `gpt` is not modified as a side effect.
# (dict(...) is a shallow copy; assumes the preset is a flat mapping.)
gpt_config = dict(gpt.GPT_CONFIG_124M)
gpt_config["context_length"] = 256  # shorter context window for faster training

# Fixed seed so the randomly initialised weights are reproducible.
torch.manual_seed(123)
# Build the model from the edited config, not from the untouched preset.
model = gpt.GPTModel(gpt_config)

# Switch to evaluation mode; as the last expression, this also displays the model.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [12]:
# TODO: move these helper functions into the myllm.gpt module

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the token IDs as a one-row tensor.

    Args:
        text: String to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``.
            The ``<|endoftext|>`` marker is passed as an allowed special token.

    Returns:
        A tensor built from the encoded IDs with an extra leading axis of
        size one, i.e. shape ``(1, num_tokens)`` for a flat list of IDs.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Indexing with None prepends an axis of size one (same as unsqueeze(0)).
    return torch.tensor(ids)[None]

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token IDs back into text.

    Args:
        token_ids: Tensor of token IDs, either 1-D ``(num_tokens,)`` or with
            a leading axis of size one, ``(1, num_tokens)``.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the list of IDs.
    """
    flat = token_ids.squeeze(0)
    # squeeze(0) turns a one-element 1-D tensor into a 0-d scalar, whose
    # tolist() is a bare int rather than a list; restore one dimension so
    # the tokenizer always receives a list.
    if flat.dim() == 0:
        flat = flat.unsqueeze(0)
    return tokenizer.decode(flat.tolist())

In [13]:
# Tokenizer and prompt for a quick generation check with the untrained model.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

# Encode the prompt into a one-row tensor of token IDs.
tid = text_to_token_ids(start_context, tokenizer)

# Request 10 new tokens, using the configured context length as context size.
token_ids = myllm.util.generate_text_simple(
    model=model,
    idx=tid,
    max_new_tokens=10,
    context_size=gpt_config["context_length"],
)

# Decode the returned IDs so the result can be read as text.
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you rentingetic minion mobilizedYang outlined Warhammer Lead nylonestones


## Assessing the quality of the generated text

In [14]:
# Loss walkthrough on a tiny hand-written batch.
torch.manual_seed(123)

# Token IDs of the two input texts, one row per text.
inputs = torch.tensor([
    [16833, 3626, 6100],  # every effort moves
    [40, 1107, 588],      # I really like
])

# Expected outputs: each input row shifted one token to the left,
# with the next token appended.
targets = torch.tensor([
    [3626, 6100, 345],    # effort moves you
    [1107, 588, 11311],   # really like chocolate
])

# Forward pass only; no gradients are needed to inspect the predictions.
with torch.no_grad():
    logits = model(inputs)

# Softmax over the last axis turns each logit vector into a probability distribution.
probas = logits.softmax(dim=-1)
print(probas.shape)

# The index of the largest probability is the predicted token ID;
# keepdim=True keeps a trailing axis of size one.
token_ids = probas.argmax(dim=-1, keepdim=True)
print(token_ids)

torch.Size([2, 3, 50257])
tensor([[[36195],
         [16031],
         [42826]],

        [[14212],
         [ 7822],
         [ 2086]]])


In [15]:
# Convert the first sample's token IDs back to text so the expected
# continuation can be compared with the model's argmax prediction.
# token_ids[0] carries a trailing axis of size one (argmax was called with
# keepdim=True), so it is flattened to 1-D before decoding.

print(f"Target batch: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Output batch: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Target batch:  effort moves you
Output batch: lif savesNetflix


In [16]:
# Probability the model assigns to the correct next token at each position.
# The position index is derived from the targets' length instead of a
# hard-coded [0, 1, 2], so the lookup keeps working for other sequence lengths.
token_positions = torch.arange(targets.shape[-1], device=targets.device)

# First text in the batch.
text_idx = 0
target_probas_1 = probas[text_idx, token_positions, targets[text_idx]]
print(target_probas_1)

# Second text in the batch.
text_idx = 1
target_probas_2 = probas[text_idx, token_positions, targets[text_idx]]
print(target_probas_2)

tensor([4.0949e-05, 1.7552e-05, 1.1681e-05])
tensor([1.2474e-05, 4.3171e-05, 5.3395e-06])


In [17]:
# Work with log-probabilities, which are easier to handle mathematically
# than the raw (very small) probabilities.
log_probas = torch.cat([target_probas_1, target_probas_2]).log()
print(log_probas)

# Average over all target tokens; the goal is to get this as close to zero as possible.
avg_log_probas = log_probas.mean()
print(avg_log_probas)

# Negating the average log-probability gives the cross-entropy loss.
neg_avg_log_probas = -avg_log_probas
print(neg_avg_log_probas)


tensor([-10.1032, -10.9503, -11.3576, -11.2919, -10.0503, -12.1404])
tensor(-10.9823)
tensor(10.9823)


In [18]:
print("logits shape", logits.shape)
print("targets shape", targets.shape)

# cross_entropy is called with one row of logits per prediction and one
# target index per row, so the batch and token axes are merged first.
logits_flat = torch.flatten(logits, start_dim=0, end_dim=1)
targets_flat = torch.flatten(targets)

print("flattened logis", logits_flat.shape)
print("flattened target", targets_flat.shape)

# Cross-entropy measures the difference between the predicted distribution
# and the target; it matches the negative average log-probability computed by hand.
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)

logits shape torch.Size([2, 3, 50257])
targets shape torch.Size([2, 3])
flattened logis torch.Size([6, 50257])
flattened target torch.Size([6])
tensor(10.9823)


In [19]:

# Perplexity is the exponential of the cross-entropy loss. It measures how
# well the predicted distribution matches the actual distribution of tokens
# in the data; lower is better.
perplexity = loss.exp()
print(perplexity)
# Perplexity can be read as the effective number of tokens the model is
# undecided between at each step. A value of about 58,822 is even larger than
# the 50,257-token vocabulary, i.e. this untrained model does worse than
# guessing uniformly at random.

tensor(58822.5586)
