# reproduce data load in train_gpt2.py

**learnings**

1: a way to add additional attribute to module: self.layer_name.attribute_name = value

2: the last layer bias == False to be the same as pretrained models from HF

3: use classmethod add from_pretrained method

4: a way to inspect input arguments: inspect.signature(torch.optim.AdamW).parameters

5: func(x) whether x is passed by value or the pointer, and how does different operations copy or not copy

example: x = some_value

        def func(x):
            x+= another_value
            x= a + another_value

conclusion: 
- if x is list, np.ndarray, torch.tensor: x is passed as pointer, and += modifies inplace, but x = x+another_value creates a deep copy
- if x is integer, str: x's deep copy is passed

6: changed the change_lr function perc_close_to_max_steps name to perc_close_to_max_steps 


**questions**

1: no dropout in his defined gpt
- none in F.scaled_dot_product_attention which is controlled by dropout_p param and defaults to 0
- none else where still about dropout

2: do we need to use copy_ to copy weights?

3: why only 2d weights uses weight-decay (L2), bias and layer norm do not
- for bias, it captures the average, there is no point in shrinking it
- similar for layer norm

4: grad_accum_steps is for n_iter for each mini_batch, how to determine it

In [1]:
import os 
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
import tiktoken
import math 
import json
import pandas as pd
import inspect

from config import local_dir

In [2]:
# set device: check if cuda or gpu is available
print(f"cuda available: {torch.cuda.is_available()}")
print(f"mps available: {hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()}")

device = "cpu"

def set_seed(seed: int) -> None:
    torch.manual_seed(12345)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

set_seed(12345)

cuda available: False
mps available: True


In [3]:
# load gpt to ensure the same naming
from transformers import GPT2LMHeadModel

model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
model_sd  = model_hf.state_dict()

names = [k for k in model_sd.keys()]

def get_distinct_keys(keys: list):

    distinct_keys = set()

    for key in keys:

        key_lst = key.split('.')

        if 'h' in key_lst:
            distinct_keys.add('.'.join(key_lst[3:]))
        elif 'transformer' not in key_lst:
            distinct_keys.add('-' + key)
        else:
            distinct_keys.add(key_lst[1])

    return distinct_keys


get_distinct_keys(names)

{'-lm_head.weight',
 'attn.c_attn.bias',
 'attn.c_attn.weight',
 'attn.c_proj.bias',
 'attn.c_proj.weight',
 'ln_1.bias',
 'ln_1.weight',
 'ln_2.bias',
 'ln_2.weight',
 'ln_f',
 'mlp.c_fc.bias',
 'mlp.c_fc.weight',
 'mlp.c_proj.bias',
 'mlp.c_proj.weight',
 'wpe',
 'wte'}

# 1. model

- learning 1: model.named_parameters() vs model.state_dict()
    - named_paramters() gives an iterator, and it only shows non-redundant set of weights
    - state_dict() is a dict, and it includes all defined weights
    - in gpt2's example, named_parameters has ONE FEWER weight layer because lm_head and wte weights have the same values 

- learning 2: self.apply(self._init_weights)

- learning 3: package logits and loss to be in the same output, consistent with HF gpt2

- learning 4: it is essential to avoid computation graph bloat. any operation that touches the weight, think about need to use no_grad()

- learning 5: when loading pretrained (to copy pretrained gpt weights to the new model instance), has to use weight.copy_(hf_weight) rather than weight = hf_weight. the former keeps REFERENCE intact. VERY IMPORTANT.

In [4]:
@dataclass
class GPTConfig:
    n_embd: int = 768
    n_head: int = 12
    n_layer: int = 2
    vocab_size: int = 50257
    n_positions: int = 1024# max sequence length

In [5]:
@dataclass
class ModelOutput:
    logits: torch.Tensor
    loss: torch.Tensor

In [6]:
# Create Block (Attention + Feed Forward, with residual connection and layer norm)

class CausalAttention(nn.Module):

    def __init__(self, config):
        
        super().__init__()
        self.config = config 

        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, config.n_embd*3)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1  # weights that impact initialization

        self.n_head = config.n_head
        self.n_embd = config.n_embd
    
    def forward(self, x):

        B, T, n_embd = x.shape

        d_q = n_embd // self.n_head

        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim = 2)
        q = q.contiguous().view(B, T, self.n_head, d_q).transpose(1,2)
        k = k.contiguous().view(B, T, self.n_head, d_q).transpose(1,2)
        v = v.contiguous().view(B, T, self.n_head, d_q).transpose(1,2)

        output = F.scaled_dot_product_attention(q, k, v, is_causal = True)
        
        output = output.transpose(1, 2).contiguous().view(B, T, -1)
        output = self.c_proj(output)

        return output

class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()

        self.c_fc = nn.Linear(config.n_embd, config.n_embd*4)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(config.n_embd*4, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1 

    def forward(self, x):

        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)

        return x

class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)
        

    def forward(self, x):

        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))

        return x

In [7]:
class Transformer(nn.Module):

    def __init__(self, config):

        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.n_positions, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd) # check this
            )
        )

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias = False)
        self.transformer.wte.weight = self.lm_head.weight # learning 1: by adding this constraint, the model's named_parameters decrease by 1
        self.apply(self._init_weights) # learning 2: self.apply(self._init_weights)

    @property
    def device(self):
        return next(self.parameters()).device


    def _init_weights(self, module):

        
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0, std=0.02)


    def forward(self, tokens, targets = None):

        B, T = tokens.shape

        token_embd = self.transformer.wte(tokens)
        pos = torch.arange(T, dtype = torch.long, device = tokens.device)
        pos_embd = self.transformer.wpe(pos)

        x = token_embd + pos_embd
        for block in self.transformer.h:
            x = block(x)

        logits = self.lm_head(x)
        
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T, -1), targets.view(-1))
        
        return ModelOutput(logits = logits, loss=loss) # learning 3: package logits and loss to be in the same output, consistent with HF gpt2
    
    @classmethod
    def from_pretrained(cls, model_type:str):
        """
        load all the weights from pretrained gpt2 model on HF
        """

        model2config = {'gpt2':        dict(n_layer = 12, n_head = 12, n_embd = 768),
                        'gpt2-medium': dict(n_layer = 24, n_head = 16, n_embd = 1024),
                        'gpt2-large':  dict(n_layer = 36, n_head = 20, n_embd = 1280),
                        'gpt2-xl':     dict(n_layer = 48, n_head = 25, n_embd = 7600),
                        }
        config_args = model2config[model_type]
        config_args['vocab_size'] = 50257
        config_args['n_positions'] = 1024

        config = GPTConfig(**config_args)
        model = cls(config) # Orignal code uses class name, which is not as good as cls
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)

        sd_model = model.state_dict() # it is a shallow copy, a new object dict and each key value is also a new object, but each element within each value (tensor) (e.g. sd_model[key][1,1]) is a reference to the orginal dagta
        sd_model_hf = model_hf.state_dict()

        need_to_transpose_weights = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        for param in sd_model:
            if any([param.endswith(x) for x in need_to_transpose_weights]):
                assert sd_model[param].shape == sd_model_hf[param].shape[::-1]
                with torch.no_grad(): # learning 4: it is essential to avoid computation graph bloat. any operation that touches the weight, think about need to use no_grad()
                    sd_model[param].copy_(sd_model_hf[param].T) # learning 5: when loading pretrained (to copy pretrained gpt weights to the new model instance), has to use weight.copy_(hf_weight) rather than weight = hf_weight. the former keeps REFERENCE intact. VERY IMPORTANT.
            else:
                assert sd_model[param].shape == sd_model_hf[param].shape, f'{param} do not share the same shape'
                with torch.no_grad():
                    sd_model[param].copy_(sd_model_hf[param])


        return model
    
    def configure_optimizer(self, learning_rate, weight_decay):

        
        params = {name:p for name,p in self.named_parameters()} # named_parameters is a iterator that imits tuples

        decay_params = [p for _, p in params.items() if p.requires_grad and p.dim() > 1]
        non_decay_params = [p for _, p in params.items() if p.requires_grad and p.dim() == 1]

        optim_groups = [{'params': decay_params, 'weight_decay': weight_decay},
                        {'params': non_decay_params, 'weight_decay': 0.0}
                    ]
        
        # fyi - print out num of params in each group
        num_decay_params = sum([p.numel() for p in decay_params])
        num_non_decay_params = sum([p.numel() for p in non_decay_params])
        print(f"number of decay and non-decay params are: {num_decay_params} and {num_non_decay_params}")

        optimizer = torch.optim.AdamW(optim_groups, lr = learning_rate, betas = (0.9, 0.95), eps = 1e-8)

        return optimizer
    
# test if it works create an instance with the same config as HF gpt2
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
config = GPTConfig()
for key in vars(config):
    setattr(config, key, getattr(gpt2.config, key))

model = Transformer(config).to(device)
encoder = tiktoken.get_encoding('gpt2')
model.configure_optimizer(0.0001, 0.2)

number of decay and non-decay params are: 124318464 and 121344


AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.95)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0001
    maximize: False
    weight_decay: 0.2

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.95)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0001
    maximize: False
    weight_decay: 0.0
)

# 2. data loader
same as 2_dataloader_re2

In [8]:
from typing import Tuple
class DataLoader:

    def __init__(self, B:int, T:int, data_dir:str, split:str):

        self.B = B
        self.T = T
        self.data_dir = data_dir
        self.split = split
        self.all_shards_paths = [ os.path.join(data_dir, filename) for filename in os.listdir(data_dir) if split in filename]
        self.reset()

    def reset(self):
        
        self.current_position = 0
        self.current_shard_index = 0
        self.current_shard = _load_tokens(self.all_shards_paths[self.current_shard_index])  # L2

    def next_batch(self) -> Tuple[torch.Tensor, torch.Tensor]:

        delta_n_tokens = self.B*self.T+1 
        batch = self.current_shard[self.current_position: self.current_position + delta_n_tokens]
        x = batch[:-1].view(self.B,self.T)        
        y = batch[1:].view(self.B,self.T)
        self.current_position += delta_n_tokens - 1 # Q1

        # evaluate if need to load next shard
        if self.current_position + delta_n_tokens > len(self.current_shard):
            self.current_shard_index = (self.current_shard_index + 1) % len(self.all_shards_paths)
            self.current_shard = _load_tokens(self.all_shards_paths[self.current_shard_index])
            self.current_position = 0
            print(self.current_shard_index, len(self.all_shards_paths))
        
        return x, y
    
def _load_tokens(filename): # L1

    """function to load tokens from a file and convert to tensor, used in data loader
    """

    data = np.load(filename)
    # data = data.astype(np.int32) # for values strictly within the uint16 range (0–65535), converting directly to torch.long without the intermediate np.int32 works fine.
    data_tensor = torch.tensor(data, dtype=torch.long)

    return data_tensor

B, T = 64, 16*16

train_loader = DataLoader(B, T, local_dir, 'train')
val_loader = DataLoader(B, T, local_dir, 'val')

n_tokens_per_batch = 524288 
assert n_tokens_per_batch % (B*T) == 0
grad_accum_steps = n_tokens_per_batch // (B*T)
grad_accum_steps

32

# 3. evaluation functions
- loss on val
- sentence generation
- hellaswag

In [9]:
# loss on val
def get_loss_on_val(model: nn.Module, val_steps = 20):
    
    model.eval()
    
    val_loss_accum = 0
    for _ in range(val_steps):

        x, y = val_loader.next_batch()
        x, y = x.to(model.device), y.to(model.device) # ! remember to move to the right device
        loss = model(x, y).loss

        loss = loss / val_steps 
        val_loss_accum += loss.detach()

    print(f"loss on val is {val_loss_accum.item()}")

    return val_loss_accum

def write_model_to_file(model, dir, train_step, loss):
    
    output_path = os.path.join(dir, f"model_{train_step: 05d}.pt")
    checkpoint = {'model': model,
                  "config": model.config,
                  "train_step":train_step,
                  'val_loss': loss.item()}
    
    torch.save(checkpoint, output_path)

In [10]:
# sentence generation
@torch.no_grad() # learning 2
def complete_sentence(model, 
                      encoder, 
                      text: str, 
                      n_examples: int = 4, 
                      max_n_generated_tokens: int = 30, 
                      top_k_to_include_in_random_draw: int = 20, 
                      temperature: float = 0.6):
    
    was_training = model.training

    model.eval() # learning 2

    #with torch.no_grad():
        
        # text to tensor
    tokens = encoder.encode_ordinary(text)
    tensor = torch.tensor(tokens).unsqueeze(0).repeat(n_examples, 1).to(model.device) # B * T 

    for _ in range(max_n_generated_tokens):

        # get probability for the next token
        logits = model(tensor).logits[:, -1, :] 
        probs = F.softmax(logits / temperature, dim = -1) # learning 1
        
        # top k samples
        top_probs, top_idx = torch.topk(probs, k = top_k_to_include_in_random_draw, dim=-1)   # B * k
        selected_idx_on_top_probs = torch.multinomial(top_probs, 1) # B * 1
        
        next_tokens = torch.gather(top_idx, -1, selected_idx_on_top_probs) # B * 1
        
        # concat the new token with existing
        tensor = torch.cat([tensor, next_tokens], dim = -1)

    # decode
    decoded = []
    for i in range(n_examples):

        tokens = tensor[i, :].tolist()
        if encoder.eot_token in tokens:
            tokens = tokens[: tokens.index(encoder.eot_token)]
        
        decoded.append(encoder.decode(tokens))

    if was_training:
        model.train()

    for _ in range(len(decoded)):
        print(decoded[i])

    return decoded

_ = complete_sentence(model, encoder, "What makes a person resilient")

What makes a person resilient Stre licence licence StreXX UCHIJ Arkansas info recomm NORottedULT Anthem Anthem uneven identifying NORYepYep Cox BUT TracyZone info grill uneven Slovenia� Neville Anthem
What makes a person resilient Stre licence licence StreXX UCHIJ Arkansas info recomm NORottedULT Anthem Anthem uneven identifying NORYepYep Cox BUT TracyZone info grill uneven Slovenia� Neville Anthem
What makes a person resilient Stre licence licence StreXX UCHIJ Arkansas info recomm NORottedULT Anthem Anthem uneven identifying NORYepYep Cox BUT TracyZone info grill uneven Slovenia� Neville Anthem
What makes a person resilient Stre licence licence StreXX UCHIJ Arkansas info recomm NORottedULT Anthem Anthem uneven identifying NORYepYep Cox BUT TracyZone info grill uneven Slovenia� Neville Anthem


In [11]:
# hellaswag, assuming already downloaded
def iter_hellaswag(split: str = 'val'):
    #download_hellaswag(split)
    with open(f"hellaswag/hellaswag_{split}.jsonl", "r") as f:
        n = 0
        for line in f:
            example = json.loads(line)
            yield example
            n += 1
            if n >= 50:
                break

def render_example(example, encoder):

    context  = example['ctx']
    label = int(example['label'])
    endings = example['endings']

    # create tokens
    context_tokens = encoder.encode_ordinary(context)
    context_len = len(context_tokens)

    masks = []
    tokens = []
    max_len = 0
    for ending in endings:
        ending_tokens = encoder.encode_ordinary(' ' + ending)
        ending_len = len(ending_tokens)
        max_len = max(max_len, context_len + ending_len)

        masks.append([0]* context_len + [1]*ending_len)
        tokens.append(context_tokens + ending_tokens)
     
    # convert to padded tensors
    padded_masks = torch.zeros((4, max_len), dtype=torch.long)
    padded_tokens = torch.zeros((4, max_len), dtype =torch.long)

    for i in range(4):
        curr_len = len(tokens[i])
        padded_masks[i, :curr_len] = torch.tensor(masks[i])
        padded_tokens[i, :curr_len] = torch.tensor(tokens[i])

    return padded_tokens, padded_masks, label

@torch.no_grad()
def eval_hellaswag(iterator, model, encoder):

    was_training = model.training
    model.eval()

    num_correct_norm = 0
    num_correct = 0
    num_total = 0

    for example in iterator:
        tokens, masks, label = render_example(example, encoder)

        tokens = tokens.to(model.device)
        x = tokens[:, :-1].contiguous()
        y = tokens[:, 1:].contiguous()         
        masks = masks.to(model.device)[:, 1:]  # B * T-1
        B = x.shape[0]

        # get prob
        logits = model(x).logits

        losses = F.cross_entropy(logits.view(-1, logits.shape[-1]), y.contiguous().view(-1), reduction='none').view(B, -1)

        masked_losses = losses * masks 
        
        total_losses = masked_losses.sum(dim =-1)
        avg_losses = total_losses / masks.sum(dim=-1)
        
        # eval if accurate
        num_correct += total_losses.argmin().item() == label 
        num_correct_norm += avg_losses.argmin().item() == label 
        num_total += 1

    print(f"evaluated {num_total} examples: {num_correct_norm} correct using avg prob. {num_correct} correct using total prob")

    if was_training:
        model.train()

    return num_correct_norm, num_correct, num_total

# 4. Train

In [12]:
# torch.set_float32_matmul_precision('high') # only relevant for subset of GPUs
model
os.makedirs('model', exist_ok=True)

# learning rate update
max_lr, min_lr  = 6e-4, 6e-5
warm_up_steps, max_steps = 715, 19073 # 19,073 steps is ~1 epoch, if data is 10B tokens and batch size 0.5M tokens

def get_learning_rate(step):

    if step < warm_up_steps:
        lr = max_lr * (step + 1)/warm_up_steps
    elif step > max_steps:
        lr = min_lr 
    else:
        decay_ratio = (step - warm_up_steps) / (max_steps - warm_up_steps)
        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
        lr = min_lr + coeff * (max_lr - min_lr)

    return lr

In [13]:

optimizer = model.configure_optimizer(learning_rate = max_lr, weight_decay = 0.1)


max_train_step = 10
hellaswag_iterator = iter_hellaswag()

for step in range(max_train_step):
    
    # eval
    if step % 2 == 0 or step == max_train_step:

        print(f"eval results on step {step}")
        
        # loss on val
        loss_on_val = get_loss_on_val(model, val_steps = 20)
        write_model_to_file(model, 'model', step, loss_on_val)
        
        # sentence generation
        _ = complete_sentence(model, encoder, 'what makes a person resilient')

        # hellaswage evaluation
        eval_hellaswag(hellaswag_iterator, model, encoder)
    
    # train
    model.train()
    optimizer.zero_grad()
    loss = 0

    for mini_batch in range(grad_accum_steps):

        x, y = train_loader.next_batch()

        output = model(x, y)
        logits, mini_batch_loss = output.logits, output.loss
        mini_batch_loss = mini_batch_loss / grad_accum_steps

        loss += mini_batch_loss.detach()

        mini_batch_loss.backward()   

    # update learning rate and gradient
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    lr = get_learning_rate(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    
    print(f"step {step}: loss is {loss}")
    optimizer.step()

    # TODO add learning rate adpatation and gradiant clipping

number of decay and non-decay params are: 124318464 and 121344
eval results on step 0
loss on val is 11.26953125
what makes a person resilientign PSUmicrosoftmicrosoft StreAnything Affect trespass rive PSU licence IST uneven surprisinglyPRO rout rigs PSU prisonerolution closes Emil IST introductory bigotnian 364Point bigot augmented
what makes a person resilientign PSUmicrosoftmicrosoft StreAnything Affect trespass rive PSU licence IST uneven surprisinglyPRO rout rigs PSU prisonerolution closes Emil IST introductory bigotnian 364Point bigot augmented
what makes a person resilientign PSUmicrosoftmicrosoft StreAnything Affect trespass rive PSU licence IST uneven surprisinglyPRO rout rigs PSU prisonerolution closes Emil IST introductory bigotnian 364Point bigot augmented
what makes a person resilientign PSUmicrosoftmicrosoft StreAnything Affect trespass rive PSU licence IST uneven surprisinglyPRO rout rigs PSU prisonerolution closes Emil IST introductory bigotnian 364Point bigot augmented