In [19]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import random
import time
import math
import tiktoken
import inspect
import os


from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist

In [20]:
# --- Model / batch hyperparameters -------------------------------------------
vocab_size = 50304      # 50257 GPT-2 tokens rounded up to 393 * 128
batch_size = 2**19      # tokens consumed per optimizer step (all ranks together)
mini_batches = 8        # micro-batch size B fed to the model in one forward pass
time_stamps = 512       # sequence length T of every training sample
context_len = 1024      # size of the learned position-embedding table
emb_neur = 768          # embedding / residual stream width
epochs = 1              # number of optimizer steps executed by the training loop
num_blocks = 12
num_heads = 12
# dropout_neur = 0.2

# --- Paths and logging --------------------------------------------------------
data_dir = "edu_fineweb10B"
log_dir = "log"
checkpoints_frequency = 2000
log_file = os.path.join(log_dir, "log.txt")
val_log_file = os.path.join(log_dir, "val_log.txt")

# --- Optimizer / schedule -----------------------------------------------------
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_lr_steps = 700
weight_decay = 0.1
beta1, beta2 = 0.9, 0.95


enc = tiktoken.get_encoding("gpt2")


# --- Distributed setup --------------------------------------------------------
# A launcher such as torchrun exports RANK / LOCAL_RANK / WORLD_SIZE; when RANK
# is absent the notebook runs as a single process.
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
    init_process_group(backend="nccl")
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0
else:
    ddp_rank = 0
    ddp_local_rank = 0
    ddp_world_size = 1
    master_process = True
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"device: {device}")


torch.manual_seed(1337)
# Under DDP the device string is "cuda:<local_rank>", so compare by prefix;
# an equality test against 'cuda' would silently skip CUDA seeding there.
if device.startswith('cuda'):
    torch.cuda.manual_seed(1337)

device: cuda


In [21]:
def get_lr(epoch):
    """Return the learning rate for optimizer step ``epoch``.

    The schedule has three phases, driven by the notebook-level constants
    ``warmup_lr_steps``, ``epochs``, ``max_lr`` and ``min_lr``:

    1. linear warm-up towards ``max_lr`` while ``epoch < warmup_lr_steps``;
    2. ``min_lr`` once ``epoch`` has reached ``epochs``;
    3. cosine decay from ``max_lr`` down to ``min_lr`` in between.

    Args:
        epoch: zero-based index of the current optimizer step.

    Returns:
        The learning rate as a float.
    """
    if epoch < warmup_lr_steps:
        return (max_lr * (epoch+1)/warmup_lr_steps)
    # ">=" rather than ">": at epoch == epochs the cosine term is exactly
    # min_lr anyway, and stopping here avoids dividing by zero when
    # epochs == warmup_lr_steps.
    if epoch >= epochs:
        return min_lr
    loc = (epoch - warmup_lr_steps)/(epochs - warmup_lr_steps)
    coef = 0.5 * (1.0 + math.cos(math.pi * loc))
    return min_lr + coef * (max_lr - min_lr)

In [22]:
# One optimizer step consumes `batch_size` tokens in total. Each forward pass
# covers mini_batches * time_stamps tokens on each of the ddp_world_size ranks,
# so the remainder must be zero for the accumulation count to be a whole number.
assert batch_size % (mini_batches * time_stamps * ddp_world_size) == 0, "batch_size is not devided by B and T and number_of_gpus"
# Number of micro-batches accumulated per optimizer step on each rank.
mini_epochs = batch_size // (mini_batches * time_stamps * ddp_world_size)

def load_tokens(filename):
    """Read one token shard from disk and return it as a 1-D ``torch.long`` tensor.

    Args:
        filename: path of a NumPy file readable by ``np.load``.

    Returns:
        A tensor of dtype ``torch.long`` holding the shard's values.
    """
    # Go through int32 first (as the original pipeline does) before widening
    # to the int64 indices that nn.Embedding expects.
    token_array = np.load(filename).astype(np.int32)
    return torch.tensor(token_array, dtype=torch.long)

class DataLoader():
    """Sequential, shard-based batch provider for (optionally distributed) training.

    Shards are the files in ``data_dir`` whose name contains ``split``, visited
    in sorted order and wrapped around at the end. Within a shard, process
    ``cur_process`` reads windows of ``B*T + 1`` tokens starting at
    ``cur_process * B * T`` and moves forward by ``B * T * num_processes``
    after each batch, so the processes read disjoint, interleaved windows.

    Args:
        B: number of sequences per batch.
        T: number of tokens per sequence.
        cur_process: index of this process among ``num_processes``.
        num_processes: total number of cooperating processes.
        data_dir: directory containing the shard files.
        split: substring selecting the shard files (e.g. "train" / "val").
    """

    def __init__(self, B, T, cur_process, num_processes, data_dir, split):
        self.B = B
        self.T = T
        self.cur_process = cur_process
        self.cur_shard = 0
        self.num_processes = num_processes
        self.data_dir = data_dir

        shard_names = sorted(s for s in os.listdir(self.data_dir) if split in s)
        assert len(shard_names) > 0, f"no shards found for split {split!r} in {data_dir!r}"
        self.shards = [os.path.join(self.data_dir, s) for s in shard_names]

        self.tokens = load_tokens(self.shards[self.cur_shard])

        # Offset into self.tokens of the next window this process will read.
        self.current_step = cur_process * B * T

        # Estimate only: assumes every shard is as long as the first one.
        print(f"loaded ~{len(self.tokens)*len(self.shards)} tokens")

    def reset(self):
        """Rewind to the beginning of the first shard."""
        self.cur_shard = 0
        self.tokens = load_tokens(self.shards[self.cur_shard])
        self.current_step = self.B * self.T * self.cur_process

    def next_batch(self):
        """Return the next ``(x, y)`` pair, each of shape ``(B, T)``.

        ``y`` is ``x`` shifted left by one token. After the batch is taken the
        read offset is advanced; if the following round of reads would not fit
        in the current shard, the next shard is loaded.
        """
        B, T = self.B, self.T

        # Read at the current offset first, then advance, so the first window
        # of every shard is actually used.
        window = self.tokens[self.current_step:self.current_step + B*T + 1]
        x = (window[:-1]).view(B, T)
        y = (window[1:]).view(B, T)

        self.current_step += B * T * self.num_processes

        # Decide on the rank-independent start of the next round so that all
        # processes switch shards on the same step; the last process' window
        # ends at round_start + B*T*num_processes + 1.
        round_start = self.current_step - self.cur_process * B * T
        if round_start + B * T * self.num_processes + 1 > len(self.tokens):
            self.cur_shard = (self.cur_shard + 1) % len(self.shards)
            self.tokens = load_tokens(self.shards[self.cur_shard])
            self.current_step = self.cur_process * B * T
        return x, y

In [23]:
class SelfAttention(nn.Module):
    """Causal multi-head self-attention over a ``(B, T, emb_neur)`` input.

    Args:
        num_heads: number of attention heads; must divide ``emb_neur``.
    """

    def __init__(self, num_heads):
        super().__init__()
        # Validate once at construction instead of on every forward pass.
        assert emb_neur % num_heads == 0, "Embedding dimension must be divisible by number of heads"
        # Keep the constructor argument so forward() does not depend on a
        # notebook-level global of the same name.
        self.num_heads = num_heads
        # A single projection produces queries, keys and values together.
        self.qkv = nn.Linear(emb_neur, 3 * emb_neur)
        self.proj = nn.Linear(emb_neur, emb_neur)
        # Marker read by GPT._init_weights to shrink the init std of layers
        # whose output is added to the residual stream.
        self.proj.COMES_TO_RESIDUAL = 1

    def forward(self, idx):
        """Apply causal attention.

        Args:
            idx: tensor of shape ``(B, T, C)``.

        Returns:
            Tensor of shape ``(B, T, C)``.
        """
        B, T, C = idx.shape
        heads = self.num_heads
        head_size = C // heads

        q, k, v = self.qkv(idx).split(C, dim=2)
        q = q.view(B, T, heads, head_size).transpose(1, 2)  # B, nh, T, hs
        k = k.view(B, T, heads, head_size).transpose(1, 2)  # B, nh, T, hs
        v = v.view(B, T, heads, head_size).transpose(1, 2)  # B, nh, T, hs

        # is_causal=True masks every position after the query position.
        attention = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # Merge the heads back: (B, nh, T, hs) -> (B, T, C).
        out = attention.transpose(2, 1).contiguous().view(B, T, C)
        return self.proj(out)
        


class FeedForward(nn.Module):
    """Position-wise MLP: widen to ``4 * emb_neur``, apply GELU, project back."""

    def __init__(self):
        super().__init__()
        hidden_width = 4 * emb_neur
        self.upl = nn.Linear(emb_neur, hidden_width)
        self.gelu = nn.GELU()
        self.dwnl = nn.Linear(hidden_width, emb_neur)
        # Marker read by GPT._init_weights: this layer's output is added to
        # the residual stream.
        self.dwnl.COMES_TO_RESIDUAL = 1

    def forward(self, idx):
        """Map a ``(..., emb_neur)`` tensor to a tensor of the same shape."""
        return self.dwnl(self.gelu(self.upl(idx)))
        # return self.net(idx)


class Block(nn.Module):
    """One pre-norm transformer layer: attention and MLP, each with a residual add.

    Args:
        num_heads: forwarded to ``SelfAttention``.
    """

    def __init__(self, num_heads):
        super().__init__()
        self.attentions = SelfAttention(num_heads)
        self.ffn = FeedForward()
        self.ln1 = nn.LayerNorm(emb_neur)
        self.ln2 = nn.LayerNorm(emb_neur)

    def forward(self, idx):
        """Return ``idx`` after the attention and feed-forward residual updates."""
        # LayerNorm is applied to the sub-layer input only; the residual path
        # carries the un-normalised activations.
        attended = self.attentions(self.ln1(idx))
        after_attention = idx + attended
        transformed = self.ffn(self.ln2(after_attention))
        return after_attention + transformed

        
class GPT(nn.Module):
    """Decoder-only transformer language model.

    The architecture is fixed by the notebook-level constants ``vocab_size``,
    ``emb_neur``, ``context_len``, ``num_blocks`` and ``num_heads``. The token
    embedding matrix and the output projection share one weight tensor.
    """

    def __init__(self):
        super().__init__()
        self.tokens_embedding = nn.Embedding(vocab_size, emb_neur)
        self.position_embedding = nn.Embedding(context_len, emb_neur)
        self.blocks = nn.Sequential( *[Block(num_heads) for _ in range(num_blocks)])
        self.ln = nn.LayerNorm(emb_neur)
        self.ll_head = nn.Linear(emb_neur, vocab_size)

        # Weight tying: input embedding and output projection use the same tensor.
        self.tokens_embedding.weight = self.ll_head.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, std^2) and zero the biases.

        ``std`` is ``1/sqrt(emb_neur)``; Linear layers tagged with
        ``COMES_TO_RESIDUAL`` get an extra ``1/sqrt(2*num_blocks)`` factor.
        """
        std = (1.0 / math.sqrt(emb_neur))
        if isinstance(module, nn.Linear):
            if hasattr(module, "COMES_TO_RESIDUAL"):
                std *= (1.0)/(math.sqrt(2*num_blocks))
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)

    # I have taken this function [configure_optimizers] from Karpathy's nanoGPT
    # https://github.com/karpathy/nanoGPT
    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        """Build an AdamW optimizer with weight decay applied only to >=2-D tensors.

        Args:
            weight_decay: decay coefficient for the decayed parameter group.
            learning_rate: initial learning rate for both groups.
            betas: ``(beta1, beta2)`` tuple passed to AdamW.
            device_type: device string; anything starting with ``"cuda"``
                (including ``"cuda:0"``) enables the fused kernel if available.

        Returns:
            The configured ``torch.optim.AdamW`` instance.
        """
        # start with all of the candidate parameters that require grad
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for p in param_dict.values() if p.dim() >= 2]
        nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available.
        # Match by prefix so that per-rank strings such as "cuda:0" also qualify.
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and str(device_type).startswith('cuda')
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids with shape ``(B, T)``; ``T`` must
                not exceed the size of the position-embedding table.
            targets: optional integer tensor of shape ``(B, T)``.

        Returns:
            ``(logits, loss)``. Without targets ``logits`` has shape
            ``(B, T, vocab)`` and ``loss`` is ``None``; with targets ``logits``
            is flattened to ``(B*T, vocab)`` and ``loss`` is a scalar tensor.
        """
        B, T = idx.shape
        max_positions = self.position_embedding.num_embeddings
        assert T <= max_positions, f"sequence length {T} exceeds context length {max_positions}"

        embedded_tokens = self.tokens_embedding(idx) # B, T, emb_neur
        # Build the positions on the input's own device rather than a global
        # one, so the model works wherever its inputs live.
        positions = torch.arange(T, device=idx.device)
        embedded_position = self.position_embedding(positions) # T, emb_neur

        idx = embedded_tokens + embedded_position # B, T, emb_neur
        idx = self.blocks(idx)
        idx = self.ln(idx)
        logits = self.ll_head(idx)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_tokens):
        """Sample ``max_tokens`` additional tokens after the prompt ``idx``.

        Args:
            idx: integer tensor of shape ``(B, T0)`` holding the prompt.
            max_tokens: number of tokens to append.

        Returns:
            Tensor of shape ``(B, T0 + max_tokens)``.
        """
        max_positions = self.position_embedding.num_embeddings
        for _ in range(max_tokens):
            # Only the most recent tokens that fit the position table are fed
            # to the model; the full sequence is still returned.
            logits, _ = self.forward(idx[:, -max_positions:])
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_token), dim=1)
        return idx

In [24]:
torch.set_float32_matmul_precision('high')

m = GPT()
m = m.to(device)
# m = torch.compile(m)  # optional speed-up, left disabled here
# Under DDP, wrap the model so gradients are averaged across all GPUs.
if ddp:
    m = DDP(m, device_ids=[ddp_local_rank])
# Unwrapped module: used for state_dict() and for building the optimizer.
raw_m = m.module if ddp else m

# print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# Both loaders and the optimizer are required by the training cell below.
data_loader = DataLoader(mini_batches, time_stamps, cur_process=ddp_rank, num_processes=ddp_world_size, data_dir=data_dir, split="train")
val_loader = DataLoader(mini_batches, time_stamps, cur_process=ddp_rank, num_processes=ddp_world_size, data_dir=data_dir, split="val")
# I have taken this function [configure_optimizers] from Karpathy's nanoGPT.
# Pass the bare device type ("cuda"/"cpu"): under DDP `device` is "cuda:<rank>".
# NOTE: the name `optmizer` (sic) is what the training cell refers to.
optmizer = raw_m.configure_optimizers(weight_decay, max_lr, (beta1, beta2), 'cuda' if device.startswith('cuda') else 'cpu')

DeferredCudaCallError: CUDA call failed lazily at initialization with error: module 'torch' has no attribute 'version'

CUDA call was originally invoked at:

['  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\runpy.py", line 196, in _run_module_as_main\n    return _run_code(code, main_globals, None,\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\runpy.py", line 86, in _run_code\n    exec(code, run_globals)\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\ipykernel_launcher.py", line 18, in <module>\n    app.launch_new_instance()\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\traitlets\\config\\application.py", line 1075, in launch_instance\n    app.start()\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\ipykernel\\kernelapp.py", line 739, in start\n    self.io_loop.start()\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\tornado\\platform\\asyncio.py", line 195, in start\n    self.asyncio_loop.run_forever()\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\asyncio\\base_events.py", line 603, in run_forever\n    self._run_once()\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\asyncio\\base_events.py", line 1909, in _run_once\n    handle._run()\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\asyncio\\events.py", line 80, in _run\n    self._context.run(self._callback, *self._args)\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\ipykernel\\kernelbase.py", line 545, in dispatch_queue\n    await self.process_one()\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\ipykernel\\kernelbase.py", line 534, in process_one\n    await dispatch(*args)\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\ipykernel\\kernelbase.py", line 437, in 
dispatch_shell\n    await result\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\ipykernel\\ipkernel.py", line 362, in execute_request\n    await super().execute_request(stream, ident, parent)\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\ipykernel\\kernelbase.py", line 778, in execute_request\n    reply_content = await reply_content\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\ipykernel\\ipkernel.py", line 449, in do_execute\n    res = shell.run_cell(\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\ipykernel\\zmqshell.py", line 549, in run_cell\n    return super().run_cell(*args, **kwargs)\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\IPython\\core\\interactiveshell.py", line 3075, in run_cell\n    result = self._run_cell(\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\IPython\\core\\interactiveshell.py", line 3130, in _run_cell\n    result = runner(coro)\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\IPython\\core\\async_helpers.py", line 128, in _pseudo_sync_runner\n    coro.send(None)\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\IPython\\core\\interactiveshell.py", line 3334, in run_cell_async\n    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\IPython\\core\\interactiveshell.py", line 3517, in run_ast_nodes\n    if await self.run_code(code, result, async_=asy):\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\IPython\\core\\interactiveshell.py", line 3577, in run_code\n    exec(code_obj, 
self.user_global_ns, self.user_ns)\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Temp\\ipykernel_24604\\2621092005.py", line 1, in <module>\n    import torch\n', '  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load\n', '  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked\n', '  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked\n', '  File "<frozen importlib._bootstrap_external>", line 883, in exec_module\n', '  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\torch\\__init__.py", line 1146, in <module>\n    _C._initExtension(manager_path())\n', '  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load\n', '  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked\n', '  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked\n', '  File "<frozen importlib._bootstrap_external>", line 883, in exec_module\n', '  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\torch\\cuda\\__init__.py", line 197, in <module>\n    _lazy_call(_check_capability)\n', '  File "C:\\Users\\Bogdan\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\torch\\cuda\\__init__.py", line 195, in _lazy_call\n    _queued_calls.append((callable, traceback.format_stack()))\n']

In [None]:
# torch.autocast expects a bare device type ("cuda"/"cpu"), not "cuda:<rank>".
autocast_device = 'cuda' if str(device).startswith('cuda') else 'cpu'

# The log files and checkpoints are written by the master process only.
if master_process:
    os.makedirs(log_dir, exist_ok=True)

last_epoch = epochs - 1
for epoch in range(epochs):
    t0 = time.time()
    is_last_epoch = epoch == last_epoch

    # ---- periodic validation (+ optional checkpoint) ------------------------
    if epoch % 100 == 0 or is_last_epoch:
        m.eval()
        with torch.no_grad():
            val_loss_accum = 0.0
            val_loss_steps = 20
            for _ in range(val_loss_steps):
                x, y = val_loader.next_batch()
                x, y = x.to(device), y.to(device)
                with torch.autocast(device_type=autocast_device, dtype=torch.bfloat16):
                    logits, loss = m(x, y)
                # Dividing each term makes the accumulated value a mean.
                loss = loss / val_loss_steps
                val_loss_accum += loss.detach()
        if ddp:
            dist.all_reduce(val_loss_accum, op=dist.ReduceOp.AVG)
        if master_process:
            print(f"Validation loss: {val_loss_accum.item()}")
            with open(val_log_file, "a") as f:
                f.write(f"epoch:{epoch} val_loss:{val_loss_accum.item():.5f}\n")
            # Checkpoint every `checkpoints_frequency` steps and at the very
            # end. Compare against the last index explicitly: testing the int
            # `last_epoch` for truthiness would save at every validation.
            if epoch > 0 and (epoch % checkpoints_frequency == 0 or is_last_epoch):
                checkpoint_path = os.path.join(log_dir, f"model_{epoch:05d}.pt")
                checkpoint = {
                    'model': raw_m.state_dict(),
                    'optimizer':optmizer.state_dict(),
                    'epoch': epoch,
                    'val_loss': val_loss_accum.item()
                }

                torch.save(checkpoint, checkpoint_path)

    # ---- one optimizer step with gradient accumulation ----------------------
    m.train()
    accumulated_loss = 0.0
    optmizer.zero_grad()

    for mini_epoch in range(mini_epochs):
        x, y = data_loader.next_batch()
        x, y = x.to(device), y.to(device)

        with torch.autocast(device_type=autocast_device, dtype=torch.bfloat16):
            logits, loss = m(x, y)
        # Scale so the summed gradients equal the gradient of the mean loss.
        loss /= mini_epochs
        accumulated_loss += loss.detach()

        if ddp:
            # Synchronise gradients across ranks only on the last micro-batch.
            m.require_backward_grad_sync = (mini_epoch == mini_epochs-1)
        loss.backward()
    if ddp:
        dist.all_reduce(accumulated_loss, op=dist.ReduceOp.AVG)

    norm = torch.nn.utils.clip_grad_norm_(m.parameters(), 1.0)
    lr = get_lr(epoch)
    for param_group in optmizer.param_groups:
        param_group['lr'] = lr
    optmizer.step()

    # Wait for queued GPU work so the timing below is meaningful; there is
    # nothing to synchronise when running on CPU.
    if autocast_device == 'cuda':
        torch.cuda.synchronize()
    t1 = time.time()
    dt = t1-t0

    if master_process and epoch%5==0:
        print(f"epoch: {epoch}, loss: {accumulated_loss:.5f}, norm: {norm:.5f}, time: {dt*1000:.2f}ms, tok/s: {data_loader.B*data_loader.T*mini_epochs*ddp_world_size/dt:.2f}")
        with open(log_file, "a") as f:
            f.write(f"epoch:{epoch} loss:{accumulated_loss.item():.5f}\n")
if ddp:
    destroy_process_group()

In [None]:
# Encode the prompt as a (1, T) batch on the model's device, sample 50 more
# tokens and decode the single resulting sequence back to text.
enc.decode(
    m.generate(torch.tensor([enc.encode("Hello")], device=device), 50)[0].tolist()
)

In [None]:
# tokens/sec:22406.09
# tokens/sec:45590.02 torch.set_float32_matmul_precision('high')
# tokens/sec:47236.09  with torch.autocast(device_type=device, dtype=torch.bfloat16):
# tokens/sec:63155.71 torch.compile(m)
# tokens/sec:67969.10 flash
# Nice number

# epoch: 49, loss: 6.08617, norm: 0.28814, time: 4674.63ms, tok/s: 112156.04

In [12]:
checkpoint_path = './model_19072.pt'  # Adjust path as needed
m = GPT()
# device = "cpu"
m = m.to(device)
# m = torch.compile(m)
# Under DDP, wrap the model exactly as during training.
if ddp:
    m = DDP(m, device_ids=[ddp_local_rank])
raww_m = m.module if ddp else m

# Load the checkpoint once, onto the CPU; load_state_dict copies the tensors
# into the model's own parameters afterwards.
# NOTE(review): torch.load unpickles the file - only open checkpoints you trust.
checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))

# Keys may carry an "_orig_mod." prefix; strip it so they match the
# parameter names of the plain model built above.
compiled_prefix = "_orig_mod."
state_dict = checkpoint['model']
new_state_dict = {
    (k[len(compiled_prefix):] if k.startswith(compiled_prefix) else k): v
    for k, v in state_dict.items()
}

# strict=False tolerates key mismatches; the returned (missing_keys,
# unexpected_keys) report is the cell output - check that both are empty.
raww_m.load_state_dict(new_state_dict, strict=False)


AttributeError: module 'torch' has no attribute '_utils'

In [26]:
# Encode the prompt as a (1, T) batch on the model's device, sample 10 more
# tokens from the restored model and decode the result back to text.
enc.decode(
    raww_m.generate(torch.tensor([enc.encode("Nowadays")], device=device), 10)[0].tolist()
)

'Nowadays, the Board of Directors, the Union of Municip'