In [1]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
import inspect
import time
import numpy as np
import os
import math
import tiktoken

In [3]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention using one fused query/key/value projection."""

    def __init__(self, config):
        """Create the projections; reads `config.n_embd` and `config.n_head`."""
        super().__init__()
        embed_dim = config.n_embd
        # One linear layer produces query, key and value stacked along the last axis.
        self.c_attn = nn.Linear(embed_dim, 3 * embed_dim)
        # Projection applied after the heads have been merged back together.
        self.c_proj = nn.Linear(embed_dim, embed_dim)
        # Marker attribute consulted by weight-initialisation code outside this class.
        self.c_proj.NANOGPT_SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embd = embed_dim

    def forward(self, x):
        """Apply causal self-attention to `x` of shape (B, T, C); the result has the same shape."""
        batch, seq_len, channels = x.size()
        head_dim = channels // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        query, key, value = self.c_attn(x).split(self.n_embd, dim=-1)
        # is_causal=True: a position only attends to itself and earlier positions.
        attended = F.scaled_dot_product_attention(
            to_heads(query), to_heads(key), to_heads(value), is_causal=True
        )
        # (B, n_head, T, head_dim) -> (B, T, C)
        merged = attended.transpose(1, 2).contiguous().view(batch, seq_len, channels)
        return self.c_proj(merged)


class MLP(nn.Module):
    """Feed-forward sub-layer: expand to 4x the embedding width, GELU (tanh approximation), project back."""

    def __init__(self, config):
        """Create the two linear layers; reads `config.n_embd`."""
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden, width)
        # Marker attribute consulted by weight-initialisation code outside this class.
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Return c_proj(gelu(c_fc(x)))."""
        return self.c_proj(self.gelu(self.c_fc(x)))


class Block(nn.Module):
    """Transformer block: LayerNorm -> attention and LayerNorm -> MLP, each added back onto its input."""

    def __init__(self, config):
        """Create both LayerNorms plus the attention and MLP sub-layers from `config`."""
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = MultiHeadAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Run the attention residual branch, then the MLP residual branch."""
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out


@dataclass
class GPTConfig:
    """Size hyperparameters for the GPT model.

    Attributes:
        block_size: Number of rows in the position-embedding table, i.e. the
            longest sequence the model accepts.
        vocab_size: Vocabulary size: 256 byte tokens + 50,000 BPE merges +
            1 <|endoftext|> token.
        n_layer: Number of transformer blocks.
        n_head: Number of attention heads.
        n_embd: Embedding dimension.
    """

    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768


class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    `config.n_layer` `Block`s and a final LayerNorm, then projected to
    vocabulary logits by a bias-free linear head. The token-embedding matrix
    and the output head share a single weight tensor.
    """

    def __init__(self, config):
        """Build all sub-modules from `config` and initialise their weights."""
        super(GPT, self).__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),  # token embeddings
            wpe=nn.Embedding(config.block_size, config.n_embd),  # position embeddings
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd)
        ))

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme: the embedding and the output head use the same Parameter
        self.transformer.wte.weight = self.lm_head.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one sub-module (used as the callback for `self.apply`).

        Linear weights are drawn from N(0, 0.02); for layers carrying the
        `NANOGPT_SCALE_INIT` marker the std is multiplied by
        (2 * n_layer) ** -0.5. Linear biases are zeroed. Embedding weights are
        drawn from N(0, 0.02). Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute logits, and the cross-entropy loss when `targets` is given.

        Args:
            idx: Token ids of shape (B, T) with T <= config.block_size.
            targets: Optional token ids of shape (B, T).

        Returns:
            (logits, loss): logits has shape (B, T, vocab_size); loss is None
            when `targets` is None.

        Raises:
            AssertionError: if T exceeds `config.block_size`.
        """
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and position embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # shape (T)
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb  # pos_emb broadcasts over the batch dimension
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layer norm
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        loss = None
        if targets is not None:
            # F.cross_entropy takes raw logits, so no softmax is applied here.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface

        Args:
            model_type: One of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.

        Returns:
            A `GPT` instance whose parameters were copied from the matching
            `transformers.GPT2LMHeadModel` checkpoint.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2': dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
            'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
            'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257  # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024  # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')]  # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]  # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]  # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, device_type, master_process=True):
        """Create an AdamW optimizer with weight decay applied to >=2-D parameters only.

        Args:
            weight_decay: Decay coefficient for parameters with 2 or more dims.
            learning_rate: Initial learning rate for every parameter group.
            device_type: Device type string; the fused kernel is requested only for "cuda".
            master_process: When True, print a summary of the parameter groups.

        Returns:
            A `torch.optim.AdamW` instance with betas=(0.9, 0.95) and eps=1e-8.
        """
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for p in param_dict.values() if p.dim() >= 2]
        nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        if master_process:
            print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
            print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        if master_process:
            print(f"using fused AdamW: {use_fused}")
        # Only pass `fused` when it is actually requested: forwarding the keyword
        # unconditionally raises TypeError on builds whose AdamW does not accept
        # it, which would defeat the availability check above.
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args)
        return optimizer

In [4]:
class DataLoader:
    """Sequential batcher over a flat array of token ids stored in a ``.npy`` file.

    Each batch consumes ``B * T`` consecutive tokens as inputs and the same
    span shifted by one token as targets. When the next batch would run past
    the end of the array, reading restarts from position 0.
    """

    def __init__(self, B, T, data_path='/kaggle/input/fineweb-100m/tokenized_100M.npy'):
        """Load the token array.

        Args:
            B: Number of sequences per batch.
            T: Number of tokens per sequence.
            data_path: Path to the ``.npy`` file of token ids. Defaults to the
                Kaggle dataset location originally hard-coded here.

        Raises:
            ValueError: if the file holds fewer than ``B * T + 1`` tokens, i.e.
                not even one (input, target) batch can be formed.
        """
        self.B = B
        self.T = T
        # Loading Tokens
        tokens = np.load(data_path)
        # Convert to int64 in NumPy first so the result does not depend on PyTorch
        # being able to ingest the file's on-disk dtype directly (e.g. uint16).
        self.tokens = torch.from_numpy(tokens.astype(np.int64))
        if len(self.tokens) < B * T + 1:
            raise ValueError(
                f"need at least B * T + 1 = {B * T + 1} tokens, but {data_path!r} has {len(self.tokens)}"
            )
        self.current_position = 0

    def next_batch(self):
        """Return the next ``(x, y)`` pair, both of shape (B, T), and advance the cursor.

        ``y`` is ``x`` shifted left by one token, so ``y[b, t]`` is the token
        that follows ``x[b, t]`` in the stream.
        """
        B, T = self.B, self.T
        # One extra token is read so the last input position has a target.
        buf = self.tokens[self.current_position:self.current_position + B * T + 1]
        x = (buf[:-1]).view(B, T)
        y = (buf[1:]).view(B, T)

        self.current_position += B * T

        # The next read needs B * T + 1 tokens; rewind if that would overrun the array.
        if self.current_position + B * T >= len(self.tokens):
            self.current_position = 0

        return x, y

In [5]:
# Seed PyTorch's global RNG so weight initialisation is repeatable across Restart & Run All.
torch.manual_seed(1337)

total_batch_size = 524288  # tokens consumed per optimizer step (2**19)
B = 8 # mini batch size
T = 1024 # max sequence length
# The integer division below would otherwise silently shrink the effective batch.
assert total_batch_size % (B * T) == 0, "total_batch_size must be divisible by B * T"
grad_accum_steps = total_batch_size // (B * T)
print(f"total desired batch size : {total_batch_size}")
print(f"=> calculated gradient accumulated steps : {grad_accum_steps}")

train_loader = DataLoader(B, T)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GPT(GPTConfig())
model.to(device)
# torch.compile is left disabled; original note: it needed an A100 or newer GPU (Kaggle GPUs do not work).
# model = torch.compile(model)

torch.set_float32_matmul_precision('high')

optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device.type)

total desired batch size : 524288
=> calculated gradient accumulated steps : 64
num decayed parameter tensors: 50, with 124,318,464 parameters
num non-decayed parameter tensors: 98, with 121,344 parameters
using fused AdamW: True


In [6]:
# Directory to save logs
log_dir = "log"
# exist_ok=True makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "log.txt")

# Directory to save checkpoints
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)
    
# Function to save a checkpoint
def save_checkpoint(step, model, optimizer, loss_acc, checkpoint_dir):
    """Write the training state for `step` to `checkpoint_dir` and print a confirmation.

    The file is named ``gpt2_checkpoint_step_{step}.pt`` and holds a dict with
    the step, the model and optimizer state dicts, and `loss_acc` under 'loss'.
    """
    snapshot = {
        'step': step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss_acc,
    }
    target = os.path.join(checkpoint_dir, f"gpt2_checkpoint_step_{step}.pt")
    torch.save(snapshot, target)
    print(f"Checkpoint saved at step {step}")

In [7]:
# Learning-rate schedule settings
warmup_steps = 50        # steps of linear warmup
max_steps = 200          # last step of the cosine decay (also the training length)
max_lr = 6e-4            # peak learning rate, reached at the end of warmup
min_lr = max_lr * 0.1    # floor held after max_steps


def get_lr(step):
    """Return the learning rate for `step`: linear warmup, cosine decay, then a constant floor."""
    # Warmup: ramps from max_lr / warmup_steps up to max_lr.
    if step < warmup_steps:
        return max_lr * (step + 1) / warmup_steps
    # Beyond the schedule: stay at the floor.
    if step > max_steps:
        return min_lr
    # Cosine decay from max_lr (progress 0) to min_lr (progress 1).
    progress = (step - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1 + math.cos(math.pi * progress))
    return min_lr + cosine_weight * (max_lr - min_lr)

In [8]:
# One optimizer step per iteration; gradients are accumulated over grad_accum_steps micro-batches.
for step in range(max_steps):
    t0 = time.time()
    optimizer.zero_grad()
    loss_acc = 0.0
    for micro_step in range(grad_accum_steps):
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        # Mixed precision is left disabled; original note: needed GPU A100 (Kaggle GPUs do not work).
        # with torch.autocast(device_type=device.type, dtype=torch.float16):
        #     logits, loss = model(x, y)
        logits, loss = model(x, y)
        # Divide so the accumulated gradient is the mean over all micro-batches, not the sum.
        loss = loss / grad_accum_steps
        loss_acc += loss.detach()
        loss.backward()
    # Clip the global gradient norm to 1.0; `norm` is the pre-clipping norm.
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # Get learning rate
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()
    # CUDA kernels run asynchronously: wait for them so the timing below covers the real work.
    if device.type == 'cuda':
        torch.cuda.synchronize()
    t1 = time.time()
    dt = (t1 - t0) * 1000
    tokens_per_seconds = (train_loader.B * train_loader.T * grad_accum_steps) / (t1 - t0)
    loss_value = loss_acc.item()
    print(
        f'Step {step}, Loss: {loss_value:.4f}, Norm : {norm:.4f}, dt : {dt:.2f}ms, tokens/sec : {tokens_per_seconds:.2f}')
    with open(log_file, 'a') as f:
        f.write(f"{step} train {loss_value:.6f}\n")

    if (step + 1) % 25 == 0:
        save_checkpoint(step, model, optimizer, loss_acc, checkpoint_dir)

Step 0, Loss: 10.9881, Norm : 15.223847389221191, dt : 69367.71ms, tokens/sec : 7558.10s
Step 1, Loss: 10.4542, Norm : 9.434684753417969, dt : 67871.64ms, tokens/sec : 7724.70s
Step 2, Loss: 10.0339, Norm : 5.908762454986572, dt : 67925.67ms, tokens/sec : 7718.55s
Step 3, Loss: 9.7504, Norm : 3.7307612895965576, dt : 67913.06ms, tokens/sec : 7719.99s
Step 4, Loss: 9.6447, Norm : 2.669076919555664, dt : 67970.16ms, tokens/sec : 7713.50s
Step 5, Loss: 9.5652, Norm : 2.356815814971924, dt : 67994.51ms, tokens/sec : 7710.74s
Step 6, Loss: 9.4975, Norm : 2.2452168464660645, dt : 67957.24ms, tokens/sec : 7714.97s
Step 7, Loss: 9.4421, Norm : 2.147087335586548, dt : 67952.32ms, tokens/sec : 7715.53s
Step 8, Loss: 9.3399, Norm : 2.106595993041992, dt : 67958.68ms, tokens/sec : 7714.81s
Step 9, Loss: 9.2240, Norm : 2.0408270359039307, dt : 67894.13ms, tokens/sec : 7722.14s
Step 10, Loss: 9.0883, Norm : 2.02877140045166, dt : 67965.41ms, tokens/sec : 7714.04s
Step 11, Loss: 9.0276, Norm : 1.8588

In [9]:
# Define hyperparameters
num_return_sequences = 5
max_length = 30
top_k = 50  # sample only among the k most probable next tokens


# Load the encoder
enc = tiktoken.get_encoding('gpt2')

# Encode the input text
tokens = enc.encode("Hello,")
tokens = torch.tensor(tokens, dtype=torch.long)

# Prepare the initial input: one copy of the prompt per sequence to generate
x = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
x = x.to(device)

# Inference mode for sampling, with a dedicated seeded generator so the samples
# are repeatable and independent of the global RNG state.
model.eval()
sample_rng = torch.Generator(device=device)
sample_rng.manual_seed(42)

# Generate text loop: append one sampled token per sequence until max_length is reached
while x.size(1) < max_length:
    with torch.no_grad():
        # Next-token distribution comes from the logits at the last position
        logits, _ = model(x)
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)

        # Keep the top-k probabilities and their vocabulary indices
        topk_probs, topk_indices = torch.topk(probs, top_k, dim=-1)

        # Sample a position within the top-k, map it back to a token id, and append it
        ix = torch.multinomial(topk_probs, num_samples=1, generator=sample_rng)
        xol = torch.gather(topk_indices, dim=-1, index=ix)
        x = torch.cat([x, xol], dim=1)

# Decode and print generated text
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    text = enc.decode(tokens)
    print(text)

Hello, one of all the long term (a) of these children have been different students in our kids are no children and in mind in this child
Hello, we don-up, in my people have the future of this.
A�s, or I have to make some of what in
Hello, or other time. There is an excellent name, in many important enough-like people who need to be much of the time to all to
Hello, the key-t).
-
- The study? However, or
- If we have to use, is. This makes sense
Hello, so just
If I think about 1, I think I's one’s going on that’m going to a little.
