In [None]:
# Step 1: Clone the NanoGPT repository and install necessary dependencies

# Clone the NanoGPT repository
!git clone https://github.com/karpathy/nanoGPT.git
%cd nanoGPT

# Install required packages
!pip install transformers datasets tiktoken wandb tqdm numpy torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

In [2]:
from datasets import load_dataset

# Load the enwik8 dataset
dataset = load_dataset("LTCB/enwik8", split="train")

# Extract the raw text from the dataset
raw_text = ''.join(dataset['text'])

# Calculate the total number of characters in the dataset
total_characters = len(raw_text)
print(f"Total number of characters in the dataset: {total_characters}")

# Define the number of characters for training
train_character_limit = 90_000_000

# Ensure sufficient data is available for training
if total_characters < train_character_limit:
    raise ValueError(f"Insufficient data for training. Only {total_characters} characters available.")

# Determine the remaining characters for validation and testing
remaining_characters = total_characters - train_character_limit

# Define validation character size (maximum of 5 million)
validation_size = min(5_000_000, remaining_characters)

# Use the rest for testing
test_size = remaining_characters - validation_size

# Split the text into training, validation, and test sets
train_data = raw_text[:train_character_limit]
validation_data = raw_text[train_character_limit:train_character_limit + validation_size]
test_data = raw_text[train_character_limit + validation_size:]

# Save each split to a separate file
with open('train.txt', 'w') as train_file:
    train_file.write(train_data)
with open('valid.txt', 'w') as valid_file:
    valid_file.write(validation_data)
with open('test.txt', 'w') as test_file:
    test_file.write(test_data)

print("Data successfully split and saved as train.txt, valid.txt, and test.txt")

# Verify the sizes of the splits
print(f"Training characters: {len(train_data)}")
print(f"Validation characters: {len(validation_data)}")
print(f"Test characters: {len(test_data)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


enwik8.py:   0%|          | 0.00/2.94k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.28k [00:00<?, ?B/s]

The repository for LTCB/enwik8 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/LTCB/enwik8.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/36.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1128024 [00:00<?, ? examples/s]

Total number of characters in the dataset: 97492430
Data successfully split and saved as train.txt, valid.txt, and test.txt
Training characters: 90000000
Validation characters: 5000000
Test characters: 2492430


In [3]:
# Step 3: Prepare the Data for NanoGPT

# Create a directory for the dataset
!mkdir -p data/enwik8

# Move the data files into the dataset directory
!mv train.txt valid.txt test.txt data/enwik8/

In [4]:
# Create a new script called prepare_enwik8.py

%%writefile data/prepare_enwik8.py
import os
import pickle
import numpy as np

# Define the data directory where the files will be stored
data_dir = 'data/enwik8'

# Safely read the text files, ensuring they exist
def read_file_safe(filename):
    filepath = os.path.join(data_dir, filename)
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File {filename} not found in {data_dir}")
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()

# Load the dataset splits
train_data = read_file_safe('train.txt')
val_data = read_file_safe('valid.txt')
test_data = read_file_safe('test.txt')

# Extract unique characters from the training data
chars = sorted(set(train_data))
vocab_size = len(chars)
print(f"Vocabulary size (unique characters): {vocab_size}")

# Create character-to-integer (stoi) and integer-to-character (itos) mappings
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

# Save the mappings as a metadata file for future use
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
meta_filepath = os.path.join(data_dir, 'meta.pkl')
with open(meta_filepath, 'wb') as f:
    pickle.dump(meta, f)
print(f"Saved metadata to {meta_filepath}")

# Function to encode data into integer token IDs
def encode_data(text):
    return [stoi[ch] for ch in text if ch in stoi]

# Encode the training, validation, and test data
train_ids = np.array(encode_data(train_data), dtype=np.uint16)
val_ids = np.array(encode_data(val_data), dtype=np.uint16)
test_ids = np.array(encode_data(test_data), dtype=np.uint16)

# Save the encoded data to binary files for efficient loading during training
train_bin_filepath = os.path.join(data_dir, 'train.bin')
val_bin_filepath = os.path.join(data_dir, 'val.bin')
test_bin_filepath = os.path.join(data_dir, 'test.bin')

train_ids.tofile(train_bin_filepath)
val_ids.tofile(val_bin_filepath)
test_ids.tofile(test_bin_filepath)

print(f"Data preparation complete. Files saved at: \n- {train_bin_filepath}\n- {val_bin_filepath}\n- {test_bin_filepath}")

Writing data/prepare_enwik8.py


In [5]:
!python data/prepare_enwik8.py

Vocabulary size (unique characters): 5486
Saved metadata to data/enwik8/meta.pkl
Data preparation complete. Files saved at: 
- data/enwik8/train.bin
- data/enwik8/val.bin
- data/enwik8/test.bin


In [6]:
# Create a new configuration file for the modified model

%%writefile config/enwik8_char_rope.py
import math

# Configuration for the modified model
out_dir = 'out-enwik8-char-rope'  # Output directory for model checkpoints and logs
eval_interval = 500
eval_iters = 200
log_interval = 100

always_save_checkpoint = True  # Ensure we save checkpoints
wandb_log = False
wandb_project = 'enwik8-char'
wandb_run_name = 'gpt2-enwik8-char-rope'

dataset = 'enwik8'
gradient_accumulation_steps = 1
batch_size = 64  # Adjust based on your GPU memory
block_size = 256  # Context length

# Model parameters
n_layer = 8
n_head = 8
n_embd = 512
dropout = 0.1  # Added some dropout for regularization
bias = False  # No bias in LayerNorm and Linear layers

# Optimization parameters
learning_rate = 1e-3
max_iters = 5000  # Number of iterations for training
lr_decay_iters = 5000
min_lr = 1e-4
beta1 = 0.9
beta2 = 0.99
weight_decay = 0.1
grad_clip = 1.0
decay_lr = True
warmup_iters = 100
init_from = 'scratch'  # Initialize model from scratch

# Use the modified model
model_type = 'rope'

# System parameters
device = 'cuda'  # Use CUDA for training
dtype = 'float16'  # Use float16 for faster training
compile = False  # Disable compilation for now

Writing config/enwik8_char_rope.py


In [7]:
# Write the modified model with RoPE

%%writefile model_rope.py
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
from model import GPTConfig
import inspect

def apply_rotary_pos_emb(q, cos, sin):
    # Apply rotary position embedding to query and key
    q_cos = q * cos
    q_sin = q * sin
    q_rotated = q_cos + rotate_half(q_sin)
    return q_rotated


def rotate_half(x):
    # Helper function to apply rotation
    x1 = x[..., :x.shape[-1]//2]
    x2 = x[..., x.shape[-1]//2:]
    return torch.cat((-x2, x1), dim=-1)

class GPTWithRoPE(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Initialize weights
        self.apply(self._init_weights)
        # Apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # Report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wte.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"

        # Token embeddings
        tok_emb = self.transformer.wte(idx)  # shape (b, t, n_embd)

        x = self.transformer.drop(tok_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :])  # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        # Start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # Filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # Create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                     .view(1, 1, config.block_size, config.block_size))

        # Precompute rotary embeddings
        self.rotary_emb = RotaryEmbedding(dim=config.n_embd // config.n_head)

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x).view(B, T, 3, self.n_head, C // self.n_head).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # Each is (B, n_head, T, head_dim)

        # Apply rotary embeddings to q and k
        q, k = self.rotary_emb(q, k)  # Correcting this line

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y


class RotaryEmbedding(nn.Module):
    def __init__(self, dim):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, q, k):
        t = q.size(-2)
        freqs = torch.einsum("i,j->ij", torch.arange(t, device=q.device).float(), self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        cos = emb.cos()[None, None, :, :]
        sin = emb.sin()[None, None, :, :]
        q = apply_rotary_pos_emb(q, cos, sin)
        k = apply_rotary_pos_emb(k, cos, sin)  # Fix the call for 'k'
        return q, k


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = F.gelu(x)  # Use standard GELU
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

Writing model_rope.py


In [8]:
%%writefile train.py
import argparse
import os
import time
import math
import pickle
from contextlib import nullcontext

import numpy as np
import torch
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
from tqdm import tqdm

from model import GPTConfig
from model_baseline import BaselineGPT
from model_rope import GPTWithRoPE

def get_serializable_config(config):
    return {k: v for k, v in config.items() if isinstance(v, (int, float, str, bool, type(None))) and not k.startswith('__')}

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, required=True, help='Configuration file')
    args = parser.parse_args()

    config_file = args.config
    config = {}
    with open(config_file, 'r') as f:
        exec(f.read(), {}, config)

    config = {k: v for k, v in config.items() if not k.startswith('__')}

    if 'out_dir' not in config:
        print("Error: 'out_dir' not specified in the configuration file.")
        return

    if int(os.environ.get('RANK', -1)) == -1:
        os.makedirs(config['out_dir'], exist_ok=True)
        print(f"Output directory: {config['out_dir']}")

    ddp = int(os.environ.get('RANK', -1)) != -1
    if ddp:
        init_process_group(backend='nccl')
        ddp_rank = int(os.environ['RANK'])
        ddp_local_rank = int(os.environ['LOCAL_RANK'])
        ddp_world_size = int(os.environ['WORLD_SIZE'])
        device = f'cuda:{ddp_local_rank}'
        torch.cuda.set_device(device)
        master_process = ddp_rank == 0
        config['gradient_accumulation_steps'] //= ddp_world_size
    else:
        master_process = True
        ddp_world_size = 1
        device = config['device']

    tokens_per_iter = (config['gradient_accumulation_steps'] * ddp_world_size *
                       config['batch_size'] * config['block_size'])
    print(f"Tokens per iteration will be: {tokens_per_iter:,}")

    torch.manual_seed(1337 + int(time.time()))
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    device_type = 'cuda' if 'cuda' in device else 'cpu'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[config['dtype']]
    ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

    data_dir = os.path.join('data', config['dataset'])

    def get_batch(split):
        data_path = os.path.join(data_dir, f'{split}.bin')
        data = np.memmap(data_path, dtype=np.uint16, mode='r')
        ix = torch.randint(len(data) - config['block_size'], (config['batch_size'],))
        x = torch.stack([torch.from_numpy((data[i:i+config['block_size']]).astype(np.int64)) for i in ix])
        y = torch.stack([torch.from_numpy((data[i+1:i+1+config['block_size']]).astype(np.int64)) for i in ix])
        if device_type == 'cuda':
            x = x.pin_memory().to(device, non_blocking=True)
            y = y.pin_memory().to(device, non_blocking=True)
        else:
            x, y = x.to(device), y.to(device)
        return x, y

    meta_path = os.path.join(data_dir, 'meta.pkl')
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    vocab_size = meta['vocab_size']
    config['vocab_size'] = vocab_size

    gpt_config_keys = ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size', 'dropout']
    gpt_config = {k: v for k, v in config.items() if k in gpt_config_keys}
    gptconf = GPTConfig(**gpt_config)

    if config.get('model_type') == 'rope':
        model = GPTWithRoPE(gptconf)
        print("Using GPTWithRoPE model.")
    else:
        model = BaselineGPT(gptconf)
        print("Using BaselineGPT model.")

    model.to(device)

    # Initialize optimizer outside of the model
    decay_params = [p for p in model.parameters() if p.dim() >= 2]
    no_decay_params = [p for p in model.parameters() if p.dim() < 2]

    optimizer = optim.AdamW([
        {'params': decay_params, 'weight_decay': config['weight_decay']},
        {'params': no_decay_params, 'weight_decay': 0.0}
    ], lr=config['learning_rate'], betas=(config['beta1'], config['beta2']))

    scaler = torch.cuda.amp.GradScaler(enabled=(config['dtype'] == 'float16'))

    iter_num = 0
    best_val_loss = 1e9

    if config.get('init_from', 'scratch') == 'resume':
        print(f"Resuming training from {config['out_dir']}")
        ckpt_path = os.path.join(config['out_dir'], 'ckpt.pt')
        checkpoint = torch.load(ckpt_path, map_location=device)
        model.load_state_dict(checkpoint['model'], strict=False)
        optimizer.load_state_dict(checkpoint['optimizer'])
        iter_num = checkpoint['iter_num']
        best_val_loss = checkpoint['best_val_loss']
        print(f"Resumed from iteration {iter_num}, best val loss {best_val_loss}")

    total_params = sum(p.numel() for p in model.parameters())
    print(f"Number of parameters: {total_params/1e6:.2f}M")

    @torch.no_grad()
    def estimate_loss():
        out = {}
        model.eval()
        for split in ['train', 'val']:
            losses = torch.zeros(config['eval_iters'])
            for k in range(config['eval_iters']):
                X, Y = get_batch(split)
                with ctx:
                    logits, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
        model.train()
        return out

    def get_lr(it):
        if it < config['warmup_iters']:
            return config['learning_rate'] * it / config['warmup_iters']
        if it > config['lr_decay_iters']:
            return config['min_lr']
        decay_ratio = (it - config['warmup_iters']) / (config['lr_decay_iters'] - config['warmup_iters'])
        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
        return config['min_lr'] + coeff * (config['learning_rate'] - config['min_lr'])

    X, Y = get_batch('train')
    running_mfu = -1.0
    t0 = time.time()

    local_iter_num = 0
    raw_model = model.module if ddp else model

    with tqdm(total=config['max_iters'], desc="Training Progress") as pbar:
        while iter_num < config['max_iters']:
            lr = config['learning_rate'] if not config['decay_lr'] else get_lr(iter_num)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            for micro_step in range(config['gradient_accumulation_steps']):
                if ddp:
                    model.require_backward_grad_sync = (micro_step == config['gradient_accumulation_steps'] - 1)
                with ctx:
                    logits, loss = model(X, Y)
                    loss = loss / config['gradient_accumulation_steps']
                X, Y = get_batch('train')
                scaler.scale(loss).backward()

            if config['grad_clip'] != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), config['grad_clip'])
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

            t1 = time.time()
            dt = t1 - t0
            t0 = t1

            if iter_num % config['eval_interval'] == 0 and master_process:
                losses = estimate_loss()
                print(f"\nStep {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
                if losses['val'] < best_val_loss or config['always_save_checkpoint']:
                    best_val_loss = losses['val']
                    checkpoint = {
                        'model': raw_model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'iter_num': iter_num,
                        'best_val_loss': best_val_loss,
                        'config': get_serializable_config(config),
                    }
                    checkpoint_path = os.path.join(config['out_dir'], 'ckpt.pt')
                    torch.save(checkpoint, checkpoint_path)
                    print(f"Saved checkpoint to {checkpoint_path}")

            iter_num += 1
            local_iter_num += 1
            pbar.update(1)

            if iter_num % config['log_interval'] == 0 and master_process:
              lossf = loss.item() * config['gradient_accumulation_steps']
              print(f"Iter {iter_num}: loss {lossf:.4f}")


    if ddp:
        destroy_process_group()

if __name__ == '__main__':
    main()

Overwriting train.py


In [9]:
%%writefile model_baseline.py

import torch
import torch.nn as nn
from torch.nn import functional as F
import math

# Define GPTBlock with MultiheadAttention
class GPTBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = nn.MultiheadAttention(config.n_embd, config.n_head, dropout=config.dropout)
        self.drop = nn.Dropout(config.dropout)
        self.ln_2 = nn.LayerNorm(config.n_embd)

        # Feed-forward layers
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        # Apply layer normalization
        x_ln = self.ln_1(x)

        # Self-attention uses x_ln as query, key, and value
        attn_output, _ = self.attn(x_ln, x_ln, x_ln)
        x = x + self.drop(attn_output)

        # Feedforward block with residual connection
        x = x + self.drop(self.mlp(self.ln_2(x)))

        return x

# Define the main BaselineGPT model
class BaselineGPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        self.config = config

        # Transformer components
        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embd),    # Token embedding
            'wpe': nn.Embedding(config.block_size, config.n_embd),    # Positional embedding
            'drop': nn.Dropout(config.dropout),                       # Dropout
            'h': nn.ModuleList([GPTBlock(config) for _ in range(config.n_layer)]),  # Stack of GPT blocks
            'ln_f': nn.LayerNorm(config.n_embd),                      # Final layer normalization
        })

        # Language modeling head
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self._init_weights()

    def _init_weights(self):
        # Initialize the weights for all components
        for module in self.modules():
            if isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()

        # Generate position indices and compute token and positional embeddings
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)  # (1, t)
        tok_emb = self.transformer['wte'](idx)  # (b, t, n_embd)
        pos_emb = self.transformer['wpe'](pos)  # (1, t, n_embd)

        # Combine token and positional embeddings, then apply dropout
        x = self.transformer['drop'](tok_emb + pos_emb)

        # Pass through the stack of GPT blocks
        for block in self.transformer['h']:
            x = block(x)

        # Final layer normalization
        x = self.transformer['ln_f'](x)

        # Compute logits for language modeling
        logits = self.lm_head(x)

        # If targets are provided, compute loss
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
            return logits, loss

        # Return logits if no targets are provided
        return logits, None


Writing model_baseline.py


In [14]:
# Create a configuration file for the baseline model

%%writefile config/enwik8_char_baseline.py
out_dir = 'out-enwik8-char'
eval_interval = 500
eval_iters = 200
log_interval = 100

always_save_checkpoint = True
wandb_log = False
wandb_project = 'enwik8-char'
wandb_run_name = 'gpt2-enwik8-char-baseline'

dataset = 'enwik8'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256

n_layer = 12
n_head = 8
n_embd = 384
dropout = 0.1
bias = False

learning_rate = 1e-4
max_iters = 5000
lr_decay_iters = 5000
min_lr = 1e-5
beta1 = 0.9
beta2 = 0.95
weight_decay = 0.1
grad_clip = 1.0
decay_lr = True
warmup_iters = 100
init_from = 'scratch'

device = 'cuda'
dtype = 'float16'
compile = False

Overwriting config/enwik8_char_baseline.py


In [15]:
# Train the baseline model
!python train.py --config config/enwik8_char_baseline.py

Output directory: out-enwik8-char
Tokens per iteration will be: 16,384
Using BaselineGPT model.
  scaler = torch.cuda.amp.GradScaler(enabled=(config['dtype'] == 'float16'))
Number of parameters: 25.61M
Training Progress:   0% 0/5000 [00:00<?, ?it/s]
Step 0: train loss 8.7378, val loss 8.7341
Saved checkpoint to out-enwik8-char/ckpt.pt
Training Progress:   2% 100/5000 [01:18<26:29,  3.08it/s]Iter 100: loss 4.2830
Training Progress:   4% 200/5000 [01:50<25:31,  3.13it/s]Iter 200: loss 2.9377
Training Progress:   6% 300/5000 [02:22<25:30,  3.07it/s]Iter 300: loss 2.8258
Training Progress:   8% 400/5000 [02:55<24:49,  3.09it/s]Iter 400: loss 2.8007
Training Progress:  10% 500/5000 [03:27<24:19,  3.08it/s]Iter 500: loss 2.7109

Step 500: train loss 2.7367, val loss 2.7679
Saved checkpoint to out-enwik8-char/ckpt.pt
Training Progress:  12% 600/5000 [04:46<24:12,  3.03it/s]Iter 600: loss 2.6838
Training Progress:  14% 700/5000 [05:19<22:59,  3.12it/s]Iter 700: loss 2.7701
Training Progress:  

In [12]:
# Train the modified model with RoPE

!python train.py --config config/enwik8_char_rope.py

Output directory: out-enwik8-char-rope
Tokens per iteration will be: 16,384
number of parameters: 27.99M
Using GPTWithRoPE model.
  scaler = torch.cuda.amp.GradScaler(enabled=(config['dtype'] == 'float16'))
Number of parameters: 30.80M
Training Progress:   0% 0/5000 [00:00<?, ?it/s]
Step 0: train loss 8.7755, val loss 8.7751
Saved checkpoint to out-enwik8-char-rope/ckpt.pt
Training Progress:   2% 100/5000 [01:42<32:04,  2.55it/s]Iter 100: loss 2.2703
Training Progress:   4% 200/5000 [02:21<30:51,  2.59it/s]Iter 200: loss 1.9288
Training Progress:   6% 300/5000 [03:00<30:29,  2.57it/s]Iter 300: loss 1.8749
Training Progress:   8% 400/5000 [03:39<29:50,  2.57it/s]Iter 400: loss 1.6522
Training Progress:  10% 500/5000 [04:18<29:10,  2.57it/s]Iter 500: loss 1.6242

Step 500: train loss 1.5567, val loss 1.5576
Saved checkpoint to out-enwik8-char-rope/ckpt.pt
Training Progress:  12% 600/5000 [06:04<29:11,  2.51it/s]Iter 600: loss 1.5002
Training Progress:  14% 700/5000 [06:43<27:39,  2.59it/

In [12]:
# Create the evaluation script

%%writefile evaluate.py
import torch
import numpy as np
import argparse
import pickle
import math
from model import GPTConfig, GPT
from model_rope import GPTWithRoPE

def evaluate(model, data_loader, device):
    model.eval()
    losses = []
    with torch.no_grad():
        for x, y in data_loader:
            x = x.to(device)
            y = y.to(device)
            with torch.amp.autocast(device_type=device):
                logits, loss = model(x, y)
            losses.append(loss.item())
    return np.mean(losses)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_type', type=str, choices=['gpt', 'rope'], default='gpt', help='Model type: gpt or rope')
    parser.add_argument('--dataset', type=str, default='enwik8', help='Dataset name')
    parser.add_argument('--checkpoint', type=str, required=True, help='Checkpoint file')
    args = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load the checkpoint
    checkpoint = torch.load(args.checkpoint, map_location=device)

    # Load the model configuration from the checkpoint
    ckpt_config = checkpoint['config']

    # Update vocab_size from the dataset's meta.pkl
    with open(f"data/{args.dataset}/meta.pkl", 'rb') as f:
        meta = pickle.load(f)
    vocab_size = meta['vocab_size']
    ckpt_config['vocab_size'] = vocab_size

    # Filter ckpt_config to only include keys that GPTConfig accepts
    valid_config_keys = ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size', 'dropout']
    model_config_kwargs = {k: ckpt_config[k] for k in valid_config_keys if k in ckpt_config}

    # Model configuration
    model_config = GPTConfig(**model_config_kwargs)

    # Instantiate the model
    if args.model_type == 'rope' or ckpt_config.get('model_type') == 'rope':
        model = GPTWithRoPE(model_config)
        print("Using GPTWithRoPE model.")
    else:
        model = GPT(model_config)
        print("Using BaselineGPT model.")

    # Load the model state
    model.load_state_dict(checkpoint['model'], strict=False)
    model.to(device)

    # Prepare data loader
    block_size = ckpt_config['block_size']
    batch_size = ckpt_config.get('batch_size', 64)  # Default to 64 if not specified

    # Load validation data
    val_data = np.memmap(f'data/{args.dataset}/val.bin', dtype=np.uint16, mode='r')
    val_data = torch.from_numpy(val_data.astype(np.int64))

    # Create sequences of block_size
    num_tokens = len(val_data) - 1
    x_tokens = val_data[:num_tokens]
    y_tokens = val_data[1:num_tokens+1]

    # Ensure that the number of tokens is a multiple of block_size
    num_batches = num_tokens // block_size
    x_tokens = x_tokens[:num_batches * block_size]
    y_tokens = y_tokens[:num_batches * block_size]

    # Reshape into batches
    x_batches = x_tokens.view(-1, block_size)
    y_batches = y_tokens.view(-1, block_size)

    val_dataset = torch.utils.data.TensorDataset(x_batches, y_batches)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    # Evaluate
    val_loss = evaluate(model, val_loader, device)
    bpc = val_loss / math.log(2)
    print(f"Validation Loss: {val_loss:.4f}, Bits per character (bpc): {bpc:.4f}")

Writing evaluate.py


In [47]:
# Evaluate the baseline model

!python evaluate.py --model_type gpt --checkpoint out-enwik8-char/ckpt.pt

  checkpoint = torch.load(args.checkpoint, map_location=device)
number of parameters: 23.35M
Using BaselineGPT model.
Validation Loss: 4.3922, Bits per character (bpc): 6.3367


In [15]:
# Evaluate the modified model

!python evaluate.py --model_type rope --checkpoint out-enwik8-char-rope/ckpt.pt

  checkpoint = torch.load(args.checkpoint, map_location=device)
number of parameters: 27.99M
Using GPTWithRoPE model.
Validation Loss: 1.0763, Bits per character (bpc): 1.5528


In [1]:
from tabulate import tabulate

# Data for the table
data = [
    ["BaselineGPT", "23.35", "6.3367"],
    ["GPTWithRoPE", "27.99", "1.5528"],
]

# Create the table
headers = ["Model", "Parameters (M)", "bpc"]
table = tabulate(data, headers, tablefmt="pipe")

# Print the table
print(table)

| Model       |   Parameters (M) |    bpc |
|:------------|-----------------:|-------:|
| BaselineGPT |            23.35 | 6.3367 |
| GPTWithRoPE |            27.99 | 1.5528 |
