In [1]:
pip install transformers tiktoken

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting tiktoken
  Downloading http://mirrors.tencentyun.com/pypi/packages/5f/36/93115cf8bdb62284dd64c01893b208ac46e586cb6e01ad34bbe0784b90f4/tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
import regex as re
import requests
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
import pickle
import math
import time
from collections import defaultdict
import tiktoken

# 0) Generative Pre-trained Transformer (GPT)

GPT model only has decoder block.
While encoder block only generate the output with similar length as input,
decoder is generative in nature.


GPT-2 does not require the encoder part of the transformer architecture because the model uses a masked self-attention that can only look at prior tokens. The encoder is not needed because the model does not need to learn the representation of the input sequence.


It produces estimates for the probability of the next word as outputs but it is auto-regressive as each token in the sentence has the context of the previous words. Thus GPT-2 works one token at a time.


BERT, by contrast, is not auto-regressive. It uses the entire surrounding context all-at-once. GPT-2 the context vector is zero-initialized for the first word embedding.
**

# 1) Tokenization (Byte Pair Encoding)

The GPT-2 and RoBERTa tokenizers (which are pretty similar) have a clever way to deal with this: they don’t look at words as being written with Unicode characters, but with bytes. This way the base vocabulary has a small size (256), but every character you can think of will still be included and not end up being converted to the unknown token. This trick is called byte-level BPE.

Description: https://huggingface.co/course/chapter6/5?fw=pt

Code: https://github.com/karpathy/minGPT/blob/master/mingpt/bpe.py

# 2) Dataset

In [3]:
# download the tiny shakespeare dataset
data_dir = os.path.join('data', 'tinyshakespeare')
input_file_path = os.path.join(data_dir, 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    os.makedirs(data_dir)
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r') as f:
    data = f.read()
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode with tiktoken gpt2 bpe
enc = tiktoken.get_encoding("gpt2")
train_ids = enc.encode_ordinary(train_data)
val_ids = enc.encode_ordinary(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(os.path.join(data_dir, 'train.bin'))
val_ids.tofile(os.path.join(data_dir, 'val.bin'))

train has 301,966 tokens
val has 36,059 tokens


In [4]:
class GPTConfig:
    def __init__(self, vocab_size, **kwargs):
        self.vocab_size = vocab_size
        for key, value in kwargs.items():
            setattr(self, key, value)

class CustomConfig(GPTConfig):
    # model
    n_layer = 8
    n_head = 8
    n_embd = 256
    embd_pdrop = 0.1
    resid_pdrop = 0.1
    attn_pdrop = 0.1
    dropout = 0.1
    compile = True

    # data
    device = 'cuda'
    num_workers = 0

    # optimizer parameters
    max_iters = 2e4
    batch_size = 4
    block_size = 64
    learning_rate = 6e-4
    betas = (0.9, 0.95)
    weight_decay = 1e-1
    grad_norm_clip = 1.0

# config
vocab_size = len(train_ids)
config = CustomConfig(vocab_size=vocab_size)

In [5]:
# read data from .bin
data_dir = os.path.join('data', 'tinyshakespeare')
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')

class ShakespeareDataset(Dataset):
    def __init__(self, split, block_size=128, device_type='cuda'):
        assert split in {'train', 'test'}
        self.split = split
        self.block_size = block_size
        self.device_type = device_type
        self.data = train_data if split == 'train' else val_data
    
    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # ix = torch.randint(len(data) - block_size, (batch_size,))
        # x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
        # y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
        x = torch.from_numpy(self.data[idx : idx + self.block_size].astype(np.int64))
        y = torch.from_numpy(self.data[idx + 1 : idx + 1 + self.block_size].astype(np.int64)) 

        if self.device_type == 'cuda':
            # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
            x, y = x.pin_memory().to('cuda', non_blocking=True), y.pin_memory().to('cuda', non_blocking=True)
        else:
            x, y = x.to('cpu'), y.to('cpu')
        return x, y

train_dataset = ShakespeareDataset('train', config.block_size, config.device)
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, num_workers=config.num_workers, drop_last=False)
test_dataset = ShakespeareDataset('test', config.block_size, config.device)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size, num_workers=config.num_workers, drop_last=False)
sample_data = next(iter(train_loader))
x, y = sample_data
print("x:", x.size())
print("y:", y.size())

x: torch.Size([4, 64])
y: torch.Size([4, 64])


# 3) Modeling

### code above defines a class CausalSelfAttention that implements the causal self-attention mechanism in the GPT model.##

- The forward method is where the actual computation takes place. The method receives a tensor x of shape (batch_size, seq_len, emb_dim) as input.
- It splits the input x into query, key, and value tensors for all heads and reshapes them accordingly. It then computes the attention score matrix using either the fast flash attention (torch.version ≥ 2.0)or the slower dot product method, depending on the pytorch version.
- In the case of dot product attention, the attention score matrix is computed using matrix multiplication between the query and key tensors, followed by scaling by the square root of the key tensor’s dimension.
- A mask is then applied to ensure that the attention is only applied to the left in the input sequence. In GPT, the masking is done using a triangular mask that blocks the model from attending to any word that comes after the current word in the sequence. To achieve this, we use torch.tril(torch.ones(n, n)) to create a lower-triangular matrix of ones. The tril function zeros out all elements above the diagonal of the matrix.
- The resulting matrix is then normalized using the softmax function and multiplied by the value tensor to obtain the output. All these steps that we mentioned are actually the simple translation from the equation
- Finally, the output is projected onto the same dimensionality as the input, using a residual connection and the output projection

In [6]:
class NewGELU(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """
    def forward(self, x):
        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


class CausalSelfAttention(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    It's important in decoder block to have diagonal mask
    It is also possible to use torch.nn.MultiheadAttention.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.dropout = config.dropout
        self.n_head = config.n_head
        self.n_embd = config.n_embd

        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer(
                "mask", 
                torch.tril(torch.ones(config.block_size, config.block_size)
            ).view(1, 1, config.block_size, config.block_size))
            
    def forward(self, x):
        # batch_size, seq_len, emb_dim
        B, T, C = x.size() 

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # (b, seq_len, emb_dim) --> (b, seq_len, emb_dim * 3) --> (b, seq_len, emb_dim)
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (b, h, seq_len, d_k)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (b, h, seq_len, d_k)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (b, h, seq_len, d_k)
        
        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True
            )
        else:
            # (b, h, seq_len, d_k) matmul (b, h, d_k, seq_len) --> (b, h, seq_len, seq_len)
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            # diagonal mask
            # fill 0 mask with super small number so it wont affect the softmax weight
            # (batch_size, h, seq_len, seq_len)
            att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)

            # (b, h, seq_len, seq_len) matmul (b, h, seq_len, d_k) --> (b, h, seq_len, d_k)
            y = att @ v 

        # (b, h, seq_len, d_k) --> (b, seq_len, h, d_k) --> (b, seq_len, d_model)
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y


class Block(nn.Module):
    """ GPT only contain decode block"""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)

        self.mlp = nn.ModuleDict(dict(
            c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd),
            act     = NewGELU(),
            c_proj  = nn.Linear(4 * config.n_embd, config.n_embd),
            dropout = nn.Dropout(config.resid_pdrop),
        ))
        m = self.mlp
        self.mlpf = lambda x: m.dropout(m.c_proj(m.act(m.c_fc(x))))

    def forward(self, x):
        
        # (batch_size, seq_len, emb_dim)
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlpf(self.ln_2(x))
        return x

### testing
wte = nn.Embedding(config.vocab_size, config.n_embd).to(config.device)
block = Block(config).to(config.device)

tok_emb = wte(x)
print('Token Embedding Size:', tok_emb.size())

block_out = block(tok_emb)
print('Block Output Size:', block_out.size())

Token Embedding Size: torch.Size([4, 64, 256])
Block Output Size: torch.Size([4, 64, 256])


- The constructor (__init__) initializes the GPT model with the given configuration. The GPT model combined several components which are the embedding layer for word tokens wte, embedding layer for positional encoding wpe, decoder blocks Block and finally a layer normalization layer applied to the output of the transformer ln_f.
- Meanwhile, the constructor initializes the weights of the GPT model using a special scaled initialization technique, as described in the GPT-2 paper. It also sets up an optimizer for training the model, with separate weight decay settings for different parts of the model.
- The forward method computes the forward pass of the GPT model. It takes as input a tensor of word indices (idx) and a tensor of target indices (targets). The method first applies an embedding layer to the word indices and a positional encoding layer to the position indices. It then applies the transformer layers to the resulting tensor.
- Next, it applies the language model head to the output of the transformer to obtain a probability distribution over the vocabulary.
- Lastly, it computes the cross-entropy loss between the predicted distribution and the target distribution.

In [7]:
class GPT(nn.Module):
    """ GPT Language Model """

    def __init__(self, config):
        super().__init__()
        self.block_size = config.block_size

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.embd_pdrop),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters (note we don't count the decoder parameters in lm_head)
        n_params = sum(p.numel() for p in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def configure_optimizers(self, train_config):

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        
        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"

        # positional token, shape (1, t)
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) 

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        
        x = self.transformer.ln_f(x)
        # (b, t, n_embd) -- > # (b, t, vocab_size)
        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        # -1 at output will be ignored
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b, t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            if do_sample:
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                _, idx_next = torch.topk(probs, k=1, dim=-1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

### testing
wte = nn.Embedding(config.vocab_size, config.n_embd).to(config.device)
model = GPT(config).to(config.device)
model = torch.compile(model)

# sample dataset from data loader
logits, loss = model.forward(x, y)
print('logits: ', logits.size())
print('loss: ', loss)

number of parameters: 83.64M


  from .autonotebook import tqdm as notebook_tqdm


logits:  torch.Size([4, 64, 301966])
loss:  tensor(12.6740, device='cuda:0', grad_fn=<CompiledFunctionBackward>)


### Word Generation ###
GPT is an auto-regressive language model that takes in a conditioning sequence of indices and then generates new text one token at a time. The model generates each token based on the preceding tokens in the sequence.

- The generate function is a method in the GPT class that generates new text based on a given input sequence. It takes in a conditioning sequence of indices idx of shape (batch size, sequence length). The function then completes the sequence max_new_tokens times, feeding the predictions back into the model each time.
-It forward passes the model to get the logits for the index in the sequence. The logits represent the unnormalized probability distribution over the vocabulary of possible tokens.
- Next, the function plucks the logits at the final step and scales them by a desired temperature. The temperature is used to control the randomness of the generated output. Higher temperatures lead to more diverse and random outputs, while lower temperatures lead to more conservative and predictable outputs.
- Then, it applies softmax to convert the logits to normalized probabilities. The probabilities represent the likelihood of each token in the vocabulary to be the next token in the generated sequence.
- Finally, the function either samples from the probability distribution using torch.multinomial(). It then appends the sampled index to the running sequence and continues the loop until max_new_tokens is reached.

In [13]:
model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(301966, 256)
    (wpe): Embedding(64, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x Block(
        (ln_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=256, out_features=768, bias=True)
          (c_proj): Linear(in_features=256, out_features=256, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=256, out_features=1024, bias=True)
          (act): NewGELU()
          (c_proj): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_f

### Trainer ###
After thoroughly exploring the GPT model and analyzing the provided source code, we are now equipped with the knowledge and understanding necessary to train the model using Shakespearean data. We can now confidently hit the “begin” button to initiate the training process and watch as the model learns to generate Shakespearean text.

This code defines the Trainer class, which is responsible for training the GPT model.

- It sets up a dictionary of callbacks, which will be triggered at various events during the training process.
- The run method of the Trainer class sets up the optimizer and data loader, and then enters a loop that trains the model.
- In each iteration of the loop, a batch of data is fetched, the model is forward-propagated, the loss is calculated, the gradients are backpropagated, and the parameters are updated using an optimizer.
- The training progress is logged using the batch_end_callback, which is called every n iterations. The loop continues until a termination condition is met, which could be a maximum number of iterations specified in the configuration object.
- A lightweight model is trained after 20000 iteration.

In [8]:
class Trainer:

    def __init__(self, config, model, train_dataset):
        self.config = config
        self.model = model
        self.optimizer = None
        self.train_dataset = train_dataset
        self.callbacks = defaultdict(list)
        self.device = config.device
        self.model = self.model.to(self.device)

        # variables that will be assigned to trainer class later for logging and etc
        self.iter_num = 0
        self.iter_time = 0.0
        self.iter_dt = 0.0

    def add_callback(self, onevent: str, callback):
        self.callbacks[onevent].append(callback)

    def set_callback(self, onevent: str, callback):
        self.callbacks[onevent] = [callback]

    def trigger_callbacks(self, onevent: str):
        for callback in self.callbacks.get(onevent, []):
            callback(self)

    def run(self):
        model, config = self.model, self.config

        # setup the optimizer
        self.optimizer = model.configure_optimizers(config)

        # setup the dataloader
        train_loader = DataLoader(
            self.train_dataset,
            sampler=torch.utils.data.RandomSampler(self.train_dataset, replacement=True, num_samples=int(1e10)),
            shuffle=False,
            # pin_memory=True,
            batch_size=config.batch_size,
            num_workers=config.num_workers,
        )

        model.train()
        self.iter_num = 0
        self.iter_time = time.time()
        data_iter = iter(train_loader)
        while True:

            # fetch the next batch (x, y) and re-init iterator if needed
            try:
                batch = next(data_iter)
            except StopIteration:
                data_iter = iter(train_loader)
                batch = next(data_iter)
            batch = [t.to(self.device) for t in batch]
            x, y = batch

            # forward the model
            logits, self.loss = model(x, y)

            # backprop and update the parameters
            model.zero_grad(set_to_none=True)
            self.loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
            self.optimizer.step()

            self.trigger_callbacks('on_batch_end')
            self.iter_num += 1
            tnow = time.time()
            self.iter_dt = tnow - self.iter_time
            self.iter_time = tnow

            # termination conditions
            if config.max_iters is not None and self.iter_num >= config.max_iters:
                break

model = GPT(config)
trainer = Trainer(config, model, train_dataset)
trainer = Trainer(config, model, train_dataset)

def batch_end_callback(trainer):
    if trainer.iter_num % 500 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

number of parameters: 83.64M
iter_dt 0.00ms; iter 0: train loss 12.68121
iter_dt 107.74ms; iter 500: train loss 5.84700
iter_dt 109.06ms; iter 1000: train loss 5.15548
iter_dt 109.84ms; iter 1500: train loss 4.64236
iter_dt 107.30ms; iter 2000: train loss 5.25185
iter_dt 109.86ms; iter 2500: train loss 4.20766
iter_dt 126.89ms; iter 3000: train loss 4.47053
iter_dt 114.59ms; iter 3500: train loss 4.71512
iter_dt 106.33ms; iter 4000: train loss 4.37974
iter_dt 107.54ms; iter 4500: train loss 4.69097
iter_dt 106.41ms; iter 5000: train loss 4.37470
iter_dt 107.34ms; iter 5500: train loss 4.54241
iter_dt 106.78ms; iter 6000: train loss 4.19148
iter_dt 111.34ms; iter 6500: train loss 4.45951
iter_dt 109.48ms; iter 7000: train loss 3.87133
iter_dt 111.91ms; iter 7500: train loss 4.31350
iter_dt 112.46ms; iter 8000: train loss 3.86543
iter_dt 110.74ms; iter 8500: train loss 3.99068
iter_dt 105.17ms; iter 9000: train loss 4.26273
iter_dt 120.13ms; iter 9500: train loss 3.57195
iter_dt 110.80ms

Now that the model has been trained, we can use it to generate Shakespearean text based on any random input.

In [9]:
text = 'Lord:\nRise! My people, conquer the north!'
sample_ids = torch.Tensor(enc.encode_ordinary(text)).long()
sample_ids = torch.unsqueeze(sample_ids, 0).to(config.device)
result = model.generate(sample_ids, max_new_tokens=50, temperature=1, do_sample=False, top_k=None)
print(enc.decode(result.detach().cpu().tolist()[0]))

Lord:
Rise! My people, conquer the north!

CAMILLO:
I am too.

FLORIZEL:
I am a man: I am a king, and I,
And I, that, to be a king,
And, by my father


In [12]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './model_save/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'nanoGPT-module') else model  # Take care of distributed/parallel training
torch.save(model.state_dict(), output_dir+"nanoGPT.path")

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))

Saving model to ./model_save/
