In [1]:

import os
import urllib.request
import torch

In [2]:


# Fetch the minGPT source files into a local "mingpt" directory so they can be
# imported as ``mingpt.model`` / ``mingpt.trainer`` by later cells.
# NOTE(review): the URL tracks the moving ``master`` branch, so the downloaded
# code can change between runs -- consider pinning a commit for reproducibility.
base_url = "https://github.com/karpathy/minGPT/raw/master/mingpt"

# exist_ok=True makes directory creation idempotent without a separate
# check-then-create step.
os.makedirs("mingpt", exist_ok=True)

# One loop instead of three copy-pasted calls; each file is (re)downloaded and
# overwritten on every run of this cell.
for filename in ("model.py", "utils.py", "trainer.py"):
    urllib.request.urlretrieve(f"{base_url}/{filename}", f"mingpt/{filename}")

('mingpt/trainer.py', <http.client.HTTPMessage at 0x29fc0bd9790>)

In [3]:
# Read the training and validation data saved with torch (sliced like 1-D
# token-id tensors by the dataset cells below).
# NOTE(review): torch.load unpickles the file contents -- only load files that
# come from a trusted source.
train_path = "azure-docs-training.pt"
val_path = "azure-docs-validation.pt"

train_data = torch.load(train_path)
val_data = torch.load(val_path)

In [4]:
import torch
from torch.utils.data import Dataset

class AzureDocsDataset(Dataset):
    """Sliding-window next-token dataset over a sequence of token ids.

    Item ``idx`` is the pair
    ``(data[idx:idx + block_size], data[idx + 1:idx + block_size + 1])``,
    i.e. the target is the input window shifted one position to the right.

    Args:
        data: Sequence supporting ``len()`` and slicing (the notebook passes
            the loaded token data).
        block_size: Number of tokens in every input/target window.
    """

    def __init__(self, data, block_size):
        self.block_size = block_size
        self.data = data

    def __len__(self):
        """Return the number of start positions that yield full-size windows."""
        # A window starting at idx needs data[idx : idx + block_size + 1], so the
        # last valid start is len(data) - block_size - 1. Reporting len(data)
        # lets a sampler pick indices near the end, where the slices come back
        # shorter than block_size and cannot be stacked into a batch.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(input_seq, target_seq)`` pair starting at ``idx``.

        Raises:
            IndexError: if ``idx`` is outside ``[0, len(self))``. Slicing never
                raises on its own, so this check is also what ends plain
                iteration over the dataset.
        """
        if not 0 <= idx < len(self):
            raise IndexError(
                f"index {idx} out of range for dataset of length {len(self)}"
            )
        window_end = idx + self.block_size
        input_seq = self.data[idx:window_end]
        # Same window shifted by one: position i's label is token i + 1.
        target_seq = self.data[idx + 1:window_end + 1]
        return input_seq, target_seq
    
# Window length shared by both splits; it is the same 128 that the model cell
# assigns to model_config.block_size.
BLOCK_SIZE = 128

train_dataset = AzureDocsDataset(train_data, BLOCK_SIZE)
val_dataset = AzureDocsDataset(val_data, BLOCK_SIZE)

In [5]:
# Illustration with a tiny window: the input is 5 consecutive tokens and the
# target is the same window shifted right by one, so the only token in the
# target that is not already in the input is the last one (49 in the output
# below) -- that is the new token the model has to predict.
demo_dataset = AzureDocsDataset(data=train_data, block_size=5)
demo_dataset[3]

(tensor([   25, 22134,  5984,   311,  4303]),
 tensor([22134,  5984,   311,  4303,    49]))

In [6]:
from mingpt.model import GPT

# Seed PyTorch's global RNG before the model is built so that any random
# initialisation inside GPT(...) -- and later RNG-dependent steps, when the
# cells are run in order -- repeats across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

model_config = GPT.get_default_config()
model_config.model_type = None      # No preset: hyperparameters are defined explicitly below
model_config.n_layer = 4            # 12 for gpt2, 36 for gpt2-large; 4 here to keep the model tiny
model_config.n_head = 4             # 12 for gpt2, 20 for gpt2-large; 4 here to keep the model tiny
model_config.n_embd = 48            # 768 for gpt2, 1280 for gpt2-large; 48 here to keep the model tiny
model_config.vocab_size = 50257     # gpt2 tokenizer is 50257
model_config.block_size = 128       # same window length the train/val datasets are built with
model = GPT(model_config)

number of parameters: 2.53M


In [16]:
from mingpt.trainer import Trainer

# Start from the trainer defaults and override only the options listed here.
train_config = Trainer.get_default_config()
for option, value in (
    ("learning_rate", 5e-4),  # the model is so small that we can go a bit faster
    ("max_iters", 52),        # short demo run (the log below ends at iter 50)
    ("num_workers", 0),       # presumably data-loading workers -- TODO confirm in mingpt.trainer
):
    setattr(train_config, option, value)
# batch_size is left at the trainer default; an override was tried and disabled:
# train_config.batch_size = 32
trainer = Trainer(train_config, model, train_dataset)

running on device cpu


In [17]:
def batch_end_callback(trainer):
    """Print iteration time and training loss on every 10th iteration.

    Args:
        trainer: Object exposing ``iter_num``, ``iter_dt`` and ``loss``
            (with an ``item()`` method); ``iter_dt`` is multiplied by 1000
            and reported as milliseconds.
    """
    # Stay silent on iterations that are not a multiple of 10.
    if trainer.iter_num % 10 != 0:
        return
    elapsed_ms = trainer.iter_dt * 1000
    step = trainer.iter_num
    loss_value = trainer.loss.item()
    print(f"iter_dt {elapsed_ms:.2f}ms; iter {step}: train loss {loss_value:.5f}")
# Register the progress printer for the trainer's "on_batch_end" event.
trainer.set_callback("on_batch_end", batch_end_callback)

# Start training with the configuration chosen above; progress lines are
# emitted by batch_end_callback.
trainer.run()

iter_dt 0.00ms; iter 0: train loss 10.39498
iter_dt 12707.25ms; iter 10: train loss 6.99273
iter_dt 9259.97ms; iter 20: train loss 7.00274
iter_dt 8995.30ms; iter 30: train loss 6.79745
iter_dt 11587.27ms; iter 40: train loss 6.35579
iter_dt 9007.79ms; iter 50: train loss 6.46167
