In [1]:
from pathlib import Path
import torch
import sys
sys.path.append('..')
from src.model import GPTLanguageModel

In [2]:
# Dataset directory name under data/raw, and the file name of the training split inside it.
dataset_name = "TinyStories"
train_set_name = "TinyStories-train.txt"

In [3]:
# Location of the raw training split, relative to the notebook's working directory.
file_path = Path.cwd() / '..' / 'data' / 'raw' / dataset_name / train_set_name
# Decode explicitly as UTF-8: without `encoding=` Python falls back to the
# platform locale, which can mis-decode (or fail on) the non-ASCII characters
# in this corpus on systems whose default encoding is not UTF-8.
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Explore the dataset

In [4]:
# Total number of characters in the training text.
print(len(text))

1922767089


In [5]:
# Preview the first 1000 characters of the training text.
print(text[:1000])

One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.
Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."
Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.
<|endoftext|>
Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.
One day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were

# Tokenize the dataset
We will use characters as tokens, just as a baseline.

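Note: the text already contains special tokens such as `<|endoftext|>`, which separates one story from the next. A character-level tokenizer simply splits it into ordinary characters, so we ignore it for now and let the model generate text with no explicit stopping point.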

Note: the data appears to contain several languages, symbols, and emoji; for example, the character set printed below includes Chinese characters. We'll ignore this for now and keep every character in the vocabulary.

In [6]:
# The vocabulary is every distinct character of the training text, in sorted
# order; a character's position in `chars` becomes its token id.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£§«­°´·»¿ÂÉßàáâåèéêíïñóöúāİœɪʏʙʜіғᴀᴄᴅᴇᴏᴛᴜᴡᴢ   ​‌‎‐‑‒–—―‘’‚“”„…  ‪′€™−─❤　。」一了些他但保個們兒兩分到剛又和在天奮她己巴度很恩應把整是時會獨玉田留當的童答米給自興艾莉裡這過難高ﬁﬂ️﻿，￼�𝑐🌴🌹🍌🍞🎓💖🙂🤩
243


In [7]:
class TokenizerBase:
    """Interface for tokenizers: subclasses provide `encode` and `decode`."""

    def encode(self, s):
        """Turn the string `s` into a sequence of token ids (must be overridden)."""
        raise NotImplementedError()

    def decode(self, t):
        """Turn the token ids `t` back into a string (must be overridden)."""
        raise NotImplementedError()


class CharacterTokenizer(TokenizerBase):
    """Tokenizer in which every character is one token.

    Token ids are the positions of the characters in the `chars` sequence
    given at construction time.
    """

    def __init__(self, chars):
        """Build the char->id (`stoi`) and id->char (`itos`) lookup tables."""
        self.stoi = {}
        self.itos = {}
        for index, symbol in enumerate(chars):
            self.stoi[symbol] = index
            self.itos[index] = symbol

    def encode(self, s):
        """Return the list of ids for the characters of `s`.

        Raises KeyError for a character that is not in the vocabulary.
        """
        lookup = self.stoi
        return [lookup[symbol] for symbol in s]

    def decode(self, l):
        """Return the string spelled by the ids in `l`.

        Raises KeyError for an id that is not in the vocabulary.
        """
        lookup = self.itos
        return ''.join(lookup[index] for index in l)

In [8]:
# Build the tokenizer from the training vocabulary and check it on a short
# sample: encode to ids, decode back, then show both.
tokenizer = CharacterTokenizer(chars)
encoded = tokenizer.encode("Hii there")
decoded = tokenizer.decode(encoded)
print(encoded)
print(decoded)

[42, 74, 74, 2, 85, 73, 70, 83, 70]
Hii there


In [9]:
# Encode the whole training text into a 1-D int64 tensor of token ids.
# NOTE(review): encode() first builds a Python list with one entry per
# character; for a corpus of this size that presumably needs a lot of RAM --
# consider encoding in chunks if memory becomes a problem.
train_data = torch.tensor(tokenizer.encode(text), dtype=torch.long)
# Show the first 1000 token ids.
print(train_data[:1000])

tensor([49, 79, 70,  2, 69, 66, 90, 14,  2, 66,  2, 77, 74, 85, 85, 77, 70,  2,
        72, 74, 83, 77,  2, 79, 66, 78, 70, 69,  2, 46, 74, 77, 90,  2, 71, 80,
        86, 79, 69,  2, 66,  2, 79, 70, 70, 69, 77, 70,  2, 74, 79,  2, 73, 70,
        83,  2, 83, 80, 80, 78, 16,  2, 53, 73, 70,  2, 76, 79, 70, 88,  2, 74,
        85,  2, 88, 66, 84,  2, 69, 74, 71, 71, 74, 68, 86, 77, 85,  2, 85, 80,
         2, 81, 77, 66, 90,  2, 88, 74, 85, 73,  2, 74, 85,  2, 67, 70, 68, 66,
        86, 84, 70,  2, 74, 85,  2, 88, 66, 84,  2, 84, 73, 66, 83, 81, 16,  2,
        46, 74, 77, 90,  2, 88, 66, 79, 85, 70, 69,  2, 85, 80,  2, 84, 73, 66,
        83, 70,  2, 85, 73, 70,  2, 79, 70, 70, 69, 77, 70,  2, 88, 74, 85, 73,
         2, 73, 70, 83,  2, 78, 80, 78, 14,  2, 84, 80,  2, 84, 73, 70,  2, 68,
        80, 86, 77, 69,  2, 84, 70, 88,  2, 66,  2, 67, 86, 85, 85, 80, 79,  2,
        80, 79,  2, 73, 70, 83,  2, 84, 73, 74, 83, 85, 16,  1, 46, 74, 77, 90,
         2, 88, 70, 79, 85,  2, 85, 80, 

# Create a dataloader

In [10]:
# Number of tokens in each sampled window; also passed to the model as `seq_len`.
context_length = 64

In [11]:
class DataLoader:
    """Draws random fixed-length windows from a token tensor.

    Attributes:
        context_length: number of tokens in each sampled window.
        batch_size: number of windows returned per batch.
        data: tensor of token ids, indexed along its first dimension.
    """

    def __init__(self, context_length, batch_size, data):
        self.context_length = context_length
        self.batch_size = batch_size
        self.data = data

    def get_batch(self):
        """Sample one batch of (input, target) windows.

        Returns:
            A tuple `(x, y)` of tensors with leading shape
            `(batch_size, context_length)`. `y` is `x` shifted one position
            to the right in `data`, i.e. `y[b, t]` is the token that follows
            `x[b, t]`.

        Raises:
            ValueError: if `data` has fewer than `context_length + 1` items,
                so that no window with a following target token exists.
        """
        n_starts = len(self.data) - self.context_length
        if n_starts <= 0:
            raise ValueError(
                f"data has {len(self.data)} tokens but at least "
                f"{self.context_length + 1} are needed for "
                f"context_length={self.context_length}"
            )

        # Start offsets are drawn from [0, n_starts), so the target window
        # (shifted by one) never runs past the end of the data.
        ix = torch.randint(n_starts, (self.batch_size,))

        # Gather all windows with one indexing operation instead of slicing
        # sample by sample: positions[b, t] = ix[b] + t.
        offsets = torch.arange(self.context_length)
        positions = (ix.unsqueeze(1) + offsets).to(self.data.device)
        x = self.data[positions]
        y = self.data[positions + 1]
        return x, y

In [12]:
# Fix the RNG so the sampled batch below is reproducible.
torch.manual_seed(1337)
dl_train = DataLoader(context_length=context_length, batch_size=32, data=train_data)
xb, yb = dl_train.get_batch()

# Show shape and contents of the input batch, then of the target batch.
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('----')

# Spell out every (context prefix -> next token) training example in the batch.
for row_inputs, row_targets in zip(xb, yb):
    for t in range(dl_train.context_length):
        context = row_inputs[:t+1]
        target = row_targets[t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([32, 64])
tensor([[72, 70, 85,  ..., 44, 86, 84],
        [70, 85,  2,  ..., 71,  2, 85],
        [70, 84, 85,  ..., 67, 83, 74],
        ...,
        [90,  2, 72,  ..., 73, 70, 90],
        [78,  2, 79,  ..., 84, 85, 16],
        [ 2, 42, 70,  ..., 81, 14,  2]])
targets:
torch.Size([32, 64])
tensor([[70, 85,  2,  ..., 86, 84, 85],
        [85,  2, 72,  ...,  2, 85, 80],
        [84, 85,  2,  ..., 83, 74, 72],
        ...,
        [ 2, 72, 66,  ..., 70, 90,  2],
        [ 2, 79, 80,  ..., 85, 16,  2],
        [42, 70,  2,  ..., 14,  2, 36]])
----
when input is [72] the target: 70
when input is [72, 70] the target: 85
when input is [72, 70, 85] the target: 2
when input is [72, 70, 85, 2] the target: 73
when input is [72, 70, 85, 2, 73] the target: 86
when input is [72, 70, 85, 2, 73, 86] the target: 83
when input is [72, 70, 85, 2, 73, 86, 83] the target: 85
when input is [72, 70, 85, 2, 73, 86, 83, 85] the target: 2
when input is [72, 70, 85, 2, 73, 86, 83, 85, 2] th

In [13]:
# File name and location of the validation split (same directory as the training split).
val_set_name = 'TinyStories-valid.txt'
val_path = Path.cwd() / '..' / 'data' / 'raw' / dataset_name / val_set_name
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(val_path, 'r', encoding='utf-8') as f:
    val_text = f.read()

# Encode with the tokenizer built from the training text.
# NOTE(review): encode() raises KeyError for any character that does not occur
# in the training vocabulary -- confirm the validation split has none.
val_data = torch.tensor(tokenizer.encode(val_text), dtype=torch.long)

dl_val = DataLoader(context_length=context_length, batch_size=32, data=val_data)

In [14]:
@torch.no_grad()
def estimate_loss(model, dl_train, dl_val, eval_iters):
    """Estimate the mean loss on the training and validation loaders.

    Args:
        model: module called as `model(X, Y)` and returning `(logits, loss)`.
        dl_train: loader with a `get_batch()` method returning `(X, Y)`.
        dl_val: loader with a `get_batch()` method returning `(X, Y)`.
        eval_iters: number of batches averaged per loader.

    Returns:
        Dict with keys 'train' and 'val', each a 0-d tensor holding the mean
        loss over `eval_iters` batches.

    The model is put in eval mode for the measurement and is returned to
    whatever mode it was in on entry, even if evaluation raises.
    """
    was_training = model.training

    # get_batch() may return tensors on a different device than the model;
    # move each batch to wherever the model's parameters live (no-op if they
    # are already there).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    out = {}
    model.eval()
    try:
        for name, dl in (('train', dl_train), ('val', dl_val)):
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = dl.get_batch()
                if model_device is not None:
                    X, Y = X.to(model_device), Y.to(model_device)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[name] = losses.mean()
    finally:
        # Restore the caller's mode rather than forcing training mode.
        model.train(was_training)
    return out

In [15]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Model hyperparameters collected in one place; the vocabulary size and
# sequence length come from the tokenizer and data-loader setup above.
model_config = dict(
    vocab_size=vocab_size,
    d_model=64,
    seq_len=context_length,
    n_layers=6,
    d_k=64,
    d_v=64,
    n_heads=6,
    device=device,
    dropout=0.2,
)
model = GPTLanguageModel(**model_config)
m = model.to(device)

Count the number of parameters

In [16]:
# Add up the element counts of all parameter tensors and report them in millions.
n_params = sum(p.numel() for p in m.parameters())
print(n_params / 1e6, 'M parameters')

0.825459 M parameters


In [17]:
# Step size used by the optimizer.
learning_rate = 1e-3

# AdamW over all model parameters; every other optimizer setting is left at its default.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

Train the model.

In [18]:
from tqdm import tqdm

eval_iters = 200       # batches averaged per split for each loss estimate
max_iters = 5000       # total optimisation steps
eval_interval = 500    # estimate and print losses every this many steps
ckpt_interval = 1000   # save a checkpoint every this many steps

# Create the checkpoint directory up front so torch.save() cannot fail on a
# missing folder after training has already been running for a while.
ckpt_dir = Path.cwd() / '..' / 'checkpoints'
ckpt_dir.mkdir(parents=True, exist_ok=True)

# `step` rather than `iter`, so the builtin iter() is not shadowed for the
# rest of the kernel session.
for step in tqdm(range(max_iters)):
    is_last_step = step == max_iters - 1
    should_checkpoint = step % ckpt_interval == 0 or is_last_step

    # Losses are estimated at every eval interval and whenever a checkpoint
    # is written, because the checkpoint stores them.
    if step % eval_interval == 0 or should_checkpoint:
        losses = estimate_loss(model, dl_train, dl_val, eval_iters)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        if should_checkpoint:
            # save the model at each checkpoint
            checkpoint = {
                'epoch': step,  # holds the step index, not an epoch count; key name kept as is
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': losses
            }
            # Periodic checkpoints are numbered; the last step is saved as "final".
            ckpt = "final" if is_last_step else step // ckpt_interval

            ckpt_path = ckpt_dir / f'checkpoint_{ckpt}.pt'
            torch.save(checkpoint, ckpt_path)

    xb, yb = dl_train.get_batch()
    # The model was moved to `device`; put the batch there as well
    # (a no-op when it is already on that device).
    xb, yb = xb.to(device), yb.to(device)

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 1/5000 [00:12<17:33:38, 12.65s/it]

step 0: train loss 5.7419, val loss 5.7416


 10%|█         | 501/5000 [01:20<5:00:00,  4.00s/it]

step 500: train loss 1.8440, val loss 1.8405


 20%|██        | 1001/5000 [02:28<4:30:53,  4.06s/it]

step 1000: train loss 1.5322, val loss 1.5342


 30%|███       | 1501/5000 [03:37<3:39:32,  3.76s/it]

step 1500: train loss 1.4114, val loss 1.4069


 40%|████      | 2001/5000 [04:45<3:25:32,  4.11s/it]

step 2000: train loss 1.3431, val loss 1.3410


 50%|█████     | 2501/5000 [05:51<2:36:58,  3.77s/it]

step 2500: train loss 1.2966, val loss 1.2926


 60%|██████    | 3001/5000 [06:57<2:16:09,  4.09s/it]

step 3000: train loss 1.2586, val loss 1.2599


 70%|███████   | 3501/5000 [08:03<1:29:36,  3.59s/it]

step 3500: train loss 1.2342, val loss 1.2375


 80%|████████  | 4001/5000 [09:04<31:01,  1.86s/it]  

step 4000: train loss 1.2122, val loss 1.2112


 90%|█████████ | 4502/5000 [10:09<15:45,  1.90s/it]

step 4500: train loss 1.1938, val loss 1.1914


100%|██████████| 5000/5000 [11:08<00:00,  7.48it/s]

step 4999: train loss 1.1834, val loss 1.1758





Generate from the model.

In [19]:
# Reload the weights saved at the last training step.
ckpt_path = Path.cwd() / '..' / 'checkpoints' / 'checkpoint_final.pt'
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code on load.
# map_location=device lets a checkpoint saved on one device type be loaded on
# a machine that only has the other.
checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True)
model.load_state_dict(checkpoint["model_state_dict"])
# Switch to inference mode before generating.
model.eval()
m = model.to(device)

  checkpoint = torch.load(ckpt_path)


In [21]:
# Prompt: one sequence of `context_length` tokens, all with id 0, on the model's device.
context = torch.zeros(1, context_length, dtype=torch.long, device=device)
# Generate 2000 new tokens and take the single sequence of the batch as a list of ids.
generated_batch = m.generate(context, max_new_tokens=2000)
generated = generated_batch[0].tolist()
# Map the ids back to text and show the result.
decoded = tokenizer.decode(generated)
print(decoded)

																																																																. So excited, "Where it sand. They had the wind down. They wanted to go home. It was tried roll that on, he she had becore by for a big duch. She had a said, "Mom and Wen the too keap!" But said, she didn't know it came and the sun. He jot for the halpy on. It doods was a big big flace inside. She had not room would looked eather. The wanted to feight it and was very locts of his dall thrughth, it was was very rible. He saw anytoe things the little girl's happy that was a lift out important to it in the box. He hug to draw and encing calling and feel the little boy was book! Cit chiled with the mom on it too by tooks with play so friends." they both Mia back quiesting flew ug suite. Lily was shappy to her it to saap inside his seany, "I took not," started. You do your me we vat to go of about it. It was walking home." He gave to play outside in the box, she coor and started to feacher it was a doll. He was sturning so clim