In [1]:
# gpt like model, chatgpt as example
# under the hood, how do these work
# Attention is all you need : 2017 - Landmark paper
# GPT : Generative Pretrained Transformer

In [2]:
# Reads like a random machine translation paper
# ended up taking over rest of AI in the next 5 years
# the transformer is the core of GPT-like systems
#
# build something like ChatGPT, but of course it won't be that great
# focus on training a transformer based language model from scratch
# character level lang model

In [3]:
# fairly sized dataset
# tinyshakespeare dataset

In [4]:
# model how characters follow each other
# given context in the past, the model will predict what is likely to come next

# nanoGPT
# repository for training transformers on any given text
# 2 files of 300 LoC each
# Model and Trainer

# write this repository from scratch
# Let's go

In [5]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes, and the explicit encoding keeps the
# decoding independent of the platform's default locale.
with open('tinyshakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()
len(text)

1115394

In [6]:
# peek at the first 100 characters of the corpus
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [7]:
# vocabulary: every distinct character that occurs in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
# show the vocabulary as one string, then its size on the following line
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [8]:
# tokenize: map every character of the vocabulary to an integer id and back
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> id
itos = dict(enumerate(chars))                 # id -> character


def encode(s):
    """Return the list of integer ids for the characters of string `s`.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string built by joining the characters for the ids in `l`.

    Raises KeyError if an id in `l` is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


# round-trip check: encoding and then decoding gives back the original string
print(encode('hi there'))
print(decode(encode('hi there')))

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


In [9]:
# this is just one way to tokenize
# google sentencepiece (subword)
# hf tokenizers
# openai tiktoken (bpe)
#
# we will keep using char level

In [10]:
import torch
# encode the entire corpus once and hold it as a tensor of 64-bit integer ids
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.size(), data.dtype)

torch.Size([1115394]) torch.int64


In [11]:
# first 100 entries of the encoded tensor
print(data[:100])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [12]:
# train on the first 90% of the tokens and hold out the remainder for validation
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

len(train_data), len(val_data)

(1003854, 111540)

In [13]:
# we are not going to feed the entire text through the transformer at once
# instead, sample random little chunks and train on them
# the length of these chunks is called block_size, also known as context length

In [14]:
# context length used in the walkthrough cells
block_size = 8
# take one token more than block_size from the start of the training split
train_data[0:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [15]:
# the chunk above has multiple examples packed into it
# we'll make the model predict simultaneously at all of the positions
# in a chunk of 9 chars, there are 8 individual examples packed in

In [16]:
# inputs are the first block_size tokens; targets are the same window shifted by one
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    # the tokens up to and including position t, paired with the token that follows
    context, target = x[:t + 1], y[t]
    print(f'{context=}, {target=}')

context=tensor([18]), target=tensor(47)
context=tensor([18, 47]), target=tensor(56)
context=tensor([18, 47, 56]), target=tensor(57)
context=tensor([18, 47, 56, 57]), target=tensor(58)
context=tensor([18, 47, 56, 57, 58]), target=tensor(1)
context=tensor([18, 47, 56, 57, 58,  1]), target=tensor(15)
context=tensor([18, 47, 56, 57, 58,  1, 15]), target=tensor(47)
context=tensor([18, 47, 56, 57, 58,  1, 15, 47]), target=tensor(58)


In [17]:
# this is not done just for efficiency because we happen to have the examples
# it is also done to get the transformer used to seeing contexts of varying length,
# so that it is robust with short as well as long contexts

In [18]:
# one more dimension, that is the batch dimension
# done for efficiency, so we can keep the GPUs busy

In [19]:
torch.manual_seed(1337)  # fixed seed so the sampled batches are reproducible
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length, in characters, of each sequence


def get_batch(split, batch_size=batch_size):
    """Sample a random batch of input/target windows from one data split.

    `split == 'train'` reads from `train_data`; any other value reads from
    `val_data`. Returns `(x, y)`, each stacked from `batch_size` windows of
    `block_size` tokens, where `y` holds, for every position of `x`, the
    token that follows it in the data.

    The default for `batch_size` is bound when this cell runs, so the cell
    must be re-run for a change to the global `batch_size` to take effect.
    """
    source = val_data if split != 'train' else train_data
    # start offsets are limited so each window plus its shifted copy fits in the split
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs), torch.stack(targets)

In [20]:
# draw one training batch and show the inputs next to their targets
xb, yb = get_batch('train')

for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('----------')

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----------


In [21]:
# walk every (context, target) pair contained in the batch, row by row
for b in range(batch_size):
    row_in, row_out = xb[b], yb[b]
    for t in range(block_size):
        context, target = row_in[:t + 1], row_out[t]
        print(f'{context=}, {target=}')

context=tensor([24]), target=tensor(43)
context=tensor([24, 43]), target=tensor(58)
context=tensor([24, 43, 58]), target=tensor(5)
context=tensor([24, 43, 58,  5]), target=tensor(57)
context=tensor([24, 43, 58,  5, 57]), target=tensor(1)
context=tensor([24, 43, 58,  5, 57,  1]), target=tensor(46)
context=tensor([24, 43, 58,  5, 57,  1, 46]), target=tensor(43)
context=tensor([24, 43, 58,  5, 57,  1, 46, 43]), target=tensor(39)
context=tensor([44]), target=tensor(53)
context=tensor([44, 53]), target=tensor(56)
context=tensor([44, 53, 56]), target=tensor(1)
context=tensor([44, 53, 56,  1]), target=tensor(58)
context=tensor([44, 53, 56,  1, 58]), target=tensor(46)
context=tensor([44, 53, 56,  1, 58, 46]), target=tensor(39)
context=tensor([44, 53, 56,  1, 58, 46, 39]), target=tensor(58)
context=tensor([44, 53, 56,  1, 58, 46, 39, 58]), target=tensor(1)
context=tensor([52]), target=tensor(58)
context=tensor([52, 58]), target=tensor(1)
context=tensor([52, 58,  1]), target=tensor(58)
context=t

In [22]:
# show the input batch `xb` again
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
