<a href="https://colab.research.google.com/github/wwucla/gpt1/blob/main/gpt1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intro
This colab is largely inherent from [Andrej Karpathy's colab](https://colab.research.google.com/drive/1JMLa53HDuA-i7ZBmqV7ZnA3c_fvtXnx-?usp=sharing#scrollTo=wJpXpmjEYC_T).

Instead of having dev/test code in Colab and build prod code in Python binary (to run on a GPU server), I am putting all code in Colab due to lack of a workstation with GPU.

# Code Setup (Import, Configs, Dataset download)

In [1]:
# @title Imports

!pip install tiktoken
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
from enum import Enum

class LoggingLevel(Enum):
    NO_LOGGING = 1 # No logging unless code failed
    CONCISE = 2 # Logging critical information such as training/validation metrics
    VERBOSE = 3 # Logging debugging information


Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [17]:
# @title Global Configs And Hparams
batch_size = 64
block_size = 256 # max context length for prediction
n_embd = 384
n_layer = 6
n_head = 6
dropout = 0.2
max_iters = 5000 # debugging only, update to 3000 to reproduce Karpathy's version
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu' # unused at this moment
eval_iters = 200

torch.manual_seed(1337)
logging_level = LoggingLevel.VERBOSE

def print_verbose(*argv):
  if(logging_level == logging_level.VERBOSE):
    print(*argv)

def print_concise(*argv):
  if(logging_level in [logging_level.VERBOSE, logging_level.CONCISE]):
    print(*argv)

print_concise('device:', device)

device: cuda


In [3]:
# @title Download and read Dataset into `text`
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

--2024-11-01 07:19:46--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-11-01 07:19:46 (107 MB/s) - ‘input.txt’ saved [1115394/1115394]



# "Tokenization" and Model training

In [4]:
# All unique chars
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Create "tokenizer" (unique chars), mapping unique chars to integers
stoi = {ch : i for i, ch in enumerate(chars)}
itos = {i : ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]  # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

data = torch.tensor(encode(text), dtype = torch.long)
n = int(0.9 * len(data)) # first 90% for train, rest 10% for val
train_data = data[:n]
val_data = data[n:]

In [16]:
from math import sqrt
from re import VERBOSE
# @title Training loops (in batches）

def get_batch(split):
  # generate a small batch of data of inputs x and targets y
  data = train_data if split== 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x, y


@torch.no_grad()
def estimate_loss(model):
  """
  Estimate train and val losses and train model for one iteration
  """
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split) # (B, T), (B, T)
      logits, loss = model(X, Y) # (B*T, C), 1
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out


class AttentionHead(nn.Module):
  """Single head attention"""

  def __init__(self, head_size):
    super().__init__()
    self.query = nn.Linear(n_embd, head_size, bias = False)
    self.key = nn.Linear(n_embd, head_size, bias = False)
    self.value = nn.Linear(n_embd, head_size, bias = False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B,T,H = x.shape
    k = self.key(x) # (B, T, H)
    q = self.query(x) # (B, T, H)
    v = self.value(x) # (B, T, H)
    att_score = q @ k.transpose(-2, -1) # (B, T, H) @ (B, H, T) --> (B, T, T)

    att_score = att_score.masked_fill(self.tril[:T, :T]==0, float('-inf')) # (B, T, T)
    att_score = F.softmax(att_score / sqrt(H), dim = -1)
    att_score = self.dropout(att_score)
    out = att_score @ v # (B, T, T) @ (B, T, H) --> (B, T, H)
    return out


class MultiHeadAttention(nn.Module):

  def __init__(self, n_head, head_size):
    super().__init__()
    self.heads = nn.ModuleList([AttentionHead(head_size) for _ in range(n_head)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    x = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(x)
    return self.dropout(out)


class FeedForward(nn.Module):
  """Linear layer followed by ReLU """

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd,4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd),
        nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)


class DecoderBlock(nn.Module):

  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ff = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ff(self.ln2(x))
    return x


# Bigram language model
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd) # learnable position encoding
    # n_head = 4
    # self.sa = AttentionHead(head_size = n_embd)
    # self.mha = MultiHeadAttention(n_head, n_embd//n_head)
    # self.ff = FeedForward(n_embd)
    # self.decoders = nn.Sequential(
    #     DecoderBlock(n_embd, n_head=4),
    #     DecoderBlock(n_embd, n_head=4),
    #     DecoderBlock(n_embd, n_head=4),
    #     nn.LayerNorm(n_embd),
    # )
    self.decoders = nn.Sequential(*[DecoderBlock(n_embd, n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    """
    Forward pass of the model.
    Input: idx - B, T
    Input (optional): targets - B, T
    Returns:
      logits - B, T, C (batch, time, channels)
      loss - scalar
    """
    # idx and targets are both (B,T) tensor of targets
    B, T = idx.shape

    tok_emb = self.token_embedding_table(idx) # (B, T, C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
    x = tok_emb + pos_emb # (B, T, C)

    # x = self.sa(x) # (B, T, C)
    # x = self.mha(x) # (B, T, C)
    # x = self.ff(x) # (B, T, C)
    x = self.decoders(x)

    logits = self.lm_head(x) # (B, T, vocab_size)

    # cross_entropy expects input shape to be (N,C), and target shape to be (N)
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T)) # reduced to scalar

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # crop idx to the last block_size tokens (like a sliding window)
      idx_cond = idx[:, -block_size:]
      # get predicted logits
      logits, _ = self(idx_cond) # (B, T, C)
      # get the logits of the last timestamp
      logits = logits[:, -1, :] # (B, C)
      # applys softmax to get prob
      prob = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(prob, num_samples=1) # (B, 1)
      # append sampled idx to the running sequence
      idx = torch.cat([idx, idx_next], dim=-1) # (B, T+1)
    return idx


model = BigramLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

for iter in range(max_iters):

  # every once in a while, evaluate the loss on train and val sets
  if iter % eval_interval == 0:
    losses = estimate_loss(m)
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  optimizer.zero_grad(set_to_none=True)
  logits, loss = model(xb, yb)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.5945, val loss 4.5843
step 500: train loss 1.7777, val loss 1.9304
step 1000: train loss 1.4576, val loss 1.6745
step 1500: train loss 1.3231, val loss 1.5745
step 2000: train loss 1.2399, val loss 1.5276
step 2500: train loss 1.1800, val loss 1.5212
step 3000: train loss 1.1199, val loss 1.5220
step 3500: train loss 1.0633, val loss 1.5195
step 4000: train loss 1.0078, val loss 1.5434
step 4500: train loss 0.9479, val loss 1.5714

Your worships to havine seen thing:
Your kinsman, why should he did fol them.

BRUTUS:
Take how then virtue look upon this infingerty means
Be venued my salverity, and am I
Willing willingly. Go: I hear, old tempert in
Aime: for my soul's burnt your own way,
Send love and your feitherward is sufficient.

First Senator:
Welcome.

MENENIUS:
This is a kind age,
To unan men have look'd on's back'd with Caius Volscians' delives?

Second Servingman:
We had rather same their powers:
Strike like us func


In [45]:
# @title train for more iterations (don't run on large network, too slow)
# for iter in range(10000):

#   # every once in a while, evaluate the loss on train and val sets
#   if iter % eval_interval == 0:
#     losses = estimate_loss(m)
#     print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

#   # sample a batch of data
#   xb, yb = get_batch('train')

#   # evaluate the loss
#   optimizer.zero_grad(set_to_none=True)
#   logits, loss = model(xb, yb)
#   loss.backward()
#   optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 2.2571, val loss 2.2846
step 500: train loss 2.2584, val loss 2.2648
step 1000: train loss 2.2318, val loss 2.2642
step 1500: train loss 2.2387, val loss 2.2619
step 2000: train loss 2.2186, val loss 2.2422
step 2500: train loss 2.2152, val loss 2.2525
step 3000: train loss 2.2088, val loss 2.2236
step 3500: train loss 2.2133, val loss 2.2317
step 4000: train loss 2.1962, val loss 2.2380
step 4500: train loss 2.1978, val loss 2.2349
step 5000: train loss 2.1936, val loss 2.2457
step 5500: train loss 2.1898, val loss 2.2195
step 6000: train loss 2.2012, val loss 2.2250
step 6500: train loss 2.1954, val loss 2.2216
step 7000: train loss 2.1854, val loss 2.2184
step 7500: train loss 2.1675, val loss 2.2101
step 8000: train loss 2.1725, val loss 2.2153
step 8500: train loss 2.1605, val loss 2.2154
step 9000: train loss 2.1605, val loss 2.2160
step 9500: train loss 2.1617, val loss 2.2115

Kink;
Ted ppalent licill your hinoth.

RINGNETROD:
THod bereey userle ambeged onoty

In [23]:
# @title debugging code for self-attention block
B, T, C = 4, 8, 32
x = torch.randn(B,T,C)

head_size = C
query = nn.Linear(C, head_size, bias = False)
key = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)
k = key(x) # (B, T, C)
q = query(x) # (B, T, C)
v = value(x) # (B, T, C)
att_score = q @ k.transpose(-2, -1) # (B, T, C) @ (B, C, T) --> (B, T, T)

tril = torch.tril(torch.ones(T, T))
att_score = att_score.masked_fill(tril==0, float('-inf'))
att_score = F.softmax(att_score / sqrt(block_size), dim = -1)
out = att_score @ v # (B, T, T) @ (B, T, C) --> (B, T, C)
print(out.shape)

# out = F.relu(out)

torch.Size([4, 8, 32])


In [None]:
# @title Debugging code: Enums, Dataset basics, Vocab, "Tokenizers"(char vs tiktokenizer), tokenzied data (context vs labels)

# Some Enum basics
print_verbose(LoggingLevel.CONCISE)
print_verbose(LoggingLevel.CONCISE.name)
print_verbose(LoggingLevel.CONCISE.value)
print_verbose(type(LoggingLevel.CONCISE))
print_verbose(repr(LoggingLevel.CONCISE))
print_verbose(list(LoggingLevel))

print_verbose("length of dataset in characters: ", len(text))
print_verbose(text[:1000])

# all unique characters that occur in this text
print_verbose(''.join(chars))
print_verbose(vocab_size)

# tokenizer and reversed tokenizer
print_verbose(encode('Shakespeare!!!'))
print_verbose(decode((encode('Shakespeare!!!'))))

## test code: using gpt2 tokenizer
enc = tiktoken.get_encoding('gpt2')
print_verbose(enc.n_vocab)
print_verbose(enc.encode('Hii there'))

# First 100 "tokens"
print_verbose(data.shape, data.dtype)
print_verbose(data[:100])

# Context vs labels in training data
block_size = 8
train_data[:block_size+1]
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print_verbose(f"when input is {context} the target: {target}")