<a href="https://colab.research.google.com/github/wwucla/gpt1/blob/main/gpt1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intro
This colab is largely inherent from [Andrej Karpathy's colab](https://colab.research.google.com/drive/1JMLa53HDuA-i7ZBmqV7ZnA3c_fvtXnx-?usp=sharing#scrollTo=wJpXpmjEYC_T).

Instead of having dev/test code in Colab and build prod code in Python binary (to run on a GPU server), I am putting all code in Colab due to lack of a workstation with GPU.

# Code Setup (Import, Configs, Dataset download)

In [1]:
# @title Imports

!pip install tiktoken
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
from enum import Enum

class LoggingLevel(Enum):
    NO_LOGGING = 1 # No logging unless code failed
    CONCISE = 2 # Logging critical information such as training/validation metrics
    VERBOSE = 3 # Logging debugging information


Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [63]:
# @title Global Configs And Hparams
batch_size = 4
block_size = 8 # max context length for prediction
max_iters = 3000 # debugging only, update to 3000 to reproduce Karpathy's version
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu' # unused at this moment
eval_iters = 200

torch.manual_seed(1337)
logging_level = LoggingLevel.VERBOSE

def print_verbose(*argv):
  if(logging_level == logging_level.VERBOSE):
    print(*argv)

def print_concise(*argv):
  if(logging_level in [logging_level.VERBOSE, logging_level.CONCISE]):
    print(*argv)


In [3]:
# @title Download and read Dataset into `text`
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

--2024-10-31 22:07:52--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-10-31 22:07:52 (18.1 MB/s) - ‘input.txt’ saved [1115394/1115394]



# "Tokenization" and Model training

In [4]:
# All unique chars
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Create "tokenizer" (unique chars), mapping unique chars to integers
stoi = {ch : i for i, ch in enumerate(chars)}
itos = {i : ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]  # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

data = torch.tensor(encode(text), dtype = torch.long)
n = int(0.9 * len(data)) # first 90% for train, rest 10% for val
train_data = data[:n]
val_data = data[n:]

In [70]:
from re import VERBOSE
# @title Training loops (in batches）

def get_batch(split):
  # generate a small batch of data of inputs x and targets y
  data = train_data if split== 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x, y


@torch.no_grad()
def estimate_loss(model):
  """
  Estimate train and val losses and train model for one iteration
  """
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split) # (B, T), (B, T)
      logits, loss = model(X, Y) # (B*T, C), 1
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out


# Bigram language model
class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """
    Forward pass of the model.
    Input: idx - B, T
    Input (optional): targets - B, T
    Returns:
      logits - B, T, C (batch, time, channels)
      loss - scalar
    """
    # idx and targets are both (B,T) tensor of targets
    logits = self.token_embedding_table(idx) # (B, T, C)
    # cross_entropy expexts input shape to be (N,C), and target shape to be (N)
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T)) # reduced to scalar

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # get predicted logits
      logits, _ = self(idx) # (B, T, C)
      # get the logits of the last timestamp
      logits = logits[:, -1, :] # (B, C)
      # applys softmax to get prob
      prob = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(prob, num_samples=1) # (B, 1)
      # append sampled idx to the running sequence
      idx = torch.cat([idx, idx_next], dim=-1) # (B, T+1)
    return idx


model = BigramLanguageModel(vocab_size)
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

for iter in range(max_iters):

  # every once in a while, evaluate the loss on train and val sets
  if iter % eval_interval == 0:
    losses = estimate_loss(m)
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  optimizer.zero_grad(set_to_none=True)
  logits, loss = model(xb, yb)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.5775, val loss 4.5764
step 300: train loss 4.3537, val loss 4.3620
step 600: train loss 4.1556, val loss 4.1595
step 900: train loss 3.9995, val loss 3.9646
step 1200: train loss 3.8202, val loss 3.8394
step 1500: train loss 3.6709, val loss 3.6695
step 1800: train loss 3.5300, val loss 3.5627
step 2100: train loss 3.4351, val loss 3.4251
step 2400: train loss 3.3093, val loss 3.3311
step 2700: train loss 3.2238, val loss 3.2158

AsGq?GVxEX&
UKNAfxsntru;imXnzWWccuei;tULkR:O;yor!'CHOsen.
SEOhdorvjoYArc&hy,; he,w' h;k.
LIX!wE&STIszdye$Bqc
RCd x-hoxtbvyat?d tOa alleiLieCuFq;-kT?$bcot an-bjxv&FcChle w'th -AR:RRvkVPLq;ofNronTssRieweisel&PDg, yye d m,FfrfGWP!f s?y;Sr;
sqFIpupu
RW?&-pX;.
EYcaHyn:s tEofxethJ, g-
YBhdyanfvk te?LExI-ivind BlllUY
LEyeanttyGcXjN:VPvKGcyeenNop -lar'APDk:Knd mirnd ghIOWfUDibptho lAelndE&X'shItiCLymi-AFl$D:PD:stOVG onwhifZXJThip OzdAEndc;&
akLqY;Ste hEHIUWNPtorsehdHsucJ.
S,LDu
ScGKfKLZXesI:MqZeEd.
W' m


In [None]:
# @title Debugging code: variable shapes in model inference

# xb: B T C
torch.manual_seed(1337)
max_new_tokens = 10
xb, yb = get_batch('train')
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
# print_verbose(logits.shape)
# print_verbose(loss)

# # generate text
# context_idx = torch.zeros((1,1), dtype=torch.long)
# generated = m.generate(context_idx, max_new_tokens)[0].tolist()
# print_verbose(generated)
# print_verbose(decode(generated))

# # test context with 2 batches
context_idx = torch.zeros((2,1), dtype=torch.long)
generated0 = m.generate(context_idx, max_new_tokens)[0].tolist()
generated1 = m.generate(context_idx, max_new_tokens)[1].tolist()
print_verbose("generated text0: ", decode(generated0))
print_verbose("generated text1: ", decode(generated1))

# # Degbugging code: print out context & labels in batches
# xb, yb = get_batch('train')
# print_verbose('inputs:', xb.shape)
# print_verbose(xb)
# print_verbose('targets', yb.shape)
# print_verbose(yb)
# print_verbose('----')

# for b in range(batch_size): # batch dim
#   for t in range(block_size): # time dim
#     context = xb[b, :t+1]
#     target = yb[b, t]
#     print(f"when input is {context.tolist()} the target: {target}")

In [23]:
# @title Debugging code: Enums, Dataset basics, Vocab, "Tokenizers"(char vs tiktokenizer), tokenzied data (context vs labels)

# Some Enum basics
print_verbose(LoggingLevel.CONCISE)
print_verbose(LoggingLevel.CONCISE.name)
print_verbose(LoggingLevel.CONCISE.value)
print_verbose(type(LoggingLevel.CONCISE))
print_verbose(repr(LoggingLevel.CONCISE))
print_verbose(list(LoggingLevel))

print_verbose("length of dataset in characters: ", len(text))
print_verbose(text[:1000])

# all unique characters that occur in this text
print_verbose(''.join(chars))
print_verbose(vocab_size)

# tokenizer and reversed tokenizer
print_verbose(encode('Shakespeare!!!'))
print_verbose(decode((encode('Shakespeare!!!'))))

## test code: using gpt2 tokenizer
enc = tiktoken.get_encoding('gpt2')
print_verbose(enc.n_vocab)
print_verbose(enc.encode('Hii there'))

# First 100 "tokens"
print_verbose(data.shape, data.dtype)
print_verbose(data[:100])

# Context vs labels in training data
block_size = 8
train_data[:block_size+1]
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print_verbose(f"when input is {context} the target: {target}")

LoggingLevel.CONCISE
CONCISE
2
<enum 'LoggingLevel'>
<LoggingLevel.CONCISE: 2>
[<LoggingLevel.NO_LOGGING: 1>, <LoggingLevel.CONCISE: 2>, <LoggingLevel.VERBOSE: 3>]
length of dataset in characters:  1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventor