<a href="https://colab.research.google.com/github/zjzhao1002/GPT-from-Scratch/blob/main/GPT_from_Scratch_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Modules

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import sentencepiece as spm

# Hyperparemeters

In [2]:
# Set a random seed
torch.manual_seed(1337)
# How many independent sequences will we process in parallel?
batch_size = 64
# What is the maximum context length for prediction?
block_size = 256
# How many iterations we will be doing in our training loop
max_iters = 5000
# The interval in which we want to calculate the loss. We cannot do that after each step
eval_interval = 500
# The learning rate of the model
learning_rate = 3e-4
# Use GPU to train the model. If GPU is not existed, use the CPU instead.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The amount of iterations we use in our loss function.
eval_iters = 200
# Number of Embedding dimensions
n_embd = 384
# Number of Heads
n_head = 6
# Number of Block Layers
n_layer = 6
# Dropout
dropout = 0.2

# Tokenizer

In [3]:
with open("/content/questions.txt", 'r', encoding='utf-8') as f:
  text = f.read()

In [4]:
spm.SentencePieceTrainer.Train(
    input = "/content/questions.txt",
    model_prefix = "model_bpe",
    vocab_size = 2000,
    model_type = "bpe"
)

In [5]:
sp = spm.SentencePieceProcessor()
sp.Load("model_bpe.model")

True

In [6]:
vocab_size = sp.GetPieceSize()

In [7]:
encoded_text = sp.EncodeAsIds(text)
data = torch.tensor(encoded_text, dtype=torch.long)

# Split train and test set

In [8]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# Data Loader

In [9]:
def get_batch(split):
    '''
    A function that returns a data batch for training for any given split (train or validation).
    '''
    data = train_data if split == 'train' else val_data
    # Here we create 4 starting indexes (equal to batch_size) of the 4 data batches we want to sample
    index_x = torch.randint(len(data) - block_size, (batch_size,))
    # Get the context data
    x = torch.stack([data[i : i+block_size] for i in index_x])
    # Get our targets
    y = torch.stack([data[i+1 : i+block_size+1] for i in index_x])
    # Move data to device
    x, y = x.to(device), y.to(device)
    return x,y

# Loss Estimater

In [10]:
# This tells torch that it doesn't need to store the intermediate values as we will be doing no backpropagation.
# Saves a lot of memory.
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# Self Attention Head

In [11]:
class Head(nn.Module):
    '''A class that represents a single SA head'''

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v=self.value(x)
        out = wei @ v
        return out

# Multi-Head Attention

In [12]:
class MultiHeadAttention(nn.Module):
  '''multiple heads of self-attention in parallel'''

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(head_size*num_heads, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out

# Feed Forward

In [13]:
class FeedForward(nn.Module):
  '''a linear layer followed by a non-linearity'''
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd),
        nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)

# Layer Blocks

In [14]:
class Block(nn.Module):
  '''A Transformer block: communication followed by computation'''
  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

# Final Model

In [15]:
class GPTLanguageModel(nn.Module):

    def __init__(self):
        print("Created the GPTLanguageModel")
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size) # Language model head

    # This is not a feed forward layer, but gets us the next logits we need for the generate method
    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)

        logits = self.lm_head(x)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    # A method that will generate the next token in our timeline. So like:
    # "A ca" -> "A cat" -> "A cat " -> ...
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            # Get the current predictions. This "self" call will automatically call
            # our "forward" method above
            logits, loss = self(idx_cond)
            # focus only on the last time step (token)
            logits = logits[:, - 1, :]
            # Do a softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            # Sample from the prob distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # Append the next probable index (=token) to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [16]:
model = GPTLanguageModel()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

Created the GPTLanguageModel
12.276944 M parameters


# Model Training

In [17]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
  if iter % eval_interval == 0 or iter == max_iters - 1:
    losses = estimate_loss(model)
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  xb, yb = get_batch('train')

  logits, loss = model.forward(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

step 0: train loss 7.7675, val loss 7.7561
step 500: train loss 0.0879, val loss 9.1053
step 1000: train loss 0.0234, val loss 10.1862
step 1500: train loss 0.0177, val loss 10.6859
step 2000: train loss 0.0160, val loss 11.0797
step 2500: train loss 0.0154, val loss 11.3037
step 3000: train loss 0.0150, val loss 11.4353
step 3500: train loss 0.0148, val loss 11.6594
step 4000: train loss 0.0150, val loss 11.8339
step 4500: train loss 0.0147, val loss 11.8958
step 4999: train loss 0.0146, val loss 12.0190


In [18]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(sp.DecodeIds(m.generate(context, max_new_tokens=1000)[0].tolist()))

 ⁇ lazy learner ⁇  or  ⁇ memory based ⁇  model too. KNN relies on the principle that similar data points tend to belong to the same class or have similar target values. This means that, In the training phase, KNN stores the entire dataset consisting of feature vectors and their corresponding class labels (for classification) or target values (for regression). It then calculates the distances between that point and all the points in the training dataset. (commonly used distance metrics are Euclidean distance and Manhattan distance). (Note : Choosing an appropriate value for k is crucial. A small k may result in noisy predictions, while a large k can smooth out the decision boundaries. The choice of distance metric and feature scaling also impact KNN’s performance.) Question: What is the Naïve Bayes algorithm, what are the different assumptions of Naïve Bayes? Answer: The Naïve Bayes algorithm is a probabilistic classification algorithm based on Bayes theorem with a "naïve" assumption of

# Closure

Obviously, there is overfit. The model is trained by a small dataset. The validation dataset is small, and it may contain different topics to the train dataset.

Finally, we can see the model can generate some text. This text makes sense. So we can say that this model works.