GPT-2 implementation in Jupyter Notebook
based on https://jaykmody.com/blog/gpt-from-scratch

First, we need an encoder, hyperparameters, and pretrained weights for our GPT-2 implementation:

In [1]:
from utils import load_encoder_hparams_and_params
# Load the three objects the rest of the notebook relies on: `encoder` (used below
# for encode/decode), `hparams` (read below for "n_ctx" and "n_head") and `params`
# (unpacked as keyword arguments into gpt2).
# NOTE(review): "124M" and "models" are presumably the model size and the local
# weights directory - confirm against utils.load_encoder_hparams_and_params.
encoder, hparams, params = load_encoder_hparams_and_params("124M", "models")

In [2]:
import numpy as np

# GELU activation function, tanh approximation (see https://arxiv.org/pdf/1606.08415.pdf for more information)
def gelu(x):
    """Apply the tanh approximation of the GELU activation element-wise.

    Args:
        x: scalar or NumPy array of pre-activations.

    Returns:
        Value(s) of the same shape as ``x``.
    """
    # argument of the tanh in the approximation formula
    inner = np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)
    return 0.5 * x * (1 + np.tanh(inner))

# Numerically stable version of SoftMax function (copied from https://jaykmody.com/blog/stable-softmax/)
def softmax(x):
    """Softmax over the last axis, computed in a numerically stable way.

    Args:
        x: array-like of scores; normalization runs along ``axis=-1``.

    Returns:
        Array of the same shape whose entries along the last axis sum to 1.
    """
    # subtracting the row maximum keeps np.exp from overflowing on large scores
    shifted = x - np.max(x, axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / np.sum(weights, axis=-1, keepdims=True)

# Layer normalization: normalize over the last axis, then apply the scale (g) and offset (b) taken from the model params
def layer_norm(x, g, b, eps: float = 1e-5):
    """Normalize ``x`` over its last axis, then scale by ``g`` and shift by ``b``.

    Args:
        x: input array; statistics are taken along ``axis=-1``.
        g: scale (gamma), broadcast against the normalized values.
        b: offset (beta), broadcast against the scaled values.
        eps: small constant added to the variance before the square root.

    Returns:
        Array with the same shape as ``x``.
    """
    mu = np.mean(x, axis=-1, keepdims=True)
    sigma_sq = np.var(x, axis=-1, keepdims=True)
    # zero mean / unit variance along the last axis (eps guards against a zero variance)
    normalized = (x - mu) / np.sqrt(sigma_sq + eps)
    return g * normalized + b

# Linear Projection Function: standard matrix multiplication + bias
def linear(x, w, b):  # [m, in], [in, out], [out] -> [m, out]
    """Affine projection: matrix-multiply ``x`` by ``w`` and add the bias ``b``.

    Args:
        x: input of shape [m, in].
        w: weight matrix of shape [in, out].
        b: bias of shape [out], broadcast over the rows.

    Returns:
        Array of shape [m, out].
    """
    projected = x @ w
    return projected + b

# Feed Forward Network - a 2-layer perceptron: project from n_embd up to 4*n_embd, apply GELU, then project back down to n_embd
def ffn(x, c_fc, c_proj):  # [n_seq, n_embd] -> [n_seq, n_embd]
    """Position-wise feed-forward network: linear up, GELU, linear down.

    Args:
        x: input of shape [n_seq, n_embd].
        c_fc: keyword arguments for ``linear`` of the up-projection.
        c_proj: keyword arguments for ``linear`` of the down-projection.

    Returns:
        Array of shape [n_seq, n_embd].
    """
    # up-projection: [n_seq, n_embd] -> [n_seq, 4*n_embd]
    hidden = linear(x, **c_fc)
    activated = gelu(hidden)

    # down-projection: [n_seq, 4*n_embd] -> [n_seq, n_embd]
    return linear(activated, **c_proj)

# Mask generation function - to prevent the queries in our input from looking into the future, we need a matrix that can be added to the attention scores to hide future tokens
def get_mask(n_seq):
    """Build the additive causal mask for an ``n_seq`` x ``n_seq`` attention matrix.

    Args:
        n_seq: sequence length.

    Returns:
        Array of shape [n_seq, n_seq] holding 0 on and below the diagonal and
        -1e10 above it (the positions of future tokens).
    """
    # np.tri is 1 on/below the diagonal, so its complement marks the future positions
    future_positions = 1 - np.tri(n_seq)
    return future_positions * -1e10

# Attention function (from https://jaykmody.com/blog/attention-intuition/)
def attention(q, k, v, mask):  # [n_q, d_k], [n_k, d_k], [n_k, d_v], [n_q, n_k] -> [n_q, d_v]
    """Scaled dot-product attention with an additive mask.

    Args:
        q: queries, shape (n_q, d_k).
        k: keys, shape (n_k, d_k).
        v: values, shape (n_k, d_v).
        mask: additive mask, shape (n_q, n_k), added to the scores before softmax.

    Returns:
        Array of shape (n_q, d_v).
    """
    d_k = q.shape[-1]
    # query/key similarity, scaled by sqrt(d_k)
    scores = q @ k.T / np.sqrt(d_k)
    weights = softmax(scores + mask)
    return weights @ v


# Multi-Head Self Attention implementation - we project the input into queries, keys, and values, split them into n_head heads, run masked (causal) self-attention separately on each head, merge the head outputs, and apply the output projection (with bias)
def mha(x, c_attn, c_proj, n_head):  # [n_seq, n_embd] -> [n_seq, n_embd]
    """Multi-head causal self-attention.

    Args:
        x: input of shape [n_seq, n_embd].
        c_attn: keyword arguments for ``linear`` of the joint q/k/v projection.
        c_proj: keyword arguments for ``linear`` of the output projection.
        n_head: number of heads the q/k/v matrices are split into.

    Returns:
        Array of shape [n_seq, n_embd].
    """
    # joint query/key/value projection: [n_seq, n_embd] -> [n_seq, 3*n_embd]
    projected = linear(x, **c_attn)

    # three equal slices along the feature axis, each [n_seq, n_embd]
    q_all, k_all, v_all = np.split(projected, 3, axis=-1)

    # per-head slices, each [n_seq, n_embd/n_head]
    q_heads = np.split(q_all, n_head, axis=-1)
    k_heads = np.split(k_all, n_head, axis=-1)
    v_heads = np.split(v_all, n_head, axis=-1)

    # one causal mask, shared by all heads: [n_seq, n_seq]
    causal_mask = get_mask(projected.shape[0])

    # causal self-attention, one head at a time
    head_outputs = []
    for q, k, v in zip(q_heads, k_heads, v_heads):
        head_outputs.append(attention(q, k, v, causal_mask))

    # concatenate the heads along the feature axis: -> [n_seq, n_embd]
    merged = np.hstack(head_outputs)

    # output projection + bias: [n_seq, n_embd] -> [n_seq, n_embd]
    return linear(merged, **c_proj)

# Transformer decoder block, which consists of two sublayers (self-attention and feed-forward), each with a residual connection
def transformer_block(x, mlp, attn, ln_1, ln_2, n_head):  # [n_seq, n_embd] -> [n_seq, n_embd]
    """One decoder block: layer-normed attention and feed-forward, each added back to the input.

    Args:
        x: input of shape [n_seq, n_embd].
        mlp: keyword arguments for ``ffn``.
        attn: keyword arguments for ``mha`` (besides ``n_head``).
        ln_1: keyword arguments for the ``layer_norm`` applied before attention.
        ln_2: keyword arguments for the ``layer_norm`` applied before the feed-forward network.
        n_head: number of attention heads.

    Returns:
        Array of shape [n_seq, n_embd].
    """
    # sublayer 1: multi-head causal self-attention on the normalized input
    attn_out = mha(layer_norm(x, **ln_1), **attn, n_head=n_head)
    x = x + attn_out

    # sublayer 2: position-wise feed-forward network on the normalized result
    ffn_out = ffn(layer_norm(x, **ln_2), **mlp)
    return x + ffn_out

# The GPT-2 architecture put together
def gpt2(inputs, wte, wpe, blocks, ln_f, n_head):  # [n_seq] -> [n_seq, n_vocab]
    """Full forward pass: embeddings, transformer blocks, final norm, vocab projection.

    Args:
        inputs: sequence of token ids, length n_seq.
        wte: token embedding matrix, indexed by token id.
        wpe: positional embedding matrix, indexed by position.
        blocks: iterable of keyword-argument dicts, one per ``transformer_block``.
        ln_f: keyword arguments for the final ``layer_norm``.
        n_head: number of attention heads.

    Returns:
        Logits of shape [n_seq, n_vocab].
    """
    # row i = embedding of the i-th token + embedding of position i
    positions = range(len(inputs))
    hidden = wte[inputs] + wpe[positions]  # [n_seq] -> [n_seq, n_embd]

    # run the stack of transformer blocks in order
    for layer in blocks:
        hidden = transformer_block(hidden, **layer, n_head=n_head)

    # LM head: final layer norm, then projection to the vocabulary using the
    # transposed token-embedding matrix
    normed = layer_norm(hidden, **ln_f)  # [n_seq, n_embd] -> [n_seq, n_embd]
    return normed @ wte.T  # [n_seq, n_embd] -> [n_seq, n_vocab]

# Function for generation of output based on prompt
def generate(inputs, params, n_head, n_tokens_to_generate):
    """Greedily generate ``n_tokens_to_generate`` token ids that continue ``inputs``.

    The caller's ``inputs`` sequence is left untouched: decoding runs on a
    private copy, so calling this function repeatedly with the same prompt ids
    always starts from the same prompt.

    Args:
        inputs: sequence of prompt token ids.
        params: model parameters, unpacked as keyword arguments into ``gpt2``.
        n_head: number of attention heads, forwarded to ``gpt2``.
        n_tokens_to_generate: number of decoding steps to run.

    Returns:
        List of the newly generated token ids only (Python ints), without the prompt.
    """
    from tqdm import tqdm

    # copy so the appends below do not grow the list owned by the caller
    token_ids = list(inputs)

    for _ in tqdm(range(n_tokens_to_generate), "generating"):  # auto-regressive decode loop
        logits = gpt2(token_ids, **params, n_head=n_head)  # model forward pass
        next_id = np.argmax(logits[-1])  # greedy: most likely token after the last position
        token_ids.append(int(next_id))  # feed the prediction back in on the next step

    # everything past the original prompt length was generated here
    return token_ids[len(inputs):]



In [5]:
# End-to-end demo: encode a prompt, run greedy generation, decode the result.
# The names defined here (prompt, output_ids, output_text) are read by later cells.

# the prompt the model will continue
prompt = 'For millennia, the main use of ferrets was for hunting, or \"ferreting\". With their long, lean build, and inquisitive nature, ferrets are very well equipped for getting down holes and chasing rodents, rabbits and moles out of their burrows.'

# number of tokens we want GPT-2 to generate after the prompt
n_tokens_to_generate = 25

# encode the input string using the encoder (BPE tokenizer)
input_ids = encoder.encode(prompt)

# make sure prompt + generated tokens stay strictly below the model's
# maximum sequence length (hparams["n_ctx"])
assert len(input_ids) + n_tokens_to_generate < hparams["n_ctx"]

# generate output ids (only the newly generated ids are returned, not the prompt)
output_ids = generate(input_ids, params, hparams["n_head"], n_tokens_to_generate)

# decode the ids back into a string
output_text = encoder.decode(output_ids)

generating: 100%|██████████| 25/25 [00:19<00:00,  1.26it/s]

prompt: For millennia, the main use of ferrets was for hunting, or "ferreting". With their long, lean build, and inquisitive nature, ferrets are very well equipped for getting down holes and chasing rodents, rabbits and moles out of their burrows.
GPT-2: 

Ferrets are also very good at hunting small mammals, such as rabbits, and are very good at hunting large mammals





In [9]:
# show the prompt and the model's continuation, each under its own header;
# a single print with sep='\n' emits the same text as two consecutive prints
print('\nprompt:\n' + prompt, '\nGPT-2:\n' + output_text, sep='\n')


prompt:
For millennia, the main use of ferrets was for hunting, or "ferreting". With their long, lean build, and inquisitive nature, ferrets are very well equipped for getting down holes and chasing rodents, rabbits and moles out of their burrows.

GPT-2:


Ferrets are also very good at hunting small mammals, such as rabbits, and are very good at hunting large mammals


In [6]:
# bare expression: the notebook displays the repr of the generated string,
# so newline characters show up as \n escapes instead of blank lines
output_text

'\n\nFerrets are also very good at hunting small mammals, such as rabbits, and are very good at hunting large mammals'