In [None]:
import torch as t
import torch.nn as nn
import math
import einops
import gpt_tests
from gpt_sol import UniAttention as UniAttentionSolution
from _gpt_sol import _UnidirectionalAttention, _GPT2Block, _GPT2

from dataclasses import dataclass
from torchtyping import TensorType

import sys
sys.path.append('../w2d1')
from bert_sol import Bert

Checking encoding:
Congrats! You've passed the test!
Checking new key and value:
Congrats! You've passed the test!
Congrats! You've passed the test!
Checking logits:
Congrats! You've passed the test!
Checking final encodings:
Congrats! You've passed the test!
Congrats! Your GPT returns the same results with and without cache.
It took 1.764s to generate a 500-token sentence without cache and 0.559s with cache.


In [None]:
def upper_right_mask(x: t.Tensor):
    """Mask out everything strictly above the main diagonal of ``x``.

    Entries on or below the diagonal of the last two dimensions are kept
    unchanged; entries above it are replaced by ``-1e4`` so that a following
    softmax gives them (numerically) zero weight.

    Args:
        x: tensor with at least two dimensions; the mask is applied to the
            last two and broadcast over any leading ones.

    Returns:
        A new tensor with the same shape as ``x``.
    """
    # ones_like keeps the mask on x's device and in x's dtype. The previous
    # t.ones(x.shape) always allocated a CPU float32 tensor, which fails for
    # inputs living on another device.
    above_diagonal = t.triu(t.ones_like(x), diagonal=1)
    return t.tril(x, diagonal=0) + above_diagonal * -1e4

class UniMultiHeadAttention(nn.Module):
    """Unidirectional (causal) multi-head self-attention.

    Each position attends only to itself and to earlier positions. Queries,
    keys and values come from one fused linear projection (``attn_ll``) and
    the concatenated head outputs go through ``output_ll``.

    Args:
        hidden_size: embedding size of the input and output.
        num_heads: number of attention heads; must divide ``hidden_size``.

    Raises:
        ValueError: if ``hidden_size`` is not divisible by ``num_heads``.
    """

    def __init__(self, hidden_size: int, num_heads: int):
        super().__init__()
        if hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
            )
        # Layer creation order is kept (attn_ll, then output_ll) so seeded
        # initialisation stays comparable with reference implementations.
        self.attn_ll = nn.Linear(hidden_size, hidden_size * 3)
        self.output_ll = nn.Linear(hidden_size, hidden_size)
        self.head_size = hidden_size // num_heads
        self.hidden_size = hidden_size  # embedding size
        self.num_heads = num_heads

    def forward(self, x: t.Tensor):
        """Apply causal self-attention.

        Args:
            x: input of shape ``[batch, seq_len, hidden_size]``.

        Returns:
            Tensor of shape ``[batch, seq_len, hidden_size]``.
        """
        batch, seq_len, _ = x.shape
        qkv = self.attn_ll(x)  # [batch, seq_len, 3 * hidden_size]

        # The last dim is laid out as (q | k | v), each split into
        # (num_heads, head_size). Move to [3, batch, heads, seq, head_size].
        qkv = qkv.reshape(batch, seq_len, 3, self.num_heads, self.head_size)
        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)

        # Scores are query . key (the previous code contracted key with value).
        scores = t.einsum('bnqh,bnkh->bnqk', q, k) / math.sqrt(self.head_size)

        # Hide future positions (key index > query index). The mask is built
        # on the same device as the scores.
        future = t.ones(seq_len, seq_len, dtype=t.bool, device=scores.device).triu(diagonal=1)
        scores = scores.masked_fill(future, -1e4)

        attn = scores.softmax(dim=-1)  # [batch, heads, seq_len(query), seq_len(key)]

        z = t.einsum('bnqk,bnkh->bnqh', attn, v)  # [batch, heads, seq_len, head_size]
        z = z.permute(0, 2, 1, 3).reshape(batch, seq_len, self.hidden_size)

        # Final output projection (Z @ W_O).
        return self.output_ll(z)

In [None]:
# Tiny random input for a manual smoke test of the attention module.
batch, seq_len, hidden_size = 1, 4, 100
x = t.randn((batch, seq_len, hidden_size))

# 5 heads -> head_size of 100 // 5 = 20.
module = UniMultiHeadAttention(hidden_size, 5)
output = module(x)

# The helper returns two values that the following cells compare against each
# other (presumably the reference output and ours - confirm in gpt_tests).
out, out2 = gpt_tests.test_unidirectional_attn(UniMultiHeadAttention)
#output

In [None]:
# Tolerance-based comparison of the two tensors returned by the attention test helper.
t.allclose(out, out2, atol=1e-4, rtol=1e-4)

False

In [None]:
# Exact comparison (same shape and identical elements) of the same two tensors.
out.equal(out2)

False

In [None]:
# GPT Block

# The GPT-2 order is: LayerNorm -> Attention -> Residual -> LayerNorm 
# -> Linear[hidden_size, hidden_size * 4] -> GELU -> Linear[hidden_size * 4, hidden_size] -> 
# Dropout -> Residual.

import pdb

class GPTBlock(nn.Module):
    """One GPT-2 transformer block.

    Data flow: LayerNorm -> attention -> residual add -> LayerNorm ->
    MLP (Linear x4 -> GELU -> Linear) -> Dropout -> residual add.

    Args:
        hidden_size: embedding size of the input and output.
        num_heads: number of attention heads.
        dropout: dropout probability applied to the MLP output.
        layer_norm_epsilon: epsilon used by both LayerNorm layers.
        uni_attn_block: attention class, constructed as
            ``uni_attn_block(hidden_size, num_heads)``.
    """

    def __init__(self, hidden_size: int, num_heads: int, dropout: float, layer_norm_epsilon: float, uni_attn_block=UniAttentionSolution):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_size = hidden_size // num_heads
        self.layer_n_eps = layer_norm_epsilon

        # Sub-modules are created in the same order as before so that seeded
        # initialisation and state_dict ordering are unchanged.
        self.layer_norm_1 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
        self.attention = uni_attn_block(hidden_size, num_heads)
        self.layer_norm_2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)

        mlp_width = hidden_size * 4
        mlp_layers = [
            nn.Linear(hidden_size, mlp_width),
            nn.GELU(),
            nn.Linear(mlp_width, hidden_size),
        ]
        self.MLP = nn.Sequential(*mlp_layers)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x: t.Tensor):  # [batch, seq_len, hidden_size]
        """Run the block on ``x`` and return a tensor of the same shape."""
        # Attention sub-layer with its residual connection.
        after_attention = x + self.attention(self.layer_norm_1(x))
        # MLP sub-layer; dropout is applied before the second residual add.
        mlp_out = self.MLP(self.layer_norm_2(after_attention))
        return after_attention + self.dropout(mlp_out)

# Hyperparameters for a manual GPTBlock run; the run itself is commented out below.
num_heads = 5 
dropout = 0.05
# Parameters all screwed
#gpt_block = GPTBlock(hidden_size, num_heads, dropout, layer_norm_epsilon = 1e-5, uni_attn_block=UniAttentionSolution)
#norm_x_attn = gpt_block.forward(x)


In [None]:
def test_gpt_block(GPT2Block):
    """Run the reference block and ``GPT2Block`` on the same random input.

    Both blocks are built right after seeding the RNG with the same value, so
    they start from identical weights if they create their parameters in the
    same order.

    Args:
        GPT2Block: block class accepting ``hidden_size``,
            ``layer_norm_epsilon``, ``dropout`` and ``num_heads`` keywords.

    Returns:
        Tuple ``(reference_output, candidate_output)``.
    """
    block_kwargs = {
        "hidden_size": 48,
        "layer_norm_epsilon": 1e-4,
        "dropout": 0.0,
        "num_heads": 4,
    }
    sample = t.randn(1, 5, 48)

    def seeded_forward(block_cls):
        # Re-seed before construction so weight initialisation is reproducible.
        t.manual_seed(710)
        return block_cls(**block_kwargs)(sample)

    reference_out = seeded_forward(_GPT2Block)
    candidate_out = seeded_forward(GPT2Block)
    return reference_out, candidate_out

In [None]:
# Run the gpt_tests check against our GPTBlock class.
gpt_tests.test_gpt_block(GPTBlock)


Congrats! You've passed the test!


GPT-2
GPT-2 has the following architecture:
Embedding -> Dropout -> N times GPT-2 Block -> LayerNorm -> Unembedding

The embedding layer takes into account only input_ids and position (i.e. no token_type_id).
We will not have an explicit unembedding layer. Instead, we’ll use the same matrix as in the embedding layer.

https://realpython.com/python-data-classes/

In [None]:
@dataclass
class GPT2Output:
    """Pair of tensors returned by the GPT-2 model's forward pass.

    Attributes:
        logits: annotated as ``(batch_size, vocab_size)``.
        final_encoding: annotated as ``(batch_size, hidden_size)``.
    """
    logits: TensorType["batch_size", "vocab_size"]
    final_encoding: TensorType["batch_size", "hidden_size"]

In [None]:
def mem(n=3):
    """Return the currently allocated CUDA memory in units of 2**(10*n) bytes.

    ``n=1`` gives KiB, ``n=2`` MiB and the default ``n=3`` GiB.
    """
    allocated_bytes = t.cuda.memory_allocated()
    return allocated_bytes / 2**(10*n)

mem(1)

0.0

In [None]:
def mem(n=3):
    """Express a fixed 5 GiB (5 * 2**30 bytes) in units of 2**(10*n) bytes."""
    five_gib_in_bytes = 5 * 2**30
    unit = 2 ** (10 * n)
    return five_gib_in_bytes / unit

mem()

5.0

In [None]:
class GPT2(nn.Module):
    """GPT-2 style language model.

    Architecture: token + position embedding -> Dropout -> ``num_layers`` x
    GPTBlock -> LayerNorm -> unembedding. There is no separate unembedding
    layer: logits are computed with the transposed token-embedding matrix.

    Args:
        num_layers: number of GPTBlock layers.
        num_heads: attention heads per block.
        vocab_size: number of rows in the token-embedding table.
        hidden_size: embedding size.
        max_position_embeddings: number of rows in the position-embedding
            table (longest supported sequence).
        dropout: dropout probability used after the embeddings and in blocks.
        layer_norm_epsilon: epsilon for all LayerNorm layers.
    """

    def __init__(self, num_layers, num_heads, vocab_size, hidden_size, max_position_embeddings, dropout, layer_norm_epsilon):
        super().__init__()
        self.dropout = dropout
        self.num_heads = num_heads
        self.norm_eps = layer_norm_epsilon
        self.num_blocks = num_layers
        self.vocab_size = vocab_size
        self.embedding_size = hidden_size
        self.max_position_embeddings = max_position_embeddings

        self.token_embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        self.positional_embedding = nn.Embedding(self.max_position_embeddings, self.embedding_size)

        self.dropout_layer = nn.Dropout(self.dropout)
        self.GPT_blocks = nn.Sequential(
            *[GPTBlock(self.embedding_size, self.num_heads, self.dropout, self.norm_eps) for _ in range(num_layers)]
        )
        self.layer_norm = nn.LayerNorm((self.embedding_size,), eps=self.norm_eps)

    def forward(self, input_ids):  # [batch, seq_len]
        """Run the model and return last-position logits and encodings.

        Args:
            input_ids: integer tensor of shape ``[batch, seq_len]``.

        Returns:
            GPT2Output whose ``logits`` and ``final_encoding`` are taken at
            the last sequence position only.

        Side effects:
            Stores the output of the last block (before the final LayerNorm)
            in ``self._enc`` for inspection.
        """
        # Unpacking also enforces that input_ids is exactly 2-D.
        _, seq_len = input_ids.shape

        # Position ids are created directly on the input's device and
        # broadcast over the batch, instead of being built on the CPU,
        # repeated per batch element and then copied across.
        positions = t.arange(seq_len, device=input_ids.device)
        embedded = self.token_embedding(input_ids) + self.positional_embedding(positions)

        dropped_embedded = self.dropout_layer(embedded)
        pre_norm = self.GPT_blocks(dropped_embedded)
        pre_unembedding = self.layer_norm(pre_norm)

        self._enc = pre_norm

        # Weight tying: reuse the token-embedding matrix as the unembedding.
        unembedded = pre_unembedding @ self.token_embedding.weight.T
        return GPT2Output(unembedded[:, -1, :], pre_unembedding[:, -1, :])

Conceptually, an `nn.Embedding` layer is equivalent to two steps (in practice it is implemented as a direct table lookup):
    - One-hot encoding
    - A bias-free `nn.Linear`
    
One-hot encoding needs to know the number of distinct values the input tensor can contain (i.e. the largest possible index plus one)
    - Given an input of shape (batch_size, seq_len) -> output of shape (batch_size, seq_len, max_value)
    
`nn.Linear` does the rest
    - (batch_size, seq_len, max_value) -> (batch_size, seq_len, embedding_size)
    - The weight matrix for this layer is what Illustrated GPT2 calls (wte) or (wpe)

In [None]:
# Small hyperparameter set for quickly instantiating our GPT2.
config = {
    "num_layers": 2,
    "num_heads": 4,
    "vocab_size": 100,
    "hidden_size": 64,
    "max_position_embeddings": 32,
    "dropout": 0.0,
    "layer_norm_epsilon": 1e-4,
}

gpt_ours = GPT2(**config)

In [None]:
# Run the gpt_tests check against our GPT2 class (the recorded output reports on logits and final encodings).
gpt_tests.test_gpt(GPT2)

Checking logits:
Congrats! You've passed the test!
Checking final encodings:
Congrats! You've passed the test!


In [None]:
# Full-size model with the dimensions used for the pretrained comparison.
my_gpt = GPT2(
    num_layers=12,
    num_heads=12,
    vocab_size=50257,
    hidden_size=768,
    max_position_embeddings=1024,
    dropout=0.1,
    layer_norm_epsilon=1e-5,
)

# Pretrained model and its tokenizer, as provided by the test module.
pretrained_gpt, common_tokenizer = gpt_tests.get_pretrained_gpt_and_tokenizer()

In [None]:
# for _ in enumerate(tuple(zip(pretrained_gpt.state_dict(),my_gpt.state_dict()))):
#     print(_)

In [None]:
# Successively longer prefixes of one sentence: each entry adds one more word
# to the previous one.
generated_texts = [
    "My life motto:" + "".join([" Fortune", " favors", " the", " bold"][:n_words])
    for n_words in range(5)
]

In [None]:
#common_tokenizer = pretrained_gpt.tokenizer

In [None]:
#common_tokenizer(text)

In [None]:

def ascii_art_gpt(model, text, k=5, tokenizer=common_tokenizer):
    """Return the ``k`` most likely next tokens for ``text`` according to ``model``.

    Args:
        model: callable taking a ``[1, seq_len]`` tensor of token ids and
            returning an object with a ``logits`` attribute. The code indexes
            ``logits[0, -1]``, so it assumes logits of shape
            ``[batch, seq_len, vocab]`` - TODO confirm for each model passed.
        text: prompt string.
        k: number of candidate tokens to return.
        tokenizer: tokenizer used both to encode ``text`` and to map ids back
            to tokens; defaults to the notebook-level ``common_tokenizer``.

    Returns:
        List of ``k`` token strings, most likely first, with the 'Ġ' marker
        removed.
    """
    # Use the tokenizer that was passed in (it used to be ignored in favour
    # of the global one).
    encoded = tokenizer(text)
    input_ids = t.tensor(encoded['input_ids']).unsqueeze(0)

    # Inference only: no autograd graph is needed.
    with t.no_grad():
        output = model(input_ids)

    # flatten() instead of squeeze(): for k=1 squeeze() produced a 0-d tensor,
    # so tolist() returned a bare int and the result became a list of
    # characters rather than a one-element list of tokens.
    top_k_ids = t.topk(output.logits[0, -1], k).indices.flatten().tolist()

    tokens = tokenizer.convert_ids_to_tokens(top_k_ids)
    return [token.replace('Ġ', '') for token in tokens]

ascii_art_gpt(pretrained_gpt, 'My life motto: Fortune favors the')

['rich', 'wealthy', 'good', 'underdog', 'few']

https://github.com/huggingface/transformers/issues/9362

https://discuss.huggingface.co/t/tokenizer-taking-lot-of-memory/8597

In [None]:
# BERT instance sized like the GPT-2 model above, for a side-by-side comparison.
our_bert = Bert(
    num_layers=12,
    num_heads=12,
    vocab_size=50257,
    hidden_size=768,
    max_position_embeddings=1024,
    dropout=0.1,
    type_vocab_size = 2,
    intermediate_size = 3072,
)

In [None]:
input_ids = common_tokenizer(generated_texts)['input_ids']
ten_zeros = [0] * 10

# Right-pad every id list in place with zeros up to length 10. The slice
# ten_zeros[len(el):] holds exactly the missing number of zeros, and is empty
# when the list already has 10 or more entries.
for el in input_ids:
    el.extend(ten_zeros[len(el):])

# Build the int32 tensor directly. t.Tensor(...) would first create a float32
# tensor, which cannot represent every integer id exactly once ids get large.
final_input_ids = t.tensor(input_ids, dtype=t.int32)

In [None]:
# Switch to eval mode and run the padded batch through the model.
our_bert.eval()(final_input_ids)
# Show index 3 of the last dimension of the `_enc` tensor stored on the model
# (presumably the block-stack output saved during forward - confirm in bert_sol).
our_bert._enc[:,:,3]

tensor([[-0.3063, -1.0202, -1.2014, -0.5042, -0.5798, -1.3563, -1.3400, -0.2594,
         -0.1752, -0.7772],
        [-0.2854, -1.0073, -1.1958, -0.5106, -0.8399, -1.3810, -1.3516, -0.2324,
         -0.1648, -0.7812],
        [-0.1917, -0.9239, -1.1180, -0.4416, -0.7737, -1.4737, -1.3105, -0.1507,
         -0.0885, -0.6993],
        [-0.0911, -0.8879, -1.0305, -0.3618, -0.6880, -1.4176, -1.7094, -0.0575,
          0.0053, -0.6132],
        [-0.1158, -0.9190, -1.0935, -0.4143, -0.7246, -1.4794, -1.7706, -0.2269,
         -0.0032, -0.6541]], grad_fn=<SelectBackward0>)

In [None]:
# Switch to eval mode and run the same padded batch through our GPT-2.
my_gpt.eval()(final_input_ids)
# Show index 3 of the last dimension of `_enc`, the tensor the forward pass stores on the model.
my_gpt._enc[:,:,3]

tensor([[-1.1982e+00,  7.1118e-01, -1.3565e+00, -1.3015e+00, -2.1480e-01,
          5.3286e-01, -3.6640e-01, -9.0771e-01, -9.0394e-01,  2.4541e-01],
        [-1.1982e+00,  7.1118e-01, -1.3565e+00, -1.3015e+00,  1.3649e+00,
          8.2624e-01, -1.4388e-01, -7.0532e-01, -6.9136e-01,  3.9569e-01],
        [-1.1982e+00,  7.1118e-01, -1.3565e+00, -1.3015e+00,  1.3649e+00,
          1.6376e-04, -9.0492e-02, -6.4807e-01, -5.9624e-01,  4.0160e-01],
        [-1.1982e+00,  7.1118e-01, -1.3565e+00, -1.3015e+00,  1.3649e+00,
          1.6376e-04,  1.3555e+00, -4.7671e-01, -5.1044e-01,  4.3125e-01],
        [-1.1982e+00,  7.1118e-01, -1.3565e+00, -1.3015e+00,  1.3649e+00,
          1.6376e-04,  1.3555e+00,  1.4337e+00, -3.0476e-01,  5.8670e-01]],
       grad_fn=<SelectBackward0>)

In [None]:
# Now in the next iteration, when the model processes the word "robot", it does not need to
# generate the query, key, and value vectors for the token "a" again. It just reuses the ones it saved from the first iteration:

In [None]:
# class Bert(nn.Module):
#     def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size,
#                  dropout, intermediate_size, num_heads, num_layers):
#         super().__init__()
#         self.embed = BertEmbedding(vocab_size, hidden_size, max_position_embeddings,
#                                    type_vocab_size, dropout)
#         self.blocks = nn.Sequential(*[
#             BertBlock(hidden_size, intermediate_size, num_heads, dropout)
#             for _ in range(num_layers)
#         ])
#         self.lin = nn.Linear(hidden_size, hidden_size)
#         self.layer_norm = nn.LayerNorm(hidden_size)
#         self.unembed = nn.Linear(hidden_size, vocab_size)
        

#     def forward(self, input_ids):
#         token_type_ids = t.zeros_like(input_ids, dtype=int)
#         emb = self.embed(input_ids, token_type_ids)
#         self._enc = enc = self.blocks(emb)
#         enc = self.lin(enc)
#         return self.unembed(self.layer_norm(F.gelu(enc)))
