In [1]:
from importlib.metadata import version

pkgs = [
    "torch", "transformers"
]

for pkg in pkgs:
    print(f"{pkg}: {version(pkg)}")

torch: 2.4.1
transformers: 4.45.2


In [2]:
import math
import yaml

from pydantic import BaseModel
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
class ModelConfig(BaseModel):
    hf_model_name: str = ""
    vocab_size: int = 50257
    max_seq_len: int = 1024
    embed_dim: int = 768
    num_heads: int = 12
    num_layers: int = 12
    attn_dropout: float = 0.1
    resid_dropout: float = 0.1
    embed_dropout: float = 0.1

# read in the config.yaml file
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

model_config = ModelConfig(**config)
model_config

ModelConfig(hf_model_name='', vocab_size=50257, max_seq_len=1024, embed_dim=768, num_heads=12, num_layers=12, attn_dropout=0.1, resid_dropout=0.1, embed_dropout=0.1)

## Multihead (Causal) Self-Attention

In [4]:
class CausalSelfAttention(nn.Module):
    """
    Causal self-attention layer, masking the future tokens.
    """
    def __init__(self, cfg):
        super().__init__()
        self.num_heads = cfg.num_heads
        self.q_proj = nn.Linear(cfg.embed_dim, cfg.embed_dim)
        self.k_proj = nn.Linear(cfg.embed_dim, cfg.embed_dim)
        self.v_proj = nn.Linear(cfg.embed_dim, cfg.embed_dim)
        self.out_proj = nn.Linear(cfg.embed_dim, cfg.embed_dim)

        self.attn_dropout = nn.Dropout(0.1)
        self.resid_dropout = nn.Dropout(0.1)
        
        # Create a bias tensor to prevent attention to future tokens
        mask = torch.tril(torch.ones(cfg.max_seq_len, cfg.max_seq_len))
        self.register_buffer(
            'mask', (mask == 0).view(1, 1, cfg.max_seq_len, cfg.max_seq_len)
        )
        # mask will be a tensor like the following:
        # tensor([[[[False, True,  True,  ...,  True],
        #           [False, False, True,  ...,  True],
        #           [False, False, False, ...,  True],
        #           ...,
        #           [False, False, False, ..., False]]]])
        # where True values indicate that the token should be masked
        # i.e., replaced with -inf in the attention scores
        
    def forward(self, x):
        # Apply linear transformations to get queries, keys, and values
        # x: [B, T, C]
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)
        # q,k,v: [B, T, C]

        # Split the queries, keys, and values into multiple heads
        B, T, C = q.size()
        q = q.view(B, T, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        k = k.view(B, T, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        v = v.view(B, T, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        # q,k,v: [B, nh, T, C//nh]
        
        # Calculate attention scores
        scores = torch.matmul(q, k.permute(0, 1, 3, 2))
        scores = scores / (math.sqrt(k.size(-1)))
        scores.masked_fill_(self.mask[:, :, :T, :T], -torch.inf)
        # scores: [B, nh, T, T]
        
        # Apply softmax to get attention weights
        attn_weights = F.softmax(scores, dim=-1)
        # attn_weights: [B, nh, T, T]

        attn_weights = self.attn_dropout(attn_weights)
        
        # Multiply attention weights with values
        out = torch.matmul(attn_weights, v)
        # out: [B, nh, T, C//nh]

        # Concatenate the heads and apply a linear transformation
        out = out.permute(0, 2, 1, 3).contiguous().view(B, T, C)
        out = self.out_proj(out)
        # out: [B, T, C]

        out = self.resid_dropout(out)
        
        return out

# testing
cfg = ModelConfig(
    vocab_size=100,
    max_seq_len=10,
    embed_dim=32,
    num_heads=8,
    num_layers=2,
    attn_dropout=0.1,
    resid_dropout=0.1,
    hidden_dropout=0.1
)
x = torch.randn(2, 5, cfg.embed_dim)
mha = CausalSelfAttention(cfg)
print(mha)

out = mha(x)
print("\nOutput:", out.size())  # torch.Size([2, 5, 10])

CausalSelfAttention(
  (q_proj): Linear(in_features=32, out_features=32, bias=True)
  (k_proj): Linear(in_features=32, out_features=32, bias=True)
  (v_proj): Linear(in_features=32, out_features=32, bias=True)
  (out_proj): Linear(in_features=32, out_features=32, bias=True)
  (attn_dropout): Dropout(p=0.1, inplace=False)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)

Output: torch.Size([2, 5, 32])


## Feed-Forward Network (FFN)

In [5]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        embed_dim = cfg.embed_dim
        hidden_dim = cfg.embed_dim * 4
        p_drop = cfg.resid_dropout
        # Two linear layers with activation in between
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)
        self.gelu = nn.GELU(approximate='tanh')
        self.resid_dropout = nn.Dropout(p_drop)

    def forward(self, x):            # [B, T, C]
        x = self.gelu(self.fc1(x))  # [B, T, 2C]
        x = self.fc2(x)              # [B, T, C]
        x = self.resid_dropout(x)

        return x

# testing
ffn = FeedForwardNetwork(cfg)
print(ffn)
x = torch.randn(2, 5, cfg.embed_dim)
out = ffn(x)
print("\nOutput:", out.size())

FeedForwardNetwork(
  (fc1): Linear(in_features=32, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=32, bias=True)
  (gelu): GELU(approximate='tanh')
  (resid_dropout): Dropout(p=0.1, inplace=False)
)

Output: torch.Size([2, 5, 32])


## Transformer Block

In [6]:
class TransformerBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.mha = CausalSelfAttention(config)
        self.ln1 = nn.LayerNorm(config.embed_dim)
        self.ffn = FeedForwardNetwork(config)
        self.ln2 = nn.LayerNorm(config.embed_dim)

        self.resid_dropout = nn.Dropout(config.resid_dropout)

    def forward(self, x):
        # Apply self-attention and add residual connection
        shortcut = x
        x = self.ln1(x)
        x = self.mha(x)[0]
        x = self.resid_dropout(x)
        x = shortcut + x

        # Apply feedforward network and add residual connection
        shortcut = x
        x = self.ln2(x)
        x = self.ffn(x)
        x = self.resid_dropout(x)
        x = shortcut + x

        return x
    
# testing
transformer_block = TransformerBlock(cfg)
print(transformer_block)
x = torch.randn(2, 5, cfg.embed_dim)
out = transformer_block(x)
print("\nInput:", x.size())
print("Output:", out.size())

TransformerBlock(
  (mha): CausalSelfAttention(
    (q_proj): Linear(in_features=32, out_features=32, bias=True)
    (k_proj): Linear(in_features=32, out_features=32, bias=True)
    (v_proj): Linear(in_features=32, out_features=32, bias=True)
    (out_proj): Linear(in_features=32, out_features=32, bias=True)
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  (ffn): FeedForwardNetwork(
    (fc1): Linear(in_features=32, out_features=128, bias=True)
    (fc2): Linear(in_features=128, out_features=32, bias=True)
    (gelu): GELU(approximate='tanh')
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)

Input: torch.Size([2, 5, 32])
Output: torch.Size([2, 5, 32])


## GPT2

In [21]:
class GPT2(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.embed_dim
        vocab_size = config.vocab_size
        context_length = config.max_seq_len
        self.num_layers = config.num_layers

        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(context_length, embed_dim)
        self.embed_dropout = nn.Dropout(config.embed_dropout)

        self.layers = nn.ModuleList([
            TransformerBlock(config) for _ in range(self.num_layers)
        ])

        self.ln_final = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size, bias=False)

        # weight sharing -> saving 40M parameters
        self.head.weight = self.token_emb.weight

    def forward(self, idx):
        # Generate token embeddings
        tok_emb = self.token_emb(idx)
        # Generate position embeddings
        pos = torch.arange(idx.size(1), device=idx.device).unsqueeze(0)
        pos_emb = self.pos_emb(pos)
        x = self.embed_dropout(tok_emb + pos_emb)

        # Apply the transformer blocks
        for layer in self.layers:
            x = layer(x)

        # Apply the final layer norm
        x = self.ln_final(x)

        # Generate logits
        logits = self.head(x)

        return logits
    
# testing
gpt2 = GPT2(cfg)
print(gpt2)
x = torch.randint(0, 100, (2, 5))
out = gpt2(x)
print("\nInput:", x.size())
print("Output:", out.size())

GPT2(
  (token_emb): Embedding(100, 32)
  (pos_emb): Embedding(10, 32)
  (embed_dropout): Dropout(p=0.1, inplace=False)
  (layers): ModuleList(
    (0-1): 2 x TransformerBlock(
      (mha): CausalSelfAttention(
        (q_proj): Linear(in_features=32, out_features=32, bias=True)
        (k_proj): Linear(in_features=32, out_features=32, bias=True)
        (v_proj): Linear(in_features=32, out_features=32, bias=True)
        (out_proj): Linear(in_features=32, out_features=32, bias=True)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (ffn): FeedForwardNetwork(
        (fc1): Linear(in_features=32, out_features=128, bias=True)
        (fc2): Linear(in_features=128, out_features=32, bias=True)
        (gelu): GELU(approximate='tanh')
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln2): LayerNorm((32,), eps=1e-05, elementwise_affi

## Loading HuggingFace 🤗 weights

In [9]:
from transformers import GPT2Model

gpt2_hf = GPT2Model.from_pretrained("gpt2")
print(gpt2_hf)

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2SdpaAttention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)


In [10]:
from pprint import pprint
pprint(list(gpt2_hf.state_dict().keys()))

['wte.weight',
 'wpe.weight',
 'h.0.ln_1.weight',
 'h.0.ln_1.bias',
 'h.0.attn.c_attn.weight',
 'h.0.attn.c_attn.bias',
 'h.0.attn.c_proj.weight',
 'h.0.attn.c_proj.bias',
 'h.0.ln_2.weight',
 'h.0.ln_2.bias',
 'h.0.mlp.c_fc.weight',
 'h.0.mlp.c_fc.bias',
 'h.0.mlp.c_proj.weight',
 'h.0.mlp.c_proj.bias',
 'h.1.ln_1.weight',
 'h.1.ln_1.bias',
 'h.1.attn.c_attn.weight',
 'h.1.attn.c_attn.bias',
 'h.1.attn.c_proj.weight',
 'h.1.attn.c_proj.bias',
 'h.1.ln_2.weight',
 'h.1.ln_2.bias',
 'h.1.mlp.c_fc.weight',
 'h.1.mlp.c_fc.bias',
 'h.1.mlp.c_proj.weight',
 'h.1.mlp.c_proj.bias',
 'h.2.ln_1.weight',
 'h.2.ln_1.bias',
 'h.2.attn.c_attn.weight',
 'h.2.attn.c_attn.bias',
 'h.2.attn.c_proj.weight',
 'h.2.attn.c_proj.bias',
 'h.2.ln_2.weight',
 'h.2.ln_2.bias',
 'h.2.mlp.c_fc.weight',
 'h.2.mlp.c_fc.bias',
 'h.2.mlp.c_proj.weight',
 'h.2.mlp.c_proj.bias',
 'h.3.ln_1.weight',
 'h.3.ln_1.bias',
 'h.3.attn.c_attn.weight',
 'h.3.attn.c_attn.bias',
 'h.3.attn.c_proj.weight',
 'h.3.attn.c_proj.bias',


In [11]:
def load_hf_weights(model, hf_model):
    # Load the weights from the Hugging Face model
    hf_dict = hf_model.state_dict()

    # assign emnbedding weights
    model.token_emb.weight.data.copy_(hf_dict['wte.weight'])
    model.pos_emb.weight.data.copy_(hf_dict['wpe.weight'])

    # assign transformer block weights
    for idx, layer in enumerate(model.layers):
        # MHA weights and biases
        qkv_weight = hf_dict[f'h.{idx}.attn.c_attn.weight']
        qkv_bias = hf_dict[f'h.{idx}.attn.c_attn.bias']
        q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=1)
        q_bias, k_bias, v_bias = qkv_bias.chunk(3)
        layer.mha.q_proj.weight.data.copy_(q_weight.T)
        layer.mha.q_proj.bias.data.copy_(q_bias)
        layer.mha.k_proj.weight.data.copy_(k_weight.T)
        layer.mha.k_proj.bias.data.copy_(k_bias)
        layer.mha.v_proj.weight.data.copy_(v_weight.T)
        layer.mha.v_proj.bias.data.copy_(v_bias)
        # MHA out projection weights and biases
        c_weight = hf_dict[f'h.{idx}.attn.c_proj.weight']
        c_bias = hf_dict[f'h.{idx}.attn.c_proj.bias']
        layer.mha.out_proj.weight.data.copy_(c_weight.T)
        layer.mha.out_proj.bias.data.copy_(c_bias)
        # Layer norm weights and biases
        layer.ln1.weight.data.copy_(hf_dict[f'h.{idx}.ln_1.weight'])
        layer.ln1.bias.data.copy_(hf_dict[f'h.{idx}.ln_1.bias'])
        layer.ln2.weight.data.copy_(hf_dict[f'h.{idx}.ln_2.weight'])
        layer.ln2.bias.data.copy_(hf_dict[f'h.{idx}.ln_2.bias'])
        # FFN weights and biases
        layer.ffn.fc1.weight.data.copy_(hf_dict[f'h.{idx}.mlp.c_fc.weight'].T)
        layer.ffn.fc1.bias.data.copy_(hf_dict[f'h.{idx}.mlp.c_fc.bias'])
        layer.ffn.fc2.weight.data.copy_(hf_dict[f'h.{idx}.mlp.c_proj.weight'].T)
        layer.ffn.fc2.bias.data.copy_(hf_dict[f'h.{idx}.mlp.c_proj.bias'])

    # assign final layer norm weights
    model.ln_final.weight.data.copy_(hf_dict['ln_f.weight'])
    model.ln_final.bias.data.copy_(hf_dict['ln_f.bias'])
    # assign head weights (wte)
    #model.head.weight.data.copy_(hf_dict['wte.weight'])


In [20]:
gpt2 = GPT2(model_config)

# count number of parameters
num_params = sum(p.numel() for p in gpt2.parameters())
print(f"Number of parameters: {num_params}")

load_hf_weights(gpt2, gpt2_hf)

Number of parameters: 163037184


## Generating text

In [14]:
def generate_text(
        model,
        tokenizer,
        prompt,
        max_len=100,
        temperature=1.0,
    ):
    model.eval()
    prompt = tokenizer.encode(prompt)
    prompt = torch.tensor(prompt).unsqueeze(0)
    generated = prompt
    with torch.no_grad():
        for _ in range(max_len):
            logits = model(generated)
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)

    return tokenizer.decode(generated[0].tolist())

# testing
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
torch.manual_seed(123)

prompt = "Hello, I'm a language model,"
text = generate_text(gpt2, tokenizer, prompt, max_len=20)
print(text)

Hello, I'm a language model, not rendering an object. Though later in the installation process my code will probably try to render the map


## Save model

In [15]:
torch.save(gpt2.state_dict(), "gpt2.pth")