## GPT Frame


In [16]:
from importlib.metadata import version

# Report the installed version of each package this notebook relies on.
for pkg_name in ("torch", "tiktoken"):
    print(f"{pkg_name} version:", version(pkg_name))

torch version: 2.4.0
tiktoken version: 0.5.1


In [17]:
# Hyperparameters consumed by the model classes defined in the cells below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,        # rows of the token embedding / width of the output head
    context_length=1024,     # rows of the positional embedding / side of the causal mask
    emb_dim=768,             # embedding width used throughout the model
    n_heads=12,              # attention heads per transformer block
    n_layers=12,             # number of stacked transformer blocks
    drop_rate=0.1,           # NOTE(review): not read by any class visible in this notebook
    drop_rate_emb=0.1,       # dropout applied to the summed embeddings
    drop_rate_ShortCut=0.1,  # dropout applied before each residual addition
    drop_rate_mha=0.1,       # dropout applied to the attention weights
    qkv_bias=False,          # whether the query/key/value projections carry a bias
)

In [18]:
import tiktoken
import torch


tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt to token ids and stack the two id tensors along a new
# leading (batch) dimension.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(prompt)) for prompt in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [19]:
import torch
import torch.nn as nn

# for GPT usage module

# ________________For MHA 
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output; must be divisible
            by ``num_heads``.
        context_length: Side length of the square causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections use a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias = False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.w_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions to hide.
        upper_triangle = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer('mask', upper_triangle)

    def _split_heads(self, projected, b, num_tokens):
        """Reshape (b, num_tokens, d_out) to (b, num_heads, num_tokens, head_dim)."""
        per_head = projected.view(b, num_tokens, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Apply masked self-attention to ``x`` of shape (b, num_tokens, d_in).

        Returns a tensor of shape (b, num_tokens, d_out).
        """
        b, num_tokens, _ = x.shape

        keys = self._split_heads(self.w_key(x), b, num_tokens)
        queries = self._split_heads(self.w_query(x), b, num_tokens)
        values = self._split_heads(self.w_value(x), b, num_tokens)

        attn_scores = queries @ keys.transpose(2, 3)

        # Crop the mask to the current sequence length and hide the masked
        # positions by setting their scores to -inf before the softmax.
        hidden_positions = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(hidden_positions, -torch.inf)

        scale = keys.shape[-1] ** 0.5
        attn_weights = self.dropout(torch.softmax(attn_scores / scale, dim=-1))

        # Move the head axis back next to head_dim and merge the two.
        merged = (attn_weights @ values).transpose(1, 2)
        merged = merged.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(merged)
# ____________ For Transformer Block
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift."""

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift it."""
        mu = x.mean(dim=-1, keepdim=True)
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        centered = x - mu
        normalized = centered / torch.sqrt(sigma_sq + self.eps)
        # The learnable scale and shift let the model rescale and offset the
        # normalized values to suit the data it is processing.
        return self.scale * normalized + self.shift
    
class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation."""

    def forward(self, x):
        """Return ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``."""
        sqrt_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(sqrt_two_over_pi * cubic_term)
        return 0.5 * x * gate
class FeedForward(nn.Module):
    """Feed-forward network module: expand to 4x ``emb_dim``, apply GELU, project back.

    Args:
        cfg: Mapping that provides ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        self.layers = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            GELU(),
            nn.Linear(hidden_dim, emb_dim),
        )

    def forward(self, x):
        """Run ``x`` through the expand / GELU / project stack."""
        return self.layers(x)

# ___________
class TransformerBlock(nn.Module):
    """Multi-head attention and a feed-forward network, each with a residual connection.

    Each sub-layer is preceded by its own LayerNorm and followed by the shared
    shortcut dropout before the residual addition.

    Args:
        cfg: Mapping that provides ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate_mha"``, ``"qkv_bias"`` and
            ``"drop_rate_ShortCut"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate_mha"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate_ShortCut"])

    def forward(self, x):
        """Apply the attention sub-layer, then the feed-forward sub-layer."""
        # Residual connection around norm -> attention -> dropout.
        x = x + self.drop_shortcut(self.att(self.norm1(x)))
        # Residual connection around norm -> feed-forward -> dropout.
        x = x + self.drop_shortcut(self.ff(self.norm2(x)))
        return x


In [20]:

# GPT Architecture

class GPTModel(nn.Module):
    """GPT-style model: embeddings, a stack of transformer blocks, final norm, output head.

    Args:
        cfg: Mapping that provides ``"vocab_size"``, ``"context_length"``,
            ``"emb_dim"``, ``"drop_rate_emb"`` and ``"n_layers"``, plus the
            keys read by ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        vocab_size = cfg["vocab_size"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate_emb"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)
        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids of shape (batch_size, seq_len) to logits over the vocabulary.

        Returns a tensor of shape (batch_size, seq_len, vocab_size).
        """
        _, seq_len = in_idx.shape

        # Position ids 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)
        x = self.tok_emb(in_idx) + self.pos_emb(positions)

        hidden = self.trf_blocks(self.drop_emb(x))
        return self.out_head(self.final_norm(hidden))

In [21]:
# Seed the RNG before building the model, then run the tokenized batch through it.
torch.manual_seed(123)

model = GPTModel(GPT_CONFIG_124M)
out = model(batch)

# Show the input ids, the shape of the logits, and the logits themselves.
print(batch, out.shape, out, sep="\n")

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
torch.Size([2, 4, 50257])
tensor([[[ 0.1381,  0.0077, -0.1963,  ..., -0.0222, -0.1060,  0.1717],
         [ 0.3865, -0.8408, -0.6564,  ..., -0.5163,  0.2369, -0.3357],
         [ 0.6989, -0.1829, -0.1631,  ...,  0.1472, -0.6504, -0.0056],
         [-0.4290,  0.1669, -0.1258,  ...,  1.1579,  0.5303, -0.5549]],

        [[ 0.1094, -0.2894, -0.1467,  ..., -0.0557,  0.2911, -0.2824],
         [ 0.0882, -0.3552, -0.3527,  ...,  1.2930,  0.0053,  0.1898],
         [ 0.6091,  0.4702, -0.4094,  ...,  0.7688,  0.3787, -0.1974],
         [-0.0612, -0.0737,  0.4751,  ...,  1.2463, -0.3834,  0.0609]]],
       grad_fn=<UnsafeViewBackward0>)


## HW4.1  

In [22]:

# Seed BEFORE constructing the block: seeding afterwards (as this cell used to)
# has no effect on the weights that were already initialized.
torch.manual_seed(123)
tfb = TransformerBlock(GPT_CONFIG_124M)
print(tfb)

# Number of parameters in the attention sub-module and in the feed-forward
# sub-module of a single transformer block.
mha_parameter = sum(m_p.numel() for m_p in tfb.att.parameters())
ffn_parameter = sum(f_p.numel() for f_p in tfb.ff.parameters())

print(f"Total parameter of MHA:{mha_parameter:,}")
print(f"Total parameter of ffn:{ffn_parameter:,}")

TransformerBlock(
  (att): MultiHeadAttention(
    (w_query): Linear(in_features=768, out_features=768, bias=False)
    (w_key): Linear(in_features=768, out_features=768, bias=False)
    (w_value): Linear(in_features=768, out_features=768, bias=False)
    (out_proj): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ff): FeedForward(
    (layers): Sequential(
      (0): Linear(in_features=768, out_features=3072, bias=True)
      (1): GELU()
      (2): Linear(in_features=3072, out_features=768, bias=True)
    )
  )
  (norm1): LayerNorm()
  (norm2): LayerNorm()
  (drop_shortcut): Dropout(p=0.1, inplace=False)
)
Total parameter of MHA:2,360,064
Total parameter of ffn:4,722,432


## HW4.2

New GPT model config

In [41]:
# Base configuration that the size presets below are derived from.
GPT_CONFIG_124M = dict(
    vocab_size=50257,        # rows of the token embedding / width of the output head
    context_length=1024,     # rows of the positional embedding / side of the causal mask
    emb_dim=768,             # embedding width used throughout the model
    n_heads=12,              # attention heads per transformer block
    n_layers=12,             # number of stacked transformer blocks
    drop_rate_emb=0.1,       # dropout applied to the summed embeddings
    drop_rate_ShortCut=0.1,  # dropout applied before each residual addition
    drop_rate_mha=0.1,       # dropout applied to the attention weights
    qkv_bias=False,          # whether the query/key/value projections carry a bias
)

def get_config(base_config, config_name="GPT_CONFIG_samll"):
    """Return a copy of ``base_config`` resized to one of the named model sizes.

    Args:
        base_config: Configuration dict to start from. It is shallow-copied
            and never modified.
        config_name: One of ``"GPT_CONFIG_small"``, ``"GPT_CONFIG_medium"``,
            ``"GPT_CONFIG_large"`` or ``"GPT_CONFIG_XL"``. The misspelled
            ``"GPT_CONFIG_samll"`` is still accepted (and remains the
            default) so existing callers keep working.

    Returns:
        A new dict with ``emb_dim``, ``n_heads`` and ``n_layers`` set for the
        requested size. For an unrecognized name, ``"Out of range."`` is
        printed and an unmodified copy of ``base_config`` is returned.
    """
    # (emb_dim, n_heads, n_layers) for every supported size.
    size_presets = {
        "GPT_CONFIG_small": (768, 12, 12),
        "GPT_CONFIG_medium": (1024, 16, 24),
        "GPT_CONFIG_large": (1280, 20, 36),
        "GPT_CONFIG_XL": (1600, 25, 48),
    }
    # Legacy spelling used by the default argument and by existing callers.
    size_presets["GPT_CONFIG_samll"] = size_presets["GPT_CONFIG_small"]

    GPT_config = base_config.copy()

    # Only strings can name a preset; anything else is treated as unknown
    # instead of failing inside the dict lookup.
    preset = size_presets.get(config_name) if isinstance(config_name, str) else None
    if preset is None:
        print("Out of range.")
        return GPT_config

    GPT_config['emb_dim'], GPT_config['n_heads'], GPT_config['n_layers'] = preset
    return GPT_config
        

def caculate_parameter(model, config_name):
    """Print and return the parameter count and parameter memory of ``model``.

    Args:
        model: Module exposing ``parameters()`` and an ``out_head`` sub-module.
        config_name: Label inserted into the printed summary.

    Returns:
        A tuple ``(model_parameter, model_parameter_final, model_memory)``:
        the total parameter count, the count excluding the parameters of
        ``model.out_head``, and the size of all parameters in MB
        (bytes / 1024**2).
    """
    params = list(model.parameters())
    model_parameter = sum(p.numel() for p in params)

    # Count without the output head. This value used to be computed and then
    # discarded; it is now returned to the caller.
    model_parameter_final = model_parameter - sum(
        p.numel() for p in model.out_head.parameters()
    )

    # Use each parameter's actual element size instead of assuming 4 bytes,
    # so models that are not float32 are reported correctly.
    total_bytes = sum(p.numel() * p.element_size() for p in params)
    model_memory = total_bytes / (1024 * 1024)

    print("Total ",config_name,f" parameter:{model_parameter:,}")
    print(f" Total GPT model_memory:{model_memory:,.2f} MB")
    return model_parameter, model_parameter_final, model_memory

list_gpt = ["GPT_CONFIG_samll", "GPT_CONFIG_medium", "GPT_CONFIG_large", "GPT_CONFIG_XL"]
base_config = GPT_CONFIG_124M


# Build a model for every size preset and report its parameter count and size.
for gpt_models in list_gpt:
    # Drop the reference to the previous iteration's model before building the
    # next one, so two models are not kept alive at the same time.
    gpt_base_config = None

    pgt_config = get_config(base_config, gpt_models)
    gpt_base_config = GPTModel(pgt_config)
    caculate_parameter(gpt_base_config, gpt_models)

Total  GPT_CONFIG_samll  parameter:163,009,536
 Total GPT model_memory:621.83 MB
Total  GPT_CONFIG_medium  parameter:406,212,608
 Total GPT model_memory:1,549.58 MB
Total  GPT_CONFIG_large  parameter:838,220,800
 Total GPT model_memory:3,197.56 MB
Total  GPT_CONFIG_XL  parameter:1,637,792,000
 Total GPT model_memory:6,247.68 MB


In [32]:
# Medium-size configuration: same keys as the base config with a wider
# embedding, more heads and more layers.
GPT_CONFIG_medium = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=1024,
    n_heads=16,
    n_layers=24,
    drop_rate_emb=0.1,
    drop_rate_ShortCut=0.1,
    drop_rate_mha=0.1,
    qkv_bias=False,
)

In [34]:
torch.manual_seed(123)

model = GPTModel(GPT_CONFIG_medium)

# Total parameter count, the count without the output head, and the size of
# the parameters in MB assuming 4 bytes per parameter.
model_parameter = sum(map(torch.numel, model.parameters()))
model_parameter_final = model_parameter - sum(map(torch.numel, model.out_head.parameters()))
model_memory = model_parameter * 4 / (1024 * 1024)

print(
    f"Total GPT_CONFIG_medium parameter:{model_parameter:,}",
    f" Total GPT model_memory:{model_memory:,.2f} MB",
    sep="\n",
)

Total GPT_CONFIG_medium parameter:406,212,608
 Total GPT model_memory:1,549.58 MB


In [25]:
# Large-size configuration: same keys as the base config with a wider
# embedding, more heads and more layers.
GPT_CONFIG_large = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=1280,
    n_heads=20,
    n_layers=36,
    drop_rate_emb=0.1,
    drop_rate_ShortCut=0.1,
    drop_rate_mha=0.1,
    qkv_bias=False,
)

In [26]:
torch.manual_seed(123)

model = GPTModel(GPT_CONFIG_large)

# Count every parameter of the whole model (not just the feed-forward layers).
model_parameter = sum(p.numel() for p in model.parameters())

# The label previously read "parameter of ffn", but the value printed is the
# whole-model count; the wording now matches the medium-size cell.
print(f"Total GPT_CONFIG_large parameter:{model_parameter:,}")

Total GPT_CONFIG_large parameter of ffn:838,220,800


In [27]:
# XL-size configuration: same keys as the base config with a wider
# embedding, more heads and more layers.
GPT_CONFIG_XL = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=1600,
    n_heads=25,
    n_layers=48,
    drop_rate_emb=0.1,
    drop_rate_ShortCut=0.1,
    drop_rate_mha=0.1,
    qkv_bias=False,
)

In [28]:
torch.manual_seed(123)

model = GPTModel(GPT_CONFIG_XL)

# Count every parameter of the whole model (not just the feed-forward layers).
model_parameter = sum(p.numel() for p in model.parameters())

# The label previously read "parameter of ffn", but the value printed is the
# whole-model count; the wording now matches the medium-size cell.
print(f"Total GPT_CONFIG_XL parameter:{model_parameter:,}")

Total GPT_CONFIG_XL parameter of ffn:1,637,792,000
