In [6]:
import torch
import torch.nn as nn # All neural network components, nn.Linear, nn.Conv2d, BatchNorm, Loss functions, Activation functions
from typing import List, Tuple

In [7]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024, # max number of input tokens the model can handle via the positional embeddings
    "embedding_size": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False,
}

## 1. A placeholder GPT model architecture class

In [8]:
class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__() # Call the __init__ of the parent class
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["embedding_size"]) 
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["embedding_size"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.transformer_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])] # * is used to unpack the list
        )
        self.final_norm = DummyLayerNorm(cfg["embedding_size"])
        self.out_head = nn.Linear(cfg["embedding_size"], cfg["vocab_size"], bias=False)

    def forward(self, input_idx):
        batch_size, seq_len = input_idx.shape
        tok_embeds = self.tok_emb(input_idx)
        pos_embeds = self.pos_emb(
            torch.arange(seq_len, # Indices for the positional embeddings
            device=input_idx.device) # Ensure the tensor generated by torch.arange is on the same device as input_idx
         ) 
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.transformer_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x) # logits are the raw, unnormalized scores or outputs of the final layer before the activation function is applied
        return logits

class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()

    def forward(self, x):
        return x
    
class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()

    def forward(self, x):
        return x

## 2. High-level overview of data flow in a GPT model

In [9]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
batch = []
tx1 = "Every effort moves you"
tx2  = "Every day holds a"

print(tokenizer.encode(tx1))

[6109, 3626, 6100, 345]


In [10]:
batch.append(torch.tensor(tokenizer.encode(tx1)))
batch.append(torch.tensor(tokenizer.encode(tx2)))
batch = torch.stack(batch, dim=0)
print(batch)
print(batch.shape)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
torch.Size([2, 4])


In [11]:
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0447,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)


## 3. Layer Normalization

In [12]:
# Example
torch.manual_seed(123)
batch_sample = torch.randn(2,5)
print(batch_sample)
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU()) # A linear layer followed by a ReLU activation function
out = layer(batch_sample)
print(out)

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])
tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [13]:
mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [14]:
out_norm = (out - mean) / torch.sqrt(var) # Normalizing the output by subtracting the mean and dividing by the standard deviation
normalized_mean = out_norm.mean(dim=-1, keepdim=True)
nomarlized_var = out_norm.var(dim=-1, keepdim=True)
torch.set_printoptions(sci_mode=False)
print("Mean:\n", normalized_mean)
print("Variance:\n", nomarlized_var)

Mean:
 tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [15]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5 # Epsilon value to avoid division by zero
        self.scale = nn.Parameter(torch.ones(emb_dim)) # Trainable scale parameter, initialized to 1
        self.shift = nn.Parameter(torch.zeros(emb_dim)) # Trainable shift parameter, initialized to 0

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True) # x shape is (batch_size, seq_len, emb_dim), so we calculate the mean along the last dimension - the embedding dimension
        var = x.var(dim=-1, keepdim=True, unbiased=False) # unbiased=False means the variance is calculated with N instead of N-1
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return norm_x * self.scale + self.shift

In [16]:
nn.Parameter(torch.ones(5)), nn.Parameter(torch.zeros(5))

(Parameter containing:
 tensor([1., 1., 1., 1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0., 0., 0., 0.], requires_grad=True))

In [17]:
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_sample) # Call the forward method of the LayerNorm instance
print(out_ln)
mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1,unbiased=False, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

tensor([[ 0.5528,  1.0693, -0.0223,  0.2656, -1.8654],
        [ 0.9087, -1.3767, -0.9564,  1.1304,  0.2940]], grad_fn=<AddBackward0>)
Mean:
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


## 4. Batch Normalization

In [18]:
import torch.nn.functional as F

class BatchNorm1dCustom(nn.Module):
    def __init__(self, num_features, epsilon=1e-5, momentum=0.1):
        super(BatchNorm1dCustom, self).__init__()
        
        # Initialize parameters
        self.num_features = num_features
        self.epsilon = epsilon
        self.momentum = momentum
        
        # Learnable parameters (gamma and beta)
        self.gamma = nn.Parameter(torch.ones(num_features))  # scale factor
        self.beta = nn.Parameter(torch.zeros(num_features))  # shift factor
        
        # Running mean and variance for inference phase
        self.running_mean = torch.zeros(num_features)
        self.running_var = torch.ones(num_features)

    def forward(self, x):
        # Calculate the mean and variance of the current batch
        batch_mean = x.mean(dim=0)
        batch_var = x.var(dim=0, unbiased=False)
        
        # Normalize the input using batch statistics
        x_normalized = (x - batch_mean) / torch.sqrt(batch_var + self.epsilon)
        
        # Scale and shift
        out = self.gamma * x_normalized + self.beta
        
        # Update running statistics for inference
        # During training, we use the statistics of the current batch (mean and variance) to normalize the data.
        # During inference, we don't have a batch of data to compute the mean and variance, 
        # so we use the running statistics accumulated during training to normalize the input data. 
        # This helps in ensuring that the model behaves consistently during inference, 
        # even when the batch size may be different or a single sample is passed through.

        # self.momentum is a hyperparameter that controls how quickly the running statistics adapt to new data. 
        # It determines how much weight is given to the current batch's statistics versus the accumulated statistics
        self.running_mean = self.momentum * batch_mean + (1 - self.momentum) * self.running_mean # Cumulative mean across all batches
        self.running_var = self.momentum * batch_var + (1 - self.momentum) * self.running_var # Cumulative variance across all batches
        
        return out
    
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(10, 50)
        self.bn1 = BatchNorm1dCustom(50)  # Apply BatchNorm after fc1
        self.fc2 = nn.Linear(50, 10)
        
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))  # Pass through FC and BatchNorm
        x = self.fc2(x)
        return x

# Example usage
x = torch.randn(32, 10)  # A batch of 32 samples, each with 10 features
model = SimpleNet()
output = model(x)
print(output.shape)

torch.Size([32, 10])


## 5. GELU

In [19]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh((torch.sqrt(torch.tensor(2 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3)))))

In [20]:
x = torch.tensor([0.0], requires_grad=True)
output = torch.relu(x)
print(output)
output.backward()

print(x.grad) # Output: tensor([0.])

tensor([0.], grad_fn=<ReluBackward0>)
tensor([0.])


In [21]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["embedding_size"], 4*cfg["embedding_size"]),
            GELU(), # Apply the GELU activation function to the output of the linear layer
            nn.Linear(4*cfg["embedding_size"], cfg["embedding_size"])
        )

    def forward(self, x):
        return self.layers(x)

In [22]:
ffn = FeedForward(GPT_CONFIG_124M)
x = torch.rand(2, 3, 768)
out = ffn(x)
print(out.shape)

torch.Size([2, 3, 768])


## 6. Adding shortcut connections

In [45]:
class ExampleDeepNeuralNetwork(nn.Module):
    def __init__(self, layer_sizes: List, use_shortcut: bool)-> None:
        super().__init__()
        self.use_shortcut= use_shortcut
        self.layers = nn.ModuleList(
            [nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU()),
        ])

    def forward(self, x):
        for layer in self.layers:
            layer_output = layer(x)
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x
    
def print_gradients(model, x):
    x = x.float()
    output = model(x)
    target = torch.tensor([[0.0]])

    loss = nn.MSELoss()
    loss = loss(output, target)
    loss.backward() # Compute the loss gradient for each layer in the model

    for name, param in model.named_parameters(): # Iterate over all the parameters in the model
        if "weight" in name:
            print(f'{name} has gradient mean of {param.grad.abs().mean().item()}') # item() is used to get the value of a single-element tensor as a Python number

In [46]:
layer_sizes = [3,3,3,3,3,1]
sample_input = torch.tensor([[1,0,-1]])
torch.manual_seed(123)
model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=False)


In [47]:
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.00020173587836325169
layers.1.0.weight has gradient mean of 0.0001201116101583466
layers.2.0.weight has gradient mean of 0.0007152041653171182
layers.3.0.weight has gradient mean of 0.001398873864673078
layers.4.0.weight has gradient mean of 0.005049646366387606


In [49]:
torch.manual_seed(123)
model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=True)
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.22169792652130127
layers.1.0.weight has gradient mean of 0.20694106817245483
layers.2.0.weight has gradient mean of 0.32896995544433594
layers.3.0.weight has gradient mean of 0.2665732502937317
layers.4.0.weight has gradient mean of 1.3258541822433472


## 7. Transformer block

In [55]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads) == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1)) # diagonal=1 refers to the diagonal above the main diagonal

    def forward(self, x):
        bz, num_tokens, d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        keys = keys.view(bz, num_tokens, self.num_heads, self.head_dim) # Split the matrix by adding a num_heads dimension
        queries = queries.view(bz, num_tokens, self.num_heads, self.head_dim)
        values = values.view(bz, num_tokens, self.num_heads, self.head_dim)

        keys = keys.transpose(1,2) # Transpose from shape (bz, num_tokens, num_heads, head_dim) to (bz, num_heads, num_tokens, head_dim)
        queries = queries.transpose(1,2)
        values = values.transpose(1,2)
        # print("keys shape:", keys.shape)
        # print("queries shape:", queries.shape)
        # print("keys transpose shape:", keys.transpose(2, 3).shape)
        attn_scores = queries @ keys.transpose(2, 3) # Perform the matrix multiplication bwteen the last two dimensions for each head
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights@values).transpose(1,2) # Transpose back to (bz, num_tokens, num_heads, head_dim)
        context_vec = context_vec.contiguous().view(bz, num_tokens, self.d_out) # Combine the num_heads and head_dim dimensions
        # Contiguous() is used to ensure that the tensor is stored in a contiguous block of memory
        context_vec = self.out_proj(context_vec) # Apply the output projection to ensure that the output has the disired output dimensionality
        return context_vec
    
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in = cfg["embedding_size"],
            d_out = cfg["embedding_size"],
            context_length= cfg["context_length"],
            num_heads= cfg["n_heads"],
            dropout= cfg["drop_rate"],
            qkv_bias= cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["embedding_size"])
        self.norm2 = LayerNorm(cfg["embedding_size"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        shortcut = x # Save the input for the shortcut connection
        x = self.norm1(x)
        x = self.att(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut # Add the original input back

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x

In [None]:
torch.manual_seed(123)
x = torch.rand(2, 4, 768) # [batch_size, num_tokens, emb_dim]
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

print("Output shape:", output.shape) # Transformer architecture processes sequences of data without altering their shape

Output shape: torch.Size([2, 4, 768])


## 8.GPT model architecture

In [60]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["embedding_size"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["embedding_size"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])] # * is used to unpack the list
        )

        self.final_norm = LayerNorm(cfg["embedding_size"])
        self.out_head = nn.Linear(cfg["embedding_size"], cfg["vocab_size"], bias=False) # The output head is a linear layer that maps the final hidden states to the vocabulary size

    def forward(self, in_idx): # in_idx is the input tensor of token indices
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(
            torch.arange(seq_len, device=in_idx.device) # Ensure the tensor generated by torch.arange is on the same device as in_idx
        )
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [62]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:",  out.shape)
print("\nOutput logits:\n", out)


Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])

Output logits:
 tensor([[[-0.3908, -0.0394, -0.2787,  ..., -0.1345, -0.4530, -0.0452],
         [ 0.5414, -0.5809, -0.6548,  ..., -0.6122, -0.4137, -0.0453],
         [ 0.7071,  0.3419, -0.3609,  ...,  0.1560, -0.7911, -0.0799],
         [-0.8427,  0.4923, -0.3785,  ...,  1.0679,  0.2109, -0.3187]],

        [[ 0.0035, -0.0981, -0.0746,  ..., -0.2712, -0.1676, -0.1018],
         [ 0.2211,  0.0195,  0.2330,  ...,  0.8125, -0.3979,  0.3369],
         [ 0.7438,  0.7255, -0.2684,  ...,  0.4249,  0.0383, -0.1436],
         [-0.3017,  0.1331,  0.2273,  ...,  1.1209, -0.5624,  0.0315]]],
       grad_fn=<UnsafeViewBackward0>)


In [64]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params: ,}")

Total number of parameters:  163,009,536


In [65]:
# Why is the actual number of parameters 163 million, instead of 124 million?
# Reason: weight tying - the input and output embeddings are tied, meaning they share the same weights.

print("Token embedding layer shape:", model.tok_emb.weight.shape)
print("Output head layer shape:", model.out_head.weight.shape)


Token embedding layer shape: torch.Size([50257, 768])
Output head layer shape: torch.Size([50257, 768])


In [66]:
total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"Total number of parameters in GPT-2: {total_params_gpt2:,}")

Total number of parameters in GPT-2: 124,412,160


In [68]:
## Exercise
total_params_ff = sum(p.numel() for trf_block in model.trf_blocks for p in trf_block.ff.parameters())
print(f"Total number of parameters in FeedForward: {total_params_ff:,}")
total_params_att = sum(p.numel() for trf_block in model.trf_blocks for p in trf_block.att.parameters())
print(f"Total number of parameters in MultiHeadAttention: {total_params_att:,}")

Total number of parameters in FeedForward: 56,669,184
Total number of parameters in MultiHeadAttention: 28,320,768


In [69]:
## Correct answer
block = TransformerBlock(GPT_CONFIG_124M)
total_params_ff = sum(p.numel() for p in block.ff.parameters())
total_params_att = sum(p.numel() for p in block.att.parameters())
print(f"Total number of parameters in FeedForward: {total_params_ff:,}")
print(f"Total number of parameters in MultiHeadAttention: {total_params_att:,}")

Total number of parameters in FeedForward: 4,722,432
Total number of parameters in MultiHeadAttention: 2,360,064


### Compute the memory requirements

In [None]:
# 1 byte = 8 bits
# 32 bits = 4 bytes
# 1 parameter = 32 bits (float32)
# 1 parameter = 4 bytes (float32)
# 1 MB = 1024 * 1024 bytes
total_size_bytes = total_params * 4 # Assuming 4 bytes per parameter (float32)
total_size_mb = total_size_bytes / (1024 * 1024) # Convert to megabytes
print(f"Total size of the model (float32): {total_size_mb:.2f} MB")

Total size of the model (float32): 621.83 MB


In [72]:
# 1 parameter = 2 bytes (float16)
# 1 MB = 1024 * 1024 bytes
total_size_bytes = total_params * 2 # Assuming 2 bytes per parameter (float16)
total_size_mb = total_size_bytes / (1024 * 1024) # Convert to megabytes
print(f"Total size of the model (float16): {total_size_mb:.2f} MB")

Total size of the model (float16): 310.92 MB


In [75]:
1.3 * 10**9 * 4 / (1024 * 1024* 1024) # 1.3 billion parameters


4.842877388000488

In [78]:
GPT_CONFIG_MEDIUM = {
    "vocab_size": 50257,
    "context_length": 1024, # max number of input tokens the model can handle via the positional embeddings
    "embedding_size": 1024,
    "n_heads": 16,
    "n_layers": 24,
    "drop_rate": 0.1,
    "qkv_bias": False,
}

gpt2_medium = GPTModel(GPT_CONFIG_MEDIUM)
total_params_medium = sum(p.numel() for p in gpt2_medium.parameters())
print(f"Total number of parameters in GPT-2 Medium: {total_params_medium:,}")

GPT_CONFIG_LARGE = {
    "vocab_size": 50257,
    "context_length": 1024, # max number of input tokens the model can handle via the positional embeddings
    "embedding_size": 1280,
    "n_heads": 20,
    "n_layers": 36,
    "drop_rate": 0.1,
    "qkv_bias": False,
}
gpt2_large = GPTModel(GPT_CONFIG_LARGE)
total_params_large = sum(p.numel() for p in gpt2_large.parameters())
print(f"Total number of parameters in GPT-2 Large: {total_params_large:,}")

GPT_CONFIG_XL = {
    "vocab_size": 50257,
    "context_length": 2048, # max number of input tokens the model can handle via the positional embeddings
    "embedding_size": 1600,
    "n_heads": 25,
    "n_layers": 48,
    "drop_rate": 0.1,
    "qkv_bias": False,
}
gpt2_xl = GPTModel(GPT_CONFIG_XL)
total_params_xl = sum(p.numel() for p in gpt2_xl.parameters())
print(f"Total number of parameters in GPT-2 XL: {total_params_xl:,}")
trainable_params_consider_weightTying = total_params_xl - sum(p.numel() for p in gpt2_xl.out_head.parameters())
print(f"Total number of trainable parameters in GPT-2 XL (considering weight tying): {trainable_params_consider_weightTying:,}")
model_xl_size = trainable_params_consider_weightTying * 4 / (1024 * 1024 * 1024)
print(f"Total size of the model (float32): {model_xl_size:.2f} GB")

Total number of parameters in GPT-2 Medium: 406,212,608
Total number of parameters in GPT-2 Large: 838,220,800
Total number of parameters in GPT-2 XL: 1,639,430,400
Total number of trainable parameters in GPT-2 XL (considering weight tying): 1,559,019,200
Total size of the model (float32): 5.81 GB


## Generating text

In [79]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is a (batch, n_tokens) tensor of indices in the current context
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:] # Crops the current context if it exceeds the context size
        with torch.no_grad(): # Disable gradient computation
            logits = model(idx_cond)

        logits = logits[:, -1, :] # Select the last token's logits, so that (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        probas = torch.softmax(logits, dim=-1) # Apply softmax to get the probabilities
        idx_next = torch.argmax(probas, dim=-1, keepdim=True) # idx_next has shape (batch, 1)
        idx = torch.cat((idx, idx_next), dim=1) # Append the new token to the running sequence

    return idx

In [81]:
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0) # Add a batch dimension
print("encoded_tensor shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor shape: torch.Size([1, 4])


In [83]:
model.eval() # Disable dropout and set the model to evaluation mode
out = generate_text_simple(model=model, idx=encoded_tensor, max_new_tokens=6, context_size=GPT_CONFIG_124M["context_length"])
print("Output:", out)
print("Output length:", len(out[0]))
print("Decoded:", tokenizer.decode(out[0].tolist()))

Output: tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
Output length: 10
Decoded: Hello, I am Featureiman Byeswickattribute argue
