In [1]:
# Generated using ChatGPT

To code GPT from scratch, the following components are needed:

    Embedding layer: This layer maps the input tokens to high-dimensional vectors, which will be used as input to the transformer layers.

    Transformer layers: The transformer layers consist of self-attention and feedforward neural network modules that process the input tokens.

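    Layer normalization: This component normalizes activations across the feature dimension. In the transformer layers it is applied after each residual connection (self-attention and feedforward), which keeps activations well-scaled and stabilizes training.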

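    Positional encoding: This component adds position information to the token embeddings. It is essential because self-attention by itself is insensitive to token order: without positional information the model cannot tell where in the sequence a token appears.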

    Linear output layer: This layer is used to project the final transformer layer outputs to the vocabulary size for generating the next token.

    Softmax activation: This component is used to convert the final output scores to a probability distribution over the vocabulary.

    Loss function: The loss function (typically cross-entropy) measures the difference between the predicted probability distribution over the next token and the ground truth token.

    Optimizer: The optimizer is used to update the model parameters based on the computed loss.

    Data preprocessing: The input data needs to be preprocessed and tokenized to be compatible with the GPT model.

    Training loop: The training loop consists of iterating over the training data, feeding it to the model, computing the loss, and updating the model parameters using the optimizer.



In [None]:
# Embedding layer
import torch.nn as nn

class EmbeddingLayer(nn.Module):
    """Token-embedding lookup backed by a single ``nn.Embedding`` table.

    Args:
        vocab_size: Number of rows (distinct token ids) in the table.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )

    def forward(self, input_ids):
        """Return the embedding vectors selected by ``input_ids``.

        The result has the shape of ``input_ids`` with one trailing
        dimension of size ``embedding_dim`` appended.
        """
        token_vectors = self.embedding(input_ids)
        return token_vectors

In [None]:
# Transformer layers
import torch
import torch.nn as nn

class TransformerLayer(nn.Module):
    """One transformer block: self-attention followed by a feed-forward net.

    Each sub-layer's output is added back to its input (residual
    connection) and the sum is layer-normalized.

    Args:
        d_model: Feature width of the block's input and output.
        n_heads: Number of attention heads given to ``nn.MultiheadAttention``.
        d_ff: Hidden width of the position-wise feed-forward network.
    """

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()

        # Attention sub-layer and the norm applied after its residual add.
        self.self_attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=n_heads)
        self.layer_norm1 = nn.LayerNorm(d_model)

        # Feed-forward sub-layer (expand to d_ff, ReLU, project back) and its norm.
        self.feed_forward = nn.Sequential(
            nn.Linear(in_features=d_model, out_features=d_ff),
            nn.ReLU(),
            nn.Linear(in_features=d_ff, out_features=d_model),
        )
        self.layer_norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Run the block on ``x``.

        Args:
            x: Input used as query, key and value of the attention module.
                NOTE(review): the attention module is built without
                ``batch_first``, so this presumably expects
                ``(seq_len, batch, d_model)`` - confirm against callers.
            mask: Optional attention mask forwarded as ``attn_mask``.

        Returns:
            The output of the second layer normalization.
        """
        # The attention module returns (output, weights); the weights are unused.
        attended = self.self_attention(x, x, x, attn_mask=mask)[0]
        after_attention = self.layer_norm1(x + attended)

        transformed = self.feed_forward(after_attention)
        return self.layer_norm2(after_attention + transformed)


In [None]:
# Layer normalization
import torch.nn as nn

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale/shift.

    Computes ``gamma * (x - mean) / sqrt(var + eps) + beta`` where ``mean``
    and ``var`` are taken over the last dimension and ``var`` is the biased
    (population) variance. This is the standard layer-norm formula, the same
    one ``torch.nn.LayerNorm`` implements.

    Args:
        features: Size of the last dimension of the inputs; also the length
            of the ``gamma`` and ``beta`` parameters.
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension and apply gamma/beta."""
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        # Biased variance (divide by N, not N - 1). The Bessel-corrected std
        # used previously is NaN when the last dimension has a single element
        # and does not match the standard layer-norm definition.
        variance = centered.pow(2).mean(-1, keepdim=True)
        # eps goes inside the square root so the denominator is never zero.
        return self.gamma * centered / torch.sqrt(variance + self.eps) + self.beta


In [None]:
# Positional encoding
import torch

class PositionalEncoding(torch.nn.Module):
    """Adds fixed sinusoidal position encodings to the input, then dropout.

    A table of shape ``(max_len, 1, d_model)`` is precomputed and stored as
    the non-trainable buffer ``pe``. Even feature columns hold sines and odd
    columns hold cosines of ``position * 10000^(-2i / d_model)``.

    Args:
        d_model: Feature width of the encodings. Odd values are supported.
        max_len: Number of positions to precompute. Inputs must not be
            longer than this along dimension 0.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=0.1)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd column than even columns, so the
        # cosine half must be truncated to fit; for even d_model this slice
        # is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Insert a singleton dimension at index 1 so the table broadcasts
        # over dimension 1 of the input.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for positions ``0 .. x.size(0) - 1`` to ``x``.

        Dimension 0 of ``x`` is treated as the position axis and the last
        dimension must equal ``d_model``. Dropout (p=0.1) is applied to the
        sum and is only active in training mode.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


In [None]:
# Linear layer
"""
Here, the GPTLinearLayer class takes in the embedding_size (i.e., the size of the
embeddings output by the final transformer layer) and vocab_size (i.e., the size of
the vocabulary). Inside the forward method, we first flatten the batch_size and
seq_length dimensions of the input tensor using the view method. We then pass the
flattened tensor through a Linear layer (self.fc), which has output size equal to
vocab_size. Finally, we reshape the output back to the original shape of
(batch_size, seq_length, vocab_size) using the view method again.
"""
import torch
import torch.nn as nn

class GPTLinearLayer(nn.Module):
    """Projects per-token features to vocabulary-sized scores.

    Args:
        embedding_size: Size of the last dimension of the input.
        vocab_size: Size of the last dimension of the output.
    """

    def __init__(self, embedding_size, vocab_size):
        super().__init__()
        self.fc = nn.Linear(embedding_size, vocab_size)

    def forward(self, x):
        """Apply the projection to every position of ``x``.

        Args:
            x: Tensor whose last dimension is ``embedding_size``, e.g.
                ``(batch_size, seq_length, embedding_size)``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``vocab_size``.
        """
        # Record the leading dimensions BEFORE flattening. After the linear
        # layer, dimension 1 holds vocab_size rather than seq_length, so the
        # original shape cannot be recovered from the projected tensor.
        leading_shape = x.shape[:-1]
        # Flatten all leading dimensions; .contiguous() lets view() accept
        # inputs that were transposed or sliced upstream.
        flat = x.contiguous().view(-1, x.size(-1))
        # Pass through the linear layer
        logits = self.fc(flat)
        # Reshape back to (batch_size, seq_length, vocab_size)
        return logits.view(*leading_shape, self.fc.out_features)