To code GPT from scratch, the following components are needed:

    Tokenizer: To preprocess the text data and convert it into tokenized form that can be fed to the model.

    Embedding layer: This layer maps the input tokens to high-dimensional vectors, which will be used as input to the transformer layers.

    Transformer layers: The transformer layers consist of self-attention and feedforward neural network modules that process the input tokens.

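    Layer normalization: This component normalizes activations across the feature dimension. In the code below it is applied inside each transformer layer, after each residual connection (attention and feedforward).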

    Positional encoding: This component adds position information to the token embeddings. Self-attention by itself is insensitive to token order, so without positional information the model cannot tell where in the sequence each token sits.

    Linear output layer: This layer is used to project the final transformer layer outputs to the vocabulary size for generating the next token.

    Softmax activation: This component is used to convert the final output scores to a probability distribution over the vocabulary.

    Loss function: The loss function measures how far the predicted next-token distribution is from the ground truth token (for language modelling this is typically the cross-entropy loss).

    Optimizer: The optimizer is used to update the model parameters based on the computed loss.

    Data preprocessing: The input data needs to be preprocessed and tokenized to be compatible with the GPT model.

    Training loop: The training loop consists of iterating over the training data, feeding it to the model, computing the loss, and updating the model parameters using the optimizer.



In [None]:
# Tokeniser
import re
from collections import Counter

class GPTTokenizer:
    """Word-level tokenizer with a frequency-capped vocabulary.

    The vocabulary is built once from ``data``: the text is lowercased,
    every run of non-alphanumeric characters becomes a single space, and the
    most frequent words receive indices ``1, 2, ...`` in descending order of
    frequency. Index ``0`` is reserved for ``unk_token``.
    """

    def __init__(self, data, vocab_size=10000, unk_token="<UNK>"):
        """
        data: raw text used to build the vocabulary
        vocab_size: maximum number of entries in the vocabulary, including
                    the unk_token
        unk_token: is used to specify the token to be used to represent
                   any out-of-vocabulary (OOV) words that are not present
                   in the vocabulary
        """
        self.vocab_size = vocab_size
        self.unk_token = unk_token
        self.word_freq = Counter()
        self.word_to_idx = {}
        self.idx_to_word = {}

        # Preprocess the text data
        self.preprocessed_data = self._preprocess(data)

        # Build the vocabulary
        self._build_vocab()

    def _preprocess(self, data):
        """Lowercase ``data`` and collapse non-alphanumeric runs to a space."""
        data = data.lower()
        return re.sub(r"[^a-zA-Z0-9]+", " ", data)

    def _build_vocab(self):
        """Fill ``word_freq``, ``word_to_idx`` and ``idx_to_word``."""
        # Count the frequency of each word in the data
        self.word_freq.update(self.preprocessed_data.split())

        # One slot is reserved for the unk_token. Clamp at zero so that a
        # vocab_size <= 0 yields an unk-only vocabulary rather than a negative
        # slice that silently drops just the least frequent word.
        n_words = max(self.vocab_size - 1, 0)
        # most_common orders by descending count; ties keep first-seen order.
        top_words = self.word_freq.most_common(n_words)

        # Add the unk_token to the vocabulary
        self.word_to_idx[self.unk_token] = 0
        self.idx_to_word[0] = self.unk_token

        # Add the top_words to the vocabulary, starting right after unk
        for idx, (word, _freq) in enumerate(top_words, start=1):
            self.word_to_idx[word] = idx
            self.idx_to_word[idx] = word

    def tokenize(self, text):
        """Return the list of vocabulary indices for the words in ``text``.

        ``text`` goes through the same preprocessing as the training data, so
        casing and punctuation do not turn known words into unk_token. Words
        absent from the vocabulary map to the index of unk_token.
        """
        unk_idx = self.word_to_idx[self.unk_token]
        words = self._preprocess(text).split()
        return [self.word_to_idx.get(word, unk_idx) for word in words]

"""
data = "This is some example text for testing the GPT tokenizer."
tokenizer = GPTTokenizer(data, vocab_size=10000)
# Tokenize a sentence
sentence = "This is a test sentence."
tokens = tokenizer.tokenize(sentence)
print(tokens)
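# Output: a list with one vocabulary index per word; 0 marks a word that is
# not in the vocabulary (here e.g. "a" and "test").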
"""

In [None]:
# Embedding layer
import torch.nn as nn

class EmbeddingLayer(nn.Module):
    """Thin wrapper around ``nn.Embedding`` that maps token ids to vectors."""

    def __init__(self, vocab_size, embed_dim):
        """
        vocab_size: how many distinct token ids the lookup table holds
        embed_dim: length of the learnable vector stored for each token id
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )

    def forward(self, input_ids):
        """Look up and return the embedding vector of every id in input_ids."""
        token_vectors = self.embedding(input_ids)
        return token_vectors

In [None]:
# Transformer layers
import torch
import torch.nn as nn

class TransformerLayer(nn.Module):
    """One transformer block: self-attention and a feedforward network, each
    followed by a residual connection and layer normalization."""

    def __init__(self, embed_dim, n_heads, d_ff):
        """
        embed_dim: length of the vector that represents each input token.
                   This is a capacity / cost trade-off: too small and the
                   model may lack the capacity to represent the tokens well;
                   too large and it may overfit the training data and
                   generalize poorly. Values between 100 and 1000 are a
                   common choice, depending on task complexity and dataset
                   size.
        n_heads: number of attention heads handed to nn.MultiheadAttention
        d_ff: hidden width of the position-wise feedforward network. Larger
              values let it learn more complex functions, at the price of
              more parameters and more computation in training and inference.
        """
        super().__init__()

        # Sub-layer 1: multi-head self-attention and its normalization
        self.self_attention = nn.MultiheadAttention(embed_dim, n_heads)
        self.layer_norm1 = nn.LayerNorm(embed_dim)

        # Sub-layer 2: embed_dim -> d_ff -> embed_dim MLP and its normalization
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, embed_dim),
        )
        self.layer_norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, mask=None):
        """Run x through both sub-layers and return the result.

        x: input tensor; nn.MultiheadAttention is built with its default
           arguments, so it presumably expects a sequence-first layout --
           confirm against the caller
        mask: optional attention mask, forwarded as ``attn_mask``
        """
        # Query, key and value are all x; the attention weights are discarded
        attended = self.self_attention(x, x, x, attn_mask=mask)[0]

        # Residual connection, then normalization
        hidden = self.layer_norm1(x + attended)

        # Feedforward, residual connection, then normalization
        return self.layer_norm2(hidden + self.feed_forward(hidden))


In [None]:
# Layer normalization
"""
    Layer normalization is a technique that normalizes the activations of a
    layer across the feature dimension, helping to reduce the effect of
    internal covariate shift and improve the stability of the model.

    Here, n_features is the number of features in the input tensor,
    eps is a small value added for numerical stability, and weight and bias
    are learnable parameters for scaling and shifting the normalized output.
    In the forward pass, the mean and standard deviation are computed along
    the last dimension (which is assumed to be the feature dimension), and
    the output is normalized and then scaled and shifted using the learned
    parameters.
"""
import torch.nn as nn

class LayerNorm(nn.Module):
    """Normalize the last dimension of the input, then apply a learnable
    per-feature scale (``weight``) and shift (``bias``)."""

    def __init__(self, n_features, eps=1e-6):
        """
        n_features: size of the last dimension of the inputs
        eps: small constant added to the standard deviation so the division
             stays numerically stable
        """
        super().__init__()
        self.n_features = n_features
        self.eps = eps
        # Scale starts at one and shift at zero, so the layer initially
        # returns the plain normalized input; training updates both.
        self.weight = nn.Parameter(torch.ones(n_features))
        self.bias = nn.Parameter(torch.zeros(n_features))

    def forward(self, x):
        """Return x normalized over its last dimension, scaled and shifted."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # Population (biased) standard deviation; eps is added to the std
        scale = x.std(dim=-1, keepdim=True, unbiased=False) + self.eps
        normalized = centered / scale
        return normalized * self.weight + self.bias

In [None]:
# Positional encoding
import torch

class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position information to the input, then apply
    dropout.

    The table is stored in the non-trainable buffer ``pe`` with shape
    (max_len, 1, embed_dim): even feature indices hold sines and odd feature
    indices hold cosines of the position scaled by a per-feature frequency.
    """

    def __init__(self, embed_dim, max_len=512, dropout=0.1):
        """
        embed_dim: size of the feature dimension; may be even or odd
        max_len: number of positions in the table, typically set to the
                 maximum length of any sentence in the training data or a
                 fixed value based on memory constraints
        dropout: dropout probability applied to the output of forward
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2).float()
            * (-torch.log(torch.tensor(10000.0)) / embed_dim)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd embed_dim there is one fewer odd column than even ones,
        # so the frequency vector is trimmed to match; for an even embed_dim
        # the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: embed_dim // 2])
        # Insert a singleton dimension at axis 1 so the table broadcasts over
        # the second dimension of the input in forward.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding of the first ``x.size(0)`` positions to x.

        The first dimension of x is indexed as the position axis, i.e. x is
        treated as (seq_len, ..., embed_dim) with seq_len <= max_len.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


In [None]:
# Linear layer
"""
Here, the GPTLinearLayer class takes in the embedding_size (i.e., the size of the
embeddings output by the final transformer layer) and vocab_size (i.e., the size of
the vocabulary). Inside the forward method, we first flatten the batch_size and
seq_length dimensions of the input tensor using the view method. We then pass the
flattened tensor through a Linear layer (self.fc), which has output size equal to
vocab_size. Finally, we reshape the output back to the original shape of
(batch_size, seq_length, vocab_size) using the view method again.
"""
import torch
import torch.nn as nn

class GPTLinearLayer(nn.Module):
    """Project the last dimension of the input from embedding_size to
    vocab_size, producing one score per vocabulary entry."""

    def __init__(self, embedding_size, vocab_size):
        """
        embedding_size: size of the last dimension of the input
        vocab_size: size of the last dimension of the output
        """
        super(GPTLinearLayer, self).__init__()
        self.fc = nn.Linear(embedding_size, vocab_size)

    def forward(self, x):
        """Return a tensor shaped like x, with the last dimension replaced by
        vocab_size.

        x is typically (batch_size, seq_length, embedding_size), but any
        number of leading dimensions is accepted.
        """
        # Remember the leading dimensions BEFORE flattening: once flattened,
        # the sequence length can no longer be read back from the tensor.
        leading_shape = x.shape[:-1]
        # Flatten all leading dimensions for the linear layer. reshape is used
        # so that non-contiguous inputs (e.g. after a transpose) also work.
        x = x.reshape(-1, x.size(-1))
        # Pass through the linear layer
        x = self.fc(x)
        # Reshape back to (*leading dimensions, vocab_size)
        x = x.view(*leading_shape, self.fc.out_features)
        return x

In [None]:
# Optimizer
import torch.optim as optim

# Define the optimizer
# NOTE(review): `model` and `learning_rate` are not defined anywhere in the
# visible source. Both must exist before this statement runs, otherwise it
# raises NameError. Presumably `model` is an nn.Module assembled from the
# layers defined above and `learning_rate` is a float -- confirm.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)