In [19]:
import torch
import torch.nn as nn
import math
from typing import Optional
import tiktoken

# Run on the GPU when CUDA is usable in this environment, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Shared tokenizer: tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

In [20]:
class InputEmbedding(nn.Module):
    """Tokenize raw text and map it to scaled token embeddings.

    The module owns a tokenizer and an ``nn.Embedding`` table. Calling it on a
    string encodes the string to token ids, looks the ids up in the table and
    multiplies the result by ``sqrt(d_model)``.
    """

    def __init__(
        self,
        vocab_size: int,
        d_model: int,
        device: Optional[torch.device] = device,
        tokenizer=tokenizer,
    ):
        """
        Args:
            vocab_size: number of rows in the embedding table. Every token id
                produced by ``tokenizer`` must be smaller than this value.
            d_model: size of each embedding vector.
            device: device the embedding table is created on. Defaults to the
                module-level ``device``.
            tokenizer: object whose ``encode(str)`` is expected to return a
                list of integer token ids. Defaults to the module-level
                ``tokenizer``.
        """
        super().__init__()
        # Construction-time device only; it is not updated if the module is
        # later moved with ``.to(...)``.
        self.device = device
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=d_model, device=device
        )
        self.d_model = d_model
        self.tokenizer = tokenizer

    def forward(self, input_text: str) -> torch.Tensor:
        """
        Args:
            input_text: str, the input text
        Returns:
            tensor of shape (1, num_tokens, d_model), the scaled input
            embeddings. An input that encodes to zero tokens yields a
            (1, 0, d_model) tensor.

        This is consistent with the original paper which uses a square root of the
        dimension of the model as the scaling factor for the input embeddings (page 5,
        section 3.4).
        """
        token_ids = self.tokenizer.encode(input_text)
        # dtype is explicit because an empty token list would otherwise be
        # inferred as float32, which nn.Embedding rejects as an index tensor.
        # The device is read from the weights so the lookup still works after
        # the module has been moved with ``.to(...)`` / ``.cuda()``.
        batch_ids = torch.tensor(
            [token_ids], dtype=torch.long, device=self.embedding.weight.device
        )
        return self.embedding(batch_ids) * math.sqrt(self.d_model)

In [21]:
# Size the embedding table from the tokenizer itself rather than a hard-coded
# 50257, so the table always covers every token id the tokenizer can emit.
embedding = InputEmbedding(vocab_size=tokenizer.n_vocab, d_model=512, device=device)
embedding(input_text="Hello, world!")

tensor([[[ 28.6017, -12.7955,  15.7570,  ..., -13.6463, -14.2052,  39.2336],
         [  8.8600,  25.1734,  -1.8054,  ...,   7.3967,  -5.4317,   2.3652],
         [  7.2131,   0.7212,  21.4795,  ..., -13.5974,  -0.4842, -16.5771],
         [  2.2655,  11.6492,  -1.8845,  ..., -57.5402,  29.0771,  34.8831]]],
       grad_fn=<MulBackward0>)