In [1]:
import torch
import torch.nn as nn
import math

In [2]:
# Wrap a tensor of three ones in nn.Parameter; the displayed repr shows that
# the resulting Parameter has requires_grad=True.
nn.Parameter(torch.ones(3))

Parameter containing:
tensor([1., 1., 1.], requires_grad=True)

In [3]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Every feature vector is normalized to zero mean and unit variance and then
    transformed as ``alpha * x_hat + bias``. The statistics follow the same
    convention as ``torch.nn.LayerNorm``: the biased (population) variance is
    used and ``eps`` is added inside the square root.

    Args:
        features: Size of the last dimension of the input.
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, features: int, eps: float = 10**-6):
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features))  # learnable scale, starts at 1
        self.bias = nn.Parameter(torch.zeros(features))  # learnable shift, starts at 0

    def forward(self, x):
        """Normalize ``x`` along its last dimension.

        Args:
            x: Tensor whose last dimension has size ``features``,
                e.g. (batch, seq_len, features).

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)  # (..., 1)
        centered = x - mean
        # Biased variance (divide by n). The unbiased estimator used by
        # Tensor.std() divides by n - 1 and produces NaN when features == 1.
        var = centered.pow(2).mean(dim=-1, keepdim=True)  # (..., 1)
        # eps goes inside the square root so the denominator never reaches zero.
        return self.alpha * centered / torch.sqrt(var + self.eps) + self.bias

# Demo: build a LayerNormalization module for 512-dimensional token vectors.
layer_norm = LayerNormalization(512)

# Random input of shape (batch_size, seq_len, hidden_size): a batch of
# 10 sequences, each 20 tokens long, with 512 features per token.
x = torch.randn((10, 20, 512))

# Normalize every token vector; the shape is left unchanged.
output = layer_norm(x)

print(output.shape)  # Should print: torch.Size([10, 20, 512])

torch.Size([10, 20, 512])


In [4]:
import torch
from torch import nn
import math

class InputEmbedding(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(d_model).

    Args:
        d_model: Dimensionality of each embedding vector.
        vocab_size: Number of entries in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Embed token ids and multiply the result by sqrt(d_model).

        Shapes: (batch, seq_len) -> (batch, seq_len, d_model)
        """
        scale = math.sqrt(self.d_model)
        token_vectors = self.embedding(x)
        return token_vectors * scale

# Demo: an embedding table for a 10000-token vocabulary with 512-dimensional vectors.
input_embedding = InputEmbedding(vocab_size=10000, d_model=512)

# Random token ids of shape (batch_size, seq_len), drawn from [0, 10000).
x = torch.randint(low=0, high=10000, size=(10, 20))

# Look up (and scale) the embedding of every token id.
output = input_embedding(x)

print(output.shape)  # Should print: torch.Size([10, 20, 512])

torch.Size([10, 20, 512])


In [6]:
# NOTE(review): `output` is presumably the InputEmbedding result from the cell
# above; confirm it has not been reassigned by a cell that was run in between.
output[0, 0, :10]  # Show the first 10 values of the first token of the first sequence

tensor([ -2.8158,  -2.9024,  15.4488,   4.5374,  16.8493,  47.5993,  34.5922,
        -39.0154,  28.3406,   9.8365], grad_fn=<SliceBackward0>)

In [7]:
import torch
from torch import nn

# Embedding table with 10 rows (one per token id, vocab_size=10), where each
# row is a 5-dimensional vector (embedding_dim=5).
embedding = nn.Embedding(num_embeddings=10, embedding_dim=5)

# Batch of 2 sequences with 3 token ids each; every id lies between 0 and 9.
x = torch.tensor(
    [
        [1, 2, 3],
        [4, 5, 6],
    ]
)

# The lookup replaces each token id in x with its embedding vector.
y = embedding(x)

print(y)

tensor([[[-0.7645, -0.3030, -0.2605, -1.3264, -0.7835],
         [-1.4815, -1.0590,  0.5636,  1.1329,  0.6071],
         [ 1.1266, -0.6958, -0.4362,  0.1131,  0.1246]],

        [[ 0.6811, -0.5672,  0.0780,  1.5076,  0.4472],
         [ 2.1869,  0.1602, -0.1720, -0.5146,  1.3993],
         [-1.0243,  0.1687, -0.1505,  0.6010,  0.2788]]],
       grad_fn=<EmbeddingBackward0>)


In [8]:
# y comes from embedding(x) in the previous cell: the (2, 3) id tensor gains a
# trailing embedding dimension of size 5.
print(y.shape)

torch.Size([2, 3, 5])
