In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

## Embedding的计算过程

In [2]:
# Lookup table with 10 rows (vocabulary size) of 3-dimensional vectors;
# the bare last expression displays the randomly initialised weight matrix.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)
embedding.weight

Parameter containing:
tensor([[-2.0443,  0.3642,  1.2289],
        [ 0.5833,  1.1086,  1.5713],
        [-3.0481, -0.5672, -1.6671],
        [ 0.1426,  0.0943,  1.7851],
        [ 1.0537, -0.0071,  0.9708],
        [ 0.6440,  1.5811,  1.0673],
        [ 1.4431,  0.1367, -1.0715],
        [-2.2037, -0.4144, -0.1168],
        [ 0.3720,  0.6556, -0.5180],
        [-0.4696,  0.7806,  0.8387]], requires_grad=True)

In [4]:
# Deliberately build the index tensor as floating point so the next lookup
# shows how nn.Embedding reacts to a non-integer dtype.
input = torch.tensor(
    [[1, 2, 4, 5],
     [4, 3, 2, 9]],
    dtype=torch.float32,
)
print(input.dtype)

torch.float32


In [5]:
# Float indices are rejected by nn.Embedding. The failure is the point of this
# cell, so catch it and print the message: the notebook can then still be run
# top to bottom without stopping here.
try:
    embedding(input)
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [6]:
# torch.Tensor (capital T) is called here without a dtype; the printed dtype
# shows it produces float32 even though every literal is an integer, so this
# tensor is no more usable as an index than the explicit float one.
input = torch.Tensor([[1, 2, 4, 5], [4, 3, 2, 9]])
print(input.dtype)

torch.float32


In [7]:
# The tensor built with torch.Tensor is float32 as well, so the lookup fails
# the same way. Catch and print the error so a full top-to-bottom run does not
# abort at this cell.
try:
    embedding(input)
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [8]:
# 32-bit integer indices: an integer dtype is what the embedding lookup expects.
index_rows = [[1, 2, 4, 5], [4, 3, 2, 9]]
input = torch.tensor(index_rows, dtype=torch.int32)
print(input.dtype)

torch.int32


In [9]:
# int32 indices are accepted: each index picks the matching row of
# embedding.weight, so the (2, 4) index tensor yields a (2, 4, 3) result.
embedding(input)

tensor([[[ 0.5833,  1.1086,  1.5713],
         [-3.0481, -0.5672, -1.6671],
         [ 1.0537, -0.0071,  0.9708],
         [ 0.6440,  1.5811,  1.0673]],

        [[ 1.0537, -0.0071,  0.9708],
         [ 0.1426,  0.0943,  1.7851],
         [-3.0481, -0.5672, -1.6671],
         [-0.4696,  0.7806,  0.8387]]], grad_fn=<EmbeddingBackward0>)

In [10]:
# num_classes == vocab_size
# `input` is still int32 at this point and F.one_hot refuses it (a later cell
# succeeds with int64). Catch and print the error so the notebook keeps running;
# the result is printed once rather than being displayed a second time.
try:
    input_one_hot = F.one_hot(input, num_classes=10)
    print(input_one_hot)
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

RuntimeError: one_hot is only applicable to index tensor.

In [11]:
# 64-bit integer indices, the dtype F.one_hot requires.
index_rows = [[1, 2, 4, 5], [4, 3, 2, 9]]
input = torch.tensor(index_rows, dtype=torch.int64)
print(input.dtype)

torch.int64


In [14]:
# With int64 indices one_hot succeeds: every index becomes a length-10 vector
# containing a single 1 at the index position (num_classes matches the 10 rows
# of the embedding table).
input_one_hot = F.one_hot(input, num_classes=10)
print(input_one_hot)

tensor([[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]])


In [15]:
# Multiplying the one-hot rows by the weight matrix selects the same rows as
# the direct embedding lookup; the integer one-hot tensor is cast to float first
# so its dtype matches the weights.
input_one_hot.float() @ embedding.weight

tensor([[[ 0.5833,  1.1086,  1.5713],
         [-3.0481, -0.5672, -1.6671],
         [ 1.0537, -0.0071,  0.9708],
         [ 0.6440,  1.5811,  1.0673]],

        [[ 1.0537, -0.0071,  0.9708],
         [ 0.1426,  0.0943,  1.7851],
         [-3.0481, -0.5672, -1.6671],
         [-0.4696,  0.7806,  0.8387]]], grad_fn=<UnsafeViewBackward0>)

### max_norm

> When `max_norm` is not `None`, `Embedding`'s forward method will modify the `weight` tensor in-place. Since tensors needed for gradient computations cannot be modified in-place, performing a differentiable operation on `Embedding.weight` before calling `Embedding`'s forward method requires cloning `Embedding.weight` when `max_norm` is not `None`, for example `a = embedding.weight.clone() @ W.t()` followed by `b = embedding(idx) @ W.t()`. (Quoted from the PyTorch `nn.Embedding` documentation.)

In [16]:
# Baseline without max_norm: a 3-row, 5-dimensional table whose statistics and
# per-row L2 norms are recorded for comparison after a lookup.
embedding = nn.Embedding(3, 5)
print(embedding.weight.mean())
print(embedding.weight.std())
print(embedding.weight)
# torch.linalg.vector_norm is the supported replacement for the deprecated
# torch.norm; with dim=1 it returns one L2 norm per embedding row.
row_norms = torch.linalg.vector_norm(embedding.weight, dim=1)
row_norms

tensor(0.3520, grad_fn=<MeanBackward0>)
tensor(0.8388, grad_fn=<StdBackward0>)
Parameter containing:
tensor([[-0.0180, -0.7494, -0.6305,  1.0482,  0.6363],
        [-0.9310,  1.0206,  1.7331,  0.1618,  1.1863],
        [ 0.7504,  0.6446, -0.6428, -0.1489,  1.2188]], requires_grad=True)


tensor([1.5694, 2.5191, 1.7028], grad_fn=<LinalgVectorNormBackward0>)

In [17]:
# Sanity check: recompute the L2 norm of the first row by hand-copying its
# values. vector_norm flattens the (1, 5) input when no dim is given, which is
# what the deprecated torch.norm call did here.
torch.linalg.vector_norm(torch.tensor([[-0.0180, -0.7494, -0.6305, 1.0482, 0.6363]]))

tensor(1.5694)

In [19]:
# Look up every row of the 3-row table exactly once (indices 0, 1, 2).
inputs = torch.arange(3)
print(inputs.shape)
outputs = embedding(inputs)
outputs

torch.Size([3])


tensor([[-0.0180, -0.7494, -0.6305,  1.0482,  0.6363],
        [-0.9310,  1.0206,  1.7331,  0.1618,  1.1863],
        [ 0.7504,  0.6446, -0.6428, -0.1489,  1.2188]],
       grad_fn=<EmbeddingBackward0>)

In [20]:
# Without max_norm the lookup leaves the table untouched, so these per-row
# norms match the ones computed right after construction.
# torch.linalg.vector_norm replaces the deprecated torch.norm.
torch.linalg.vector_norm(embedding.weight, dim=1)

tensor([1.5694, 2.5191, 1.7028], grad_fn=<LinalgVectorNormBackward0>)

### max_norm = True (treated as max_norm = 1.0)

https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html

In [27]:
# max_norm is a float threshold on each row's L2 norm. A bare `True` would be
# coerced to 1.0, so the value is written out explicitly.
embedding = nn.Embedding(3, 5, max_norm=1.0)
print(embedding.weight.mean())
print(embedding.weight.std())
print(embedding.weight)
# Rows are not clipped at construction time, so these initial norms may exceed
# max_norm; the renormalisation is applied by the forward pass (lookup).
# torch.linalg.vector_norm replaces the deprecated torch.norm.
row_norms = torch.linalg.vector_norm(embedding.weight, dim=1)
row_norms

tensor(0.0184, grad_fn=<MeanBackward0>)
tensor(0.7801, grad_fn=<StdBackward0>)
Parameter containing:
tensor([[ 0.4448, -1.1583,  0.2906,  0.1439,  0.4156],
        [ 1.9763, -0.9256, -0.7320,  0.2111,  0.5909],
        [-0.7123, -0.2092, -0.5203,  0.0213,  0.4389]], requires_grad=True)


tensor([1.3481, 2.3858, 1.0074], grad_fn=<LinalgVectorNormBackward0>)

In [22]:
# Look up all three rows; with max_norm set, the forward pass rescales rows
# whose norm is above the threshold before returning them.
# NOTE(review): the execution counts show the construction cell above (In [27])
# was run after this one (In [22]), so the values displayed below presumably
# come from an earlier random initialisation - re-run in order to confirm.
inputs = torch.arange(3)
print(inputs.shape)
outputs = embedding(inputs)
outputs

torch.Size([3])


tensor([[-7.1905e-01, -3.7812e-01, -4.3993e-01,  1.5299e-01,  3.5078e-01],
        [ 1.5958e-01,  7.8875e-01, -3.5585e-01, -4.2038e-01,  2.2149e-01],
        [ 8.9258e-02,  7.0943e-04, -4.6283e-01,  8.6112e-01, -1.9052e-01]],
       grad_fn=<EmbeddingBackward0>)

In [23]:
# Per-row L2 norms of the looked-up vectors; the displayed result shows each
# one sitting at the max_norm limit of 1.
# torch.linalg.vector_norm replaces the deprecated torch.norm.
torch.linalg.vector_norm(outputs, dim=1)

tensor([1.0000, 1.0000, 1.0000], grad_fn=<LinalgVectorNormBackward0>)

In [24]:
# The table itself now holds the same rescaled rows that the lookup returned:
# with max_norm set, the forward pass rewrote embedding.weight in place.
embedding.weight

Parameter containing:
tensor([[-7.1905e-01, -3.7812e-01, -4.3993e-01,  1.5299e-01,  3.5078e-01],
        [ 1.5958e-01,  7.8875e-01, -3.5585e-01, -4.2038e-01,  2.2149e-01],
        [ 8.9258e-02,  7.0943e-04, -4.6283e-01,  8.6112e-01, -1.9052e-01]],
       requires_grad=True)

In [26]:
# Only inspects the attribute; nothing is called. The repr shows the method is
# bound to the Embedding class rather than to this instance, i.e. it is a
# classmethod.
# NOTE(review): looks like leftover exploration - consider replacing it with an
# actual from_pretrained example or removing the cell.
embedding.from_pretrained


<bound method Embedding.from_pretrained of <class 'torch.nn.modules.sparse.Embedding'>>