#### neftune
  - 核心思想是在 embedding 之后加上一个随机扰动，再传入 transformer block
  - 和 dropout 一样，只用在训练阶段，推理阶段不使用

In [1]:
import torch
from torch import nn

vocab_size = 20; embedding_dim = 128
my_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

my_embedding

Embedding(20, 128)

In [2]:
random_input = torch.randint(low=0, high=20, size=(2, 5))  # bs=2,seq_len=5

In [3]:
origin_embedding_output = my_embedding(random_input)
origin_embedding_output.shape

torch.Size([2, 5, 128])

In [None]:
def neftune_post_forward_hook(module, input, output):
    """
    Add NEFTune uniform noise to an embedding layer's output during training.

    Intended to be registered as a forward hook on a `torch.nn.Embedding`
    layer. This is slightly adapted from the original source code at
    https://github.com/neelsjain/NEFTune. Usage:
    ```python
    model = ...
    model.embed_tokens.neftune_noise_alpha = 0.1
    model.embed_tokens.register_forward_hook(neftune_post_forward_hook)
    ```
    Args:
        module (`torch.nn.Module`):
            The embedding module where the hook is attached. Note that you need to set `module.neftune_noise_alpha` to
            the desired noise alpha value.
        input (`torch.Tensor`):
            The input to the module (unused; required by the hook signature).
        output (`torch.Tensor`):
            The output of the module (i.e. the embeddings). Dimensions 1 and 2 are read as
            (seq_len, embedding_dim), so the tensor must have at least three dimensions.

    Returns:
        `torch.Tensor`: `output` unchanged when the module is in eval mode (or the output is empty);
        otherwise a new tensor equal to `output` plus noise drawn uniformly from
        [-mag_norm, mag_norm], where mag_norm = neftune_noise_alpha / sqrt(seq_len * embedding_dim).
    """
    # NEFTune is a training-time regularizer (like dropout): inference must see
    # the clean embeddings. An empty output has nothing to perturb and would
    # otherwise lead to a zero divisor below.
    if not module.training or output.numel() == 0:
        return output

    # Number of elements per sample: seq_len * embedding_dim.
    dims = output.size(1) * output.size(2)
    mag_norm = module.neftune_noise_alpha / dims ** 0.5
    # Uniform noise in [-mag_norm, mag_norm], same shape/dtype/device as output.
    noise = torch.zeros_like(output).uniform_(-mag_norm, mag_norm)
    return output + noise

# Noise scale that the hook reads from the module as `module.neftune_noise_alpha`.
my_embedding.neftune_noise_alpha = 0.2
# The hook is attached to my_embedding: it runs after each forward pass of that module.
# The returned handle is kept only so that the hook can be removed later.
handle_neftune = my_embedding.register_forward_hook(hook=neftune_post_forward_hook)



In [None]:
neftune_embedding_output = my_embedding(random_input)
neftune_embedding_output

In [6]:
neftune_embedding_output.shape

torch.Size([2, 5, 128])

In [8]:
torch.allclose(input=neftune_embedding_output, other=origin_embedding_output)

False

In [None]:
# Reproduce the hook's noise bound by hand, following the same formula:
#   mag_norm = neftune_noise_alpha / sqrt(seq_len * embedding_dim)
# with neftune_noise_alpha = 0.2, seq_len = 5, embedding_dim = 128.
0.2 / torch.sqrt(torch.tensor(data=5.0 * 128.0))


tensor(0.0079)

In [None]:
# Isolate the injected noise: hook-perturbed output minus the unperturbed embeddings.
test_noise = neftune_embedding_output - origin_embedding_output
torch.max(test_noise), torch.min(test_noise)
# The max and min of the difference land at about +mag_norm and -mag_norm (roughly +/-0.0079 here).

(tensor(0.0079, grad_fn=<MaxBackward1>),
 tensor(-0.0079, grad_fn=<MinBackward1>))

In [11]:
# Remove the forward hook so my_embedding returns plain embeddings again,
# then drop the handle, which is no longer needed.
handle_neftune.remove()
del handle_neftune

In [13]:
neftune_remove_embedding_output = my_embedding(random_input)
neftune_remove_embedding_output.shape

torch.Size([2, 5, 128])

In [14]:
torch.allclose(neftune_remove_embedding_output, origin_embedding_output)

True