In [32]:
import numpy as np
import math
import torch
import torch.nn as nn
from torch.autograd import Variable

## Positional encoding

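Original sinusoidal positional encoding from the paper "Attention Is All You Need" (Vaswani et al., 2017):

$$PE_{(pos,\,2i)} = \sin\left(pos / 10000^{2i/d_{model}}\right), \qquad PE_{(pos,\,2i+1)} = \cos\left(pos / 10000^{2i/d_{model}}\right)$$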

In [98]:
# Sample input for the cells below: uniform random values in [0, 1) with shape
# (1, 6, 10), which positional_encoding unpacks as (batch, length, d_model).
x = torch.rand((1, 6, 10))
# Neither name is read by the cells visible in this section (positional_encoding
# takes its own d_model from the tensor shape); kept in case later cells use them.
d_model, pos = 512, 0
def positional_encoding(tensor):
    """Build the sinusoidal positional-encoding table matching ``tensor``.

    Even feature columns hold ``sin(pos / 10000^(2i / d_model))`` and odd
    columns hold ``cos(pos / 10000^(2i / d_model))``, where ``pos`` is the
    position along the sequence axis and ``2i`` is the even column index.

    Args:
        tensor: 3-D tensor of shape (batch, length, d_model). Only its shape
            and device are read; its values are never used.

    Returns:
        Tensor of shape (1, length, d_model), created on ``tensor``'s device,
        ready to be broadcast-added to ``tensor``.
    """
    _, length, d_model = tensor.shape
    device = tensor.device

    # Column vector of positions 0..length-1, shape (length, 1).
    position = torch.arange(0, length, device=device).unsqueeze(1)
    # One frequency per even column index; computed in log space.
    div_term = torch.exp(
        -math.log(10000.0) * torch.arange(0, d_model, 2, device=device).float() / d_model
    )
    angles = position * div_term  # (length, ceil(d_model / 2))

    pe = torch.zeros(length, d_model, device=device)
    pe[:, 0::2] = torch.sin(angles)
    # For odd d_model there is one fewer odd column than even columns, so the
    # last frequency has no cosine slot; drop it instead of raising a shape error.
    pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
    return pe.unsqueeze(0)

In [99]:
# Show the encoding table built for the sample tensor `x`; the result has
# shape (1, length, d_model) taken from x's own shape.
positional_encoding(tensor=x)

tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  1.5783e-01,  9.8747e-01,  2.5116e-02,
           9.9968e-01,  3.9811e-03,  9.9999e-01,  6.3096e-04,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  3.1170e-01,  9.5018e-01,  5.0217e-02,
           9.9874e-01,  7.9621e-03,  9.9997e-01,  1.2619e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  4.5775e-01,  8.8908e-01,  7.5285e-02,
           9.9716e-01,  1.1943e-02,  9.9993e-01,  1.8929e-03,  1.0000e+00],
         [-7.5680e-01, -6.5364e-01,  5.9234e-01,  8.0569e-01,  1.0031e-01,
           9.9496e-01,  1.5924e-02,  9.9987e-01,  2.5238e-03,  1.0000e+00],
         [-9.5892e-01,  2.8366e-01,  7.1207e-01,  7.0211e-01,  1.2526e-01,
           9.9212e-01,  1.9904e-02,  9.9980e-01,  3.1548e-03,  1.0000e+00]]])

In [100]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    The table is computed once for ``max_len`` positions and stored as the
    buffer ``pe`` with shape (1, max_len, d_model), so it follows the module
    across ``.to(device)`` calls and is not a trainable parameter.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        dropout: Probability for the ``self.dropout`` layer. The layer is
            constructed but ``forward`` currently does not apply it.
        max_len: Number of positions to precompute. Inputs whose second
            dimension exceeds this are not supported.
    """
    def __init__(self, d_model, dropout=0.3, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        position = torch.arange(0., max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns;
        # slice the angles so the assignment does not fail on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose second dimension is the sequence length and whose
                last dimension is ``d_model``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The buffer never requires grad, so no Variable wrapper is needed.
        # NOTE(review): self.dropout is not applied here (the dropout call was
        # disabled in this notebook), so the result is the plain sum; return
        # self.dropout(...) instead if regularisation is wanted.
        return x + self.pe[:, :x.size(1)]

In [101]:
# Encoder sized for the 10-wide last dimension of `x`; dropout and max_len
# keep their defaults.
a = PositionalEncoding(d_model=10)

In [102]:
# Call the module itself rather than .forward(): nn.Module.__call__ dispatches
# to forward and also runs any registered hooks.
a(x)

tensor([[[ 0.6268,  1.5623,  0.8246,  1.6400,  0.8334,  1.2400,  0.3692,
           1.4134,  0.0672,  1.5023],
         [ 1.1302,  1.0054,  0.6584,  1.3469,  0.4714,  1.4054,  0.8990,
           1.5506,  0.6671,  1.8118],
         [ 0.9974, -0.0263,  1.1654,  1.8849,  0.9618,  1.0300,  0.0296,
           1.2786,  0.7354,  1.3790],
         [ 0.7998, -0.6143,  1.1517,  1.2161,  0.2881,  1.4628,  0.5263,
           1.3204,  0.8208,  1.7594],
         [ 0.1677, -0.0887,  0.7773,  1.1589,  0.6773,  1.7156,  0.9473,
           1.7304,  0.5003,  1.2489],
         [-0.1962,  1.1408,  0.9834,  1.3166,  0.9852,  1.7799,  0.6950,
           1.2468,  0.2282,  1.2666]]])

## Label smoothing

In [None]:
# TODO: label smoothing - lower the target probability of the correct label (1) using a smoothing value of 0.1.

## Attention 