# Tests & trials

In [4]:
import torch
import torch.nn as nn

## Understanding functions

### nn.Linear()

In [5]:
m = nn.Linear(20, 30) # torch.nn.Linear(in_features, out_features)
input = torch.randn(128, 20) # 128 vecs, each of length 20
output = m(input)
print("input shape:", input.shape)
print("output shape:", output.size())

input shape: torch.Size([128, 20])
output shape: torch.Size([128, 30])


### nn.Embedding()

* num_embeddings (int) – size of the dictionary of embeddings
* embedding_dim (int) – the size of each embedding vector

In [6]:
# an Embedding module containing 10 tensors of size 3
embedding = nn.Embedding(10, 3) # torch.nn.Embedding(num_embeddings, embedding_dim, ...)
# a batch of 2 samples of 6 indices each
input = torch.LongTensor([[1,2,4,5,7,7],[4,3,2,9,2,2]])
output = embedding(input)
print("input shape:", input.shape)
print("output shape:", output.size())
print("input:", input)
print("output:", output)

input shape: torch.Size([2, 6])
output shape: torch.Size([2, 6, 3])
input: tensor([[1, 2, 4, 5, 7, 7],
        [4, 3, 2, 9, 2, 2]])
output: tensor([[[ 0.2720, -0.6043,  0.1841],
         [-0.9251,  1.0612, -0.0828],
         [-0.3585,  0.8105, -0.3548],
         [ 0.4088, -0.8186,  1.6529],
         [ 0.4062,  1.9184,  1.1697],
         [ 0.4062,  1.9184,  1.1697]],

        [[-0.3585,  0.8105, -0.3548],
         [-0.6742, -0.0118,  0.5788],
         [-0.9251,  1.0612, -0.0828],
         [-1.4209,  0.0125, -0.7004],
         [-0.9251,  1.0612, -0.0828],
         [-0.9251,  1.0612, -0.0828]]], grad_fn=<EmbeddingBackward>)


In [7]:
# example with padding_idx
embedding = nn.Embedding(10, 3, padding_idx=0)
input = torch.LongTensor([[0,2,0,5]])
print("input:", input)
print("output:", embedding(input))

input: tensor([[0, 2, 0, 5]])
output: tensor([[[ 0.0000,  0.0000,  0.0000],
         [-0.4685, -1.4625,  1.0954],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.6073, -0.6267,  0.5618]]], grad_fn=<EmbeddingBackward>)


### lambda

In [10]:
x = lambda a, b : a * b
print(x(5, 6))

30


### assert

In [13]:
batch_first = False
src_key_padding_mask = None

In [14]:
assert not batch_first # AssertionError when batch_first=True: not True = False 
assert src_key_padding_mask is None

### argparse.ArgumentParser()

create a separate new file hello.py in folder my_files/


[link: TowardsDataScience](https://towardsdatascience.com/a-simple-guide-to-command-line-arguments-with-argparse-6824c30ab1c3)

In [15]:
# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument('--name', type=str, required=True)
# args = parser.parse_args()
# print('Hello,', args.name)

in command line:

1. cd my_files/
2. python hello.py --name Ugne


output:

Hello, Ugne

so I kind of tell python to execute file hello.py and also give the required argument --name: "Ugne"

it executes the file so it checks if I gave the necessary argument and then does what the file tells: print('Hello,', args.name)

### ._ _ dict _ _.

In [16]:
def func():
    pass

func.temp = 1

print(func.__dict__)

{'temp': 1}


## Understanding train.py

In [17]:
# train(prior, 
#     criterion, 
#     encoder_generator, 
#     y_encoder_generator=y_encoder_generator, 
#     pos_encoder_generator=pos_encoder_generator, 
#     **args.__dict__)

In [18]:
%%script echo skipping

def train(priordataloader_class, 
          criterion, 
          encoder_generator, 
          emsize=200, nhid=200, nlayers=6, nhead=2, dropout=0.0,
          epochs=10, steps_per_epoch=100, batch_size=200, bptt=10, lr=None, weight_decay=0.0, warmup_epochs=10, input_normalization=False,
          y_encoder_generator=None, pos_encoder_generator=None, decoder=None, extra_prior_kwargs_dict={}, scheduler=get_cosine_schedule_with_warmup,
          load_weights_from_this_state_dict=None, validation_period=10, single_eval_pos_gen=None, bptt_extra_samples=None, gpu_device='cuda:0',
          aggregate_k_gradients=1, verbose=True, style_encoder_generator=None, epoch_callback=None,
          initializer=None, initialize_with_model=None, train_mixed_precision=False, efficient_eval_masking=True, **model_extra_args
          ):

    def eval_pos_seq_len_sampler():
        single_eval_pos = single_eval_pos_gen()
        if bptt_extra_samples:
            return single_eval_pos, single_eval_pos + bptt_extra_samples
        else:
            return single_eval_pos, bptt

    # haven't found this function priordataloader_class() in other docs - where is it defined?        
    dl = priordataloader_class(num_steps=steps_per_epoch, 
                               batch_size=batch_size, 
                               eval_pos_seq_len_sampler=eval_pos_seq_len_sampler, 
                               seq_len_maximum=bptt+(bptt_extra_samples if bptt_extra_samples else 0), 
                               device=device, 
                               **extra_prior_kwargs_dict)

    encoder = encoder_generator(dl.num_features, emsize)
    #style_def = dl.get_test_batch()[0][0] # the style in batch of the form ((style, x, y), target, single_eval_pos)
    style_def = None
    #print(f'Style definition of first 3 examples: {style_def[:3] if style_def is not None else None}')
    style_encoder = style_encoder_generator(style_def.shape[1], emsize) if (style_def is not None) else None
    if isinstance(criterion, nn.GaussianNLLLoss):
        n_out = 2
    elif isinstance(criterion, nn.CrossEntropyLoss):
        n_out = criterion.weight.shape[0]
    else:
        n_out = 1

    model = TransformerModel(encoder, n_out, emsize, nhead, nhid, nlayers, dropout, style_encoder=style_encoder,
                             y_encoder=y_encoder_generator(1, emsize), input_normalization=input_normalization,
                             pos_encoder=(pos_encoder_generator or positional_encodings.NoPositionalEncoding)(emsize, bptt*2),
                             decoder=decoder, init_method=initializer, efficient_eval_masking=efficient_eval_masking, **model_extra_args
                             )

skipping


In [22]:
%%script echo skipping

style_encoder_generator = None

style_def = dl.get_test_batch()[0][0] # the style in batch of the form ((style, x, y), target, single_eval_pos)
style_def = None
#print(f'Style definition of first 3 examples: {style_def[:3] if style_def is not None else None}')
style_encoder = style_encoder_generator(style_def.shape[1], emsize) if (style_def is not None) else None

skipping


## encoders

1) they import encoders so the content of encoders.py file

In [24]:
%%script echo skipping
import tabpfn.encoders as encoders

skipping


in file encoders.py they have this:

In [25]:
%%script echo skipping

Linear = nn.Linear

class Linear(nn.Linear):
    def __init__(self, num_features, emsize, replace_nan_by_zero=False):
        super().__init__(num_features, emsize)
        self.num_features = num_features
        self.emsize = emsize
        self.replace_nan_by_zero = replace_nan_by_zero

    def forward(self, x):
        if self.replace_nan_by_zero:
            x = torch.nan_to_num(x, nan=0.0)
        return super().forward(x)

    def __setstate__(self, state):
        super().__setstate__(state)
        self.__dict__.setdefault('replace_nan_by_zero', True)

skipping


In [26]:
%%script echo skipping

MLP = lambda num_features, emsize: nn.Sequential(nn.Linear(num_features+1,emsize*2),
                                                 nn.ReLU(),
                                                 nn.Linear(emsize*2,emsize))

skipping


In [27]:
%%script echo skipping

class _PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model
        self.device_test_tensor = nn.Parameter(torch.tensor(1.))

    def forward(self, x):# T x B x num_features
        assert self.d_model % x.shape[-1]*2 == 0
        d_per_feature = self.d_model // x.shape[-1]
        pe = torch.zeros(*x.shape, d_per_feature, device=self.device_test_tensor.device)
        #position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        interval_size = 10
        div_term = (1./interval_size) * 2*math.pi*torch.exp(torch.arange(0, d_per_feature, 2, device=self.device_test_tensor.device).float()*math.log(math.sqrt(2)))
        #print(div_term/2/math.pi)
        pe[..., 0::2] = torch.sin(x.unsqueeze(-1) * div_term)
        pe[..., 1::2] = torch.cos(x.unsqueeze(-1) * div_term)
        return self.dropout(pe).view(x.shape[0],x.shape[1],self.d_model)


Positional = lambda _, emsize: _PositionalEncoding(d_model=emsize)

skipping


2) with this we're able to set the encoder and y_encoder to specific values (done through command line as I understand?):

* --encoder 'linear' / 'mlp' / 'positional'
* --y_encoder 'linear' / 'mlp' / 'positional'

In [28]:
%%script echo skipping

def _parse_args(config_parser, parser):
    # Do we have a config file to parse?
    args_config, remaining = config_parser.parse_known_args()
    if args_config.config:
        with open(args_config.config, 'r') as f:
            cfg = yaml.safe_load(f)
            parser.set_defaults(**cfg)

    # The main arg parser parses the rest of the args, the usual
    # defaults will have been overridden if config file specified.
    args = parser.parse_args(remaining)

    # Cache the args as a text string to save them in the output dir later
    args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)
    return args, args_text

if __name__ == '__main__':
    config_parser = argparse.ArgumentParser(description='Only used as a first parser for the config file path.')
    config_parser.add_argument('--config')
    parser = argparse.ArgumentParser()
    parser.add_argument('--encoder', default='linear', type=str, help='Specify depending on the prior.')
    parser.add_argument('--y_encoder', default='linear', type=str, help='Specify depending on the prior. You should specify this if you do not fuse x and y.')
    parser.add_argument('--pos_encoder', default='none', type=str, help='Specify depending on the prior.')

    args, _ = _parse_args(config_parser, parser)

    if args.nhid is None:
        args.nhid = 2*args.emsize

    encoder = args.__dict__.pop('encoder') # sets encoder to the value of dictionary element 'encoder'
    y_encoder = args.__dict__.pop('y_encoder') # sets y_encoder to the value of dictionary element 'y_encoder'

    
    def get_encoder_generator(encoder):
        if encoder == 'linear':
            encoder_generator = encoders.Linear
        elif encoder == 'mlp':
            encoder_generator = encoders.MLP
        elif encoder == 'positional':
            encoder_generator = encoders.Positional
        else:
            raise NotImplementedError(f'A {encoder} encoder is not valid.')
        return encoder_generator

    encoder_generator = get_encoder_generator(encoder)
    y_encoder_generator = get_encoder_generator(y_encoder)

skipping


# Understanding encoders.py

### class Linear

* encodes (linearly) all datapoint vectors into new vectors of length = emsize
* replaces NaN by 0

In [29]:
class Linear(nn.Linear):

    def __init__(self, num_features, emsize, replace_nan_by_zero=False):
        super().__init__(num_features, emsize)
        self.num_features = num_features
        self.emsize = emsize
        self.replace_nan_by_zero = replace_nan_by_zero

    def forward(self, x):
        if self.replace_nan_by_zero:
            x = torch.nan_to_num(x, nan=0.0)
        return super().forward(x)

In [31]:
torch.manual_seed(1)
num_features = 20 # no. of features - no default, depends on the prior, see train.py
em_size = 512 # default, see train.py
batch = 1000 # default, see train.py
x = torch.randn(batch, num_features) # 128 vecs, each of length 20 (so each datapoint has 20 features)

encoder_ln = Linear(num_features, em_size)
output_ln = encoder_ln.forward(x)

print("x (input) shape:", x.shape)
print("output shape:", output_ln.size())
print(output_ln)

x (input) shape: torch.Size([1000, 20])
output shape: torch.Size([1000, 512])
tensor([[ 0.0499,  0.3734, -0.9258,  ...,  0.1520,  0.6031,  0.5477],
        [-0.4581, -0.3672,  0.7149,  ...,  0.6977, -0.3434, -0.8434],
        [ 0.1711, -0.6576, -0.3224,  ..., -0.9545, -0.2267,  0.0727],
        ...,
        [-0.8495,  0.9168, -0.2270,  ...,  0.1044,  0.6164,  1.6906],
        [-0.0899, -0.5649,  1.0796,  ...,  0.4228, -0.5281, -0.0314],
        [ 0.5822,  0.6002, -0.1890,  ..., -0.3911, -0.4425,  0.2561]],
       grad_fn=<AddmmBackward>)


### class StyleEncoder

encodes numerical features (all features at once)

* takes a batch of datapoints and looks at all features - so looks at datapoint vectors of length = num_hyperparameters
* encodes (linearly) all datapoint vectors into new vectors of length = em_size

essentially does the same as class Linear which additionally encodes NaN to 0

In [32]:
class StyleEncoder(nn.Module):
    def __init__(self, num_hyperparameters, em_size):
        super().__init__()
        self.em_size = em_size
        self.embedding = nn.Linear(num_hyperparameters, self.em_size)

    def forward(self, hyperparameters):  
        return self.embedding(hyperparameters)

In [33]:
torch.manual_seed(1)
num_hyperparameters = 20 # no. of features - no default, depends on the prior, see train.py
em_size = 512 # default, see train.py
batch = 1000 # default, see train.py
hyperparameters = torch.randn(batch, num_hyperparameters) 

encoder_st = StyleEncoder(num_hyperparameters, em_size)
output_st = encoder_st.forward(hyperparameters)

print("x (input) shape:", hyperparameters.shape)
print("output shape:", output_st.size())
print(output_st)

x (input) shape: torch.Size([1000, 20])
output shape: torch.Size([1000, 512])
tensor([[ 0.0499,  0.3734, -0.9258,  ...,  0.1520,  0.6031,  0.5477],
        [-0.4581, -0.3672,  0.7149,  ...,  0.6977, -0.3434, -0.8434],
        [ 0.1711, -0.6576, -0.3224,  ..., -0.9545, -0.2267,  0.0727],
        ...,
        [-0.8495,  0.9168, -0.2270,  ...,  0.1044,  0.6164,  1.6906],
        [-0.0899, -0.5649,  1.0796,  ...,  0.4228, -0.5281, -0.0314],
        [ 0.5822,  0.6002, -0.1890,  ..., -0.3911, -0.4425,  0.2561]],
       grad_fn=<AddmmBackward>)


### class StyleEmbEncoder

encodes categorical features (one feature at a time)

* takes a batch of datapoints and looks at one feature (assert num_hyperparameters == 1)
* encodes all possible values of this feature into vectors of length = em_size
* note: max number of distinct values that one feature can get is set to num_embeddings=100 (as I understand)

pvz: feature f_1: clothing_size={S,M,L}

so clothing_size has 3 classes and each class will get its own unique vector of length = em_size

we'll have 3 distinct vecs: one for S, one for M and one for L

In [34]:
class StyleEmbEncoder(nn.Module):
    def __init__(self, num_hyperparameters, em_size, num_embeddings=100):
        super().__init__()
        assert num_hyperparameters == 1
        self.em_size = em_size
        self.embedding = nn.Embedding(num_embeddings, self.em_size)

    def forward(self, hyperparameters): 
        return self.embedding(hyperparameters.squeeze(1))

In [35]:
torch.randint(0, 10, (8, 1))


tensor([[5],
        [0],
        [5],
        [9],
        [2],
        [3],
        [0],
        [3]])

In [36]:
torch.manual_seed(1)
num_embeddings = 100 # given in the initialization of the class
num_hyperparameters_1 = 1 # no. of features - no default, depends on the prior, see train.py
em_size = 512 # default, see train.py
batch = 1000 # default, see train.py
hyperparameters = torch.randint(0, num_embeddings, (batch, num_hyperparameters_1))

encoder_stemb = StyleEmbEncoder(num_hyperparameters_1, em_size)
output_stemb = encoder_stemb.forward(hyperparameters)

print("x (input) shape:", hyperparameters.shape)
print("output shape:", output_stemb.size())
print(output_stemb)

x (input) shape: torch.Size([1000, 1])
output shape: torch.Size([1000, 512])
tensor([[-1.6819,  2.1511, -0.3536,  ..., -0.5102,  1.3916, -0.1969],
        [ 0.4755, -0.1734,  0.2474,  ..., -0.7203,  0.9995, -1.2702],
        [ 0.2038,  0.9803, -1.2191,  ..., -0.7268,  1.9840, -1.0883],
        ...,
        [-2.3043,  0.1429, -1.0265,  ...,  0.7998,  0.8934, -0.7810],
        [-1.2654,  0.0638,  0.6347,  ...,  0.0913, -0.4807,  0.1304],
        [ 0.0595, -1.0513, -1.8355,  ..., -0.0785,  1.0079, -0.2281]],
       grad_fn=<EmbeddingBackward>)


ok, so it works like this:

StyleEmbEncoder():

* num_hyperparameters = 1 - so we take one feature (say datapoint has 5 features, so we take first feature, which is e.g. gender)
* this hyperparameter

# Understanding transformer.py

There's a class `TransformerModel()`

In [37]:
%%script echo skipping

class TransformerModel(nn.Module):
    def __init__(self, encoder, n_out, ninp, nhead, nhid, nlayers, dropout=0.0, style_encoder=None, y_encoder=None,
                 pos_encoder=None, decoder=None, input_normalization=False, init_method=None, pre_norm=False,
                 activation='gelu', recompute_attn=False, num_global_att_tokens=0, full_attention=False,
                 all_layers_same_init=False, efficient_eval_masking=True):
        super().__init__()
        self.model_type = 'Transformer'
        encoder_layer_creator = lambda: TransformerEncoderLayer(ninp, nhead, nhid, dropout, activation=activation,
                                                                pre_norm=pre_norm, recompute_attn=recompute_attn)
        self.transformer_encoder = TransformerEncoder(encoder_layer_creator(), nlayers)\
            if all_layers_same_init else TransformerEncoderDiffInit(encoder_layer_creator, nlayers)
        self.encoder = encoder
        self.y_encoder = y_encoder
        self.pos_encoder = pos_encoder

skipping


within this class there's a function `forward()` which uses the argument `encoder` which is an argument for this class

In [38]:
%%script echo skipping

def forward(self, src, src_mask=None, single_eval_pos=None):
        assert isinstance(src, tuple), 'inputs (src) have to be given as (x,y) or (style,x,y) tuple'

        if len(src) == 2: # (x,y) and no style
            src = (None,) + src

        style_src, x_src, y_src = src
        x_src = self.encoder(x_src)
        y_src = self.y_encoder(y_src.unsqueeze(-1) if len(y_src.shape) < len(x_src.shape) else y_src)
        style_src = self.style_encoder(style_src).unsqueeze(0) if self.style_encoder else \
            torch.tensor([], device=x_src.device)
        global_src = torch.tensor([], device=x_src.device) if self.global_att_embeddings is None else \
            self.global_att_embeddings.weight.unsqueeze(1).repeat(1, x_src.shape[1], 1)

skipping


## Understanding layer.py

In [40]:
torch.manual_seed(1)
data = torch.rand(10, 32, 512)
query, key, value = data, data, data 

embed_dim = 512
num_heads = 4

In [41]:
multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
attn_output, attn_output_weights = multihead_attn(query, key, value)

### class TransformerEncoderLayer()

#### full

In [42]:
from functools import partial

from torch import nn
import torch
from torch.nn.modules.transformer import _get_activation_fn, Module, Tensor, Optional, MultiheadAttention, Linear, Dropout, LayerNorm
from torch.utils.checkpoint import checkpoint

# added by Ugne (before it showed error: F is not defined)
from torch.nn import functional as F

# full
class TransformerEncoderLayer(Module):
    r"""TransformerEncoderLayer is made up of self-attn and feedforward network.
    This standard encoder layer is based on the paper "Attention Is All You Need".
    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
    in a different way during application.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).
        activation: the activation function of intermediate layer, relu or gelu (default=relu).
        layer_norm_eps: the eps value in layer normalization components (default=1e-5).
        batch_first: If ``True``, then the input and output tensors are provided
            as (batch, seq, feature). Default: ``False``.

    Examples::
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
        >>> src = torch.rand(10, 32, 512)
        >>> out = encoder_layer(src)

    Alternatively, when ``batch_first`` is ``True``:
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
        >>> src = torch.rand(32, 10, 512)
        >>> out = encoder_layer(src)
    """
    __constants__ = ['batch_first']

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
                 layer_norm_eps=1e-5, batch_first=False, pre_norm=False,
                 device=None, dtype=None, recompute_attn=False) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
                                            **factory_kwargs)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model, **factory_kwargs)

        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.pre_norm = pre_norm
        self.recompute_attn = recompute_attn

        self.activation = _get_activation_fn(activation)

    def __setstate__(self, state): # not sure what it does
        if 'activation' not in state:
            state['activation'] = F.relu
        super().__setstate__(state)

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        r"""Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).

        Shape:
            see the docs in Transformer class.
        """
        if self.pre_norm: # NOT RUN: pre_norm=False by default and is not changed in model=TransformerModel() in train.py
            src_ = self.norm1(src)
            #print("not run")
        else: # this gets RUN
            src_ = src
        if isinstance(src_mask, tuple): # NOT RUN - AssertionError 
            # global attention setup
            assert not self.self_attn.batch_first # AssertionError when batch_first=True: not True = False  --> so batch_first must be False (and it is - default False is not changed in model=TransformerModel() in train.py)
            assert src_key_padding_mask is None # AssertionError when src_key_padding_mask=None --> so src_key_padding_mask must be not None (but it is None - default None is not changed)
            
            # I think this is not run as we get AssertionError: default src_key_padding_mask=None is not changed
            # so we actually do what's in else (elif also gets AssertionError fot the same reason)
            
            global_src_mask, trainset_src_mask, valset_src_mask = src_mask

            num_global_tokens = global_src_mask.shape[0]
            num_train_tokens = trainset_src_mask.shape[0]

            global_tokens_src = src_[:num_global_tokens]
            train_tokens_src = src_[num_global_tokens:num_global_tokens+num_train_tokens]
            global_and_train_tokens_src = src_[:num_global_tokens+num_train_tokens]
            eval_tokens_src = src_[num_global_tokens+num_train_tokens:]


            attn = partial(checkpoint, self.self_attn) if self.recompute_attn else self.self_attn

            global_tokens_src2 = attn(global_tokens_src, global_and_train_tokens_src, global_and_train_tokens_src, None, True, global_src_mask)[0]
            train_tokens_src2 = attn(train_tokens_src, global_tokens_src, global_tokens_src, None, True, trainset_src_mask)[0]
            eval_tokens_src2 = attn(eval_tokens_src, src_, src_,
                                    None, True, valset_src_mask)[0]

            src2 = torch.cat([global_tokens_src2, train_tokens_src2, eval_tokens_src2], dim=0)

        elif isinstance(src_mask, int): # NOT RUN - AssertionError 
            assert src_key_padding_mask is None # AssertionError when src_key_padding_mask=None --> so src_key_padding_mask must be not None (but it is None - default None is not changed)
            single_eval_position = src_mask
            src_left = self.self_attn(src_[:single_eval_position], src_[:single_eval_position], src_[:single_eval_position])[0]
            src_right = self.self_attn(src_[single_eval_position:], src_[:single_eval_position], src_[:single_eval_position])[0]
            src2 = torch.cat([src_left, src_right], dim=0)
        else: # this gets RUN 
            if self.recompute_attn: # recompute_attn=False by default, and is not changed in model=TransformerModel() in train.py)
                src2 = checkpoint(self.self_attn, src_, src_, src_, src_key_padding_mask, True, src_mask)[0]
            else: # so we actually do this part
                src2 = self.self_attn(src_, src_, src_, attn_mask=src_mask,
                                      key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        if not self.pre_norm: # this gets RUN: pre_norm=False so not False is True
            src = self.norm1(src)

        if self.pre_norm: # NOT RUN: pre_norm=False
            src_ = self.norm2(src)
        else: # this gets RUN
            src_ = src
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src_))))
        src = src + self.dropout2(src2)

        if not self.pre_norm: # this gets RUN: pre_norm=False so not False is True
            src = self.norm2(src)
        return src


In [43]:
torch.manual_seed(1)
src = torch.rand(10, 32, 512)

encoder_layer = TransformerEncoderLayer(d_model=512, nhead=4)
out_full = encoder_layer(src)

print(src[0,0,0:3])
print(out_full.shape)
print(out_full[0,0,0:3])
# tensor([ 1.2991, -0.8532, -0.0118]) # when I run with full class definition

tensor([0.7576, 0.2793, 0.4031])
torch.Size([10, 32, 512])
tensor([ 1.2991, -0.8532, -0.0118], grad_fn=<SliceBackward>)


#### commented out

In [44]:
%%script echo skipping
from functools import partial

from torch import nn
import torch
from torch.nn.modules.transformer import _get_activation_fn, Module, Tensor, Optional, MultiheadAttention, Linear, Dropout, LayerNorm
from torch.utils.checkpoint import checkpoint

# added by Ugne (before it showed error: F is not defined)
from torch.nn import functional as F

# commented out what's not run
class TransformerEncoderLayer(Module):
    r"""TransformerEncoderLayer is made up of self-attn and feedforward network.
    This standard encoder layer is based on the paper "Attention Is All You Need".
    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
    in a different way during application.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).
        activation: the activation function of intermediate layer, relu or gelu (default=relu).
        layer_norm_eps: the eps value in layer normalization components (default=1e-5).
        batch_first: If ``True``, then the input and output tensors are provided
            as (batch, seq, feature). Default: ``False``.

    Examples::
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
        >>> src = torch.rand(10, 32, 512)
        >>> out = encoder_layer(src)

    Alternatively, when ``batch_first`` is ``True``:
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
        >>> src = torch.rand(32, 10, 512)
        >>> out = encoder_layer(src)
    """
    __constants__ = ['batch_first']

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
                 layer_norm_eps=1e-5, batch_first=False, pre_norm=False,
                 device=None, dtype=None, recompute_attn=False) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
                                            **factory_kwargs)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model, **factory_kwargs)

        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.pre_norm = pre_norm
        self.recompute_attn = recompute_attn

        self.activation = _get_activation_fn(activation)

    # def __setstate__(self, state): # not sure what it does
    #     if 'activation' not in state:
    #         state['activation'] = F.relu
    #     super().__setstate__(state)

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        r"""Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).

        Shape:
            see the docs in Transformer class.
        """
        if self.pre_norm: # NOT RUN: pre_norm=False by default and is not changed in model=TransformerModel() in train.py
            # src_ = self.norm1(src)
            print("not run")
        else: # this gets RUN
            src_ = src
        
        if isinstance(src_mask, tuple): # NOT RUN - AssertionError 
            # global attention setup
            assert not self.self_attn.batch_first # AssertionError when batch_first=True: not True = False  --> so batch_first must be False (and it is - default False is not changed in model=TransformerModel() in train.py)
            assert src_key_padding_mask is None # AssertionError when src_key_padding_mask=None --> so src_key_padding_mask must be not None (but it is None - default None is not changed)
            
            # I think this is not run as we get AssertionError: default src_key_padding_mask=None is not changed
            # so we actually do what's in else (elif also gets AssertionError fot the same reason)
            
            # global_src_mask, trainset_src_mask, valset_src_mask = src_mask

            # num_global_tokens = global_src_mask.shape[0]
            # num_train_tokens = trainset_src_mask.shape[0]

            # global_tokens_src = src_[:num_global_tokens]
            # train_tokens_src = src_[num_global_tokens:num_global_tokens+num_train_tokens]
            # global_and_train_tokens_src = src_[:num_global_tokens+num_train_tokens]
            # eval_tokens_src = src_[num_global_tokens+num_train_tokens:]


            # attn = partial(checkpoint, self.self_attn) if self.recompute_attn else self.self_attn

            # global_tokens_src2 = attn(global_tokens_src, global_and_train_tokens_src, global_and_train_tokens_src, None, True, global_src_mask)[0]
            # train_tokens_src2 = attn(train_tokens_src, global_tokens_src, global_tokens_src, None, True, trainset_src_mask)[0]
            # eval_tokens_src2 = attn(eval_tokens_src, src_, src_,
            #                         None, True, valset_src_mask)[0]

            # src2 = torch.cat([global_tokens_src2, train_tokens_src2, eval_tokens_src2], dim=0)
        elif isinstance(src_mask, int): # NOT RUN - AssertionError 
            assert src_key_padding_mask is None # AssertionError when src_key_padding_mask=None --> so src_key_padding_mask must be not None (but it is None - default None is not changed)
            # single_eval_position = src_mask
            # src_left = self.self_attn(src_[:single_eval_position], src_[:single_eval_position], src_[:single_eval_position])[0]
            # src_right = self.self_attn(src_[single_eval_position:], src_[:single_eval_position], src_[:single_eval_position])[0]
            # src2 = torch.cat([src_left, src_right], dim=0)
        else: # this gets RUN 
            if self.recompute_attn: # recompute_attn=False by default, and is not changed in model=TransformerModel() in train.py)
                # src2 = checkpoint(self.self_attn, src_, src_, src_, src_key_padding_mask, True, src_mask)[0]
                print("not run")
            else: # so we actually do this part
                src2 = self.self_attn(src_, src_, src_, attn_mask=src_mask,
                                      key_padding_mask=src_key_padding_mask)[0]
        
        src = src + self.dropout1(src2)
        
        if not self.pre_norm: # this gets RUN: pre_norm=False so not False is True
            src = self.norm1(src)

        if self.pre_norm: # NOT RUN: pre_norm=False
            src_ = self.norm2(src)
        else: # this gets RUN
            src_ = src
        
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src_))))
        src = src + self.dropout2(src2)

        if not self.pre_norm: # this gets RUN: pre_norm=False so not False is True
            src = self.norm2(src)
        
        return src


skipping


In [45]:
%%script echo skipping

torch.manual_seed(1)
src = torch.rand(10, 32, 512)
encoder_layer = TransformerEncoderLayer(d_model=512, nhead=4)
out = encoder_layer(src)

print(out.shape)
print(out[0,0,0:3])
# tensor([ 1.2991, -0.8532, -0.0118]) # when I run with full class definition

skipping


#### deleted

In [48]:
from functools import partial

from torch import nn
import torch
from torch.nn.modules.transformer import _get_activation_fn, Module, Tensor, Optional, MultiheadAttention, Linear, Dropout, LayerNorm
from torch.utils.checkpoint import checkpoint

# added by Ugne (before it showed error: F is not defined)
from torch.nn import functional as F

# commented out what's not run
class DelTransformerEncoderLayer(Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
                 layer_norm_eps=1e-5, batch_first=False, pre_norm=False,
                 device=None, dtype=None, recompute_attn=False) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
                                            **factory_kwargs)

        self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
        self.dropout_ch = Dropout(dropout) # dropout -> dropout_ch
        self.linear2 = Linear(dim_feedforward, d_model, **factory_kwargs)

        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.pre_norm = pre_norm
        self.recompute_attn = recompute_attn

        self.activation = _get_activation_fn(activation)


    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        
        # multihead attention
        src_ = src
        src2 = self.self_attn(src_, src_, src_, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
        
        # add and normalize
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        # feed forward
        src_ = src
        src2 = self.linear2(self.dropout_ch(self.activation(self.linear1(src_)))) # dropout -> dropout_ch
        
        # add and normalize
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        
        return src


In [49]:
torch.manual_seed(1)
src = torch.rand(10, 32, 512)

In [50]:
encoder_layer_del = DelTransformerEncoderLayer(d_model=512, nhead=4)
out_deleted = encoder_layer_del(src)

print(out_deleted.shape)
print(out_deleted[0,0,0:3]) # tensor([ 1.2991, -0.8532, -0.0118]) # when I run with only what I think it does

torch.Size([10, 32, 512])
tensor([ 1.2991, -0.8532, -0.0118], grad_fn=<SliceBackward>)


In [58]:
#comparison

print(out_full.shape)
print(out_deleted.shape)
print(out_full[0,0,0:3]) # tensor([ 1.2991, -0.8532, -0.0118]) # when I run with full class definition
print(out_deleted[0,0,0:3]) # tensor([ 1.2991, -0.8532, -0.0118]) # when I run with only what I think it does

torch.Size([10, 32, 512])
torch.Size([10, 32, 512])
tensor([ 1.2991, -0.8532, -0.0118], grad_fn=<SliceBackward>)
tensor([ 1.2991, -0.8532, -0.0118], grad_fn=<SliceBackward>)


### my trial 1

In [59]:
# data
torch.manual_seed(1)
src = torch.rand(10, 32, 512) # 10 batches where each has 32 datapoints so vecs of length = 512

print(src[0,0,0:3])

tensor([0.7576, 0.2793, 0.4031])


In [60]:
# setup

# passed into train() in train.py
emsize=512 #yes, same in the paper
nhead=4 #yes, same in the paper
nhid=2*emsize # #yes, same in the paper: 1024

# # def train() in train.py
# emsize=200 # function default 
# nhead=2 # function default
# nhid=200 # function default

# model = TransformerModel() in train.py
emsize = emsize
nhead = nhead
nhid = nhid

# class TransformerModel() in transformer.py
ninp = emsize # ninp - number of inputs
nhead = nhead
nhid = nhid

# class TransformerEncoderLayer() in layer.py
# d_model = ninp
# nhead = nhead
# dim_feedforward = nhid
d_model = 512
nhead = 4
dim_feedforward=2048
dropout=0.1
activation="relu"
layer_norm_eps=1e-5
batch_first=False
pre_norm=False
device=None
dtype=None
recompute_attn=False
factory_kwargs = {'device': device, 'dtype': dtype}
# def forward() in class TransformerEncoderLayer() in layer.py
src_mask = None
src_key_padding_mask = None

In [61]:
# Implementation of Feedforward model
linear1 = nn.Linear(d_model, dim_feedforward, **factory_kwargs)
dropout_changed = nn.Dropout(dropout) # changed variable name because of TypeError: '<' not supported between instances of 'Dropout' and 'int'
linear2 = nn.Linear(dim_feedforward, d_model, **factory_kwargs)

norm1 =nn. LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
dropout1 = nn.Dropout(dropout)
dropout2 = nn.Dropout(dropout) 
pre_norm = pre_norm
recompute_attn = recompute_attn

activation = _get_activation_fn(activation)

In [68]:
self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, **factory_kwargs)
torch.manual_seed(1)
src_ = src
        
src2 = self_attn(src_, src_, src_, attn_mask=src_mask,
                                key_padding_mask=src_key_padding_mask)[0]

src = src + dropout1(src2)


src = norm1(src)


src_ = src

src2 = linear2(dropout_changed(activation(linear1(src_)))) # changed variable name because of TypeError: '<' not supported between instances of 'Dropout' and 'int'
src = src + dropout2(src2)

src = norm2(src)

In [69]:
print(src.shape)
print(src[0,0,0:3])
# tensor([-0.4884, -0.7499,  0.4478])

torch.Size([10, 32, 512])
tensor([ 1.3112, -0.7075,  0.6591], grad_fn=<SliceBackward>)


### my trial 2

In [74]:
d_model = 512
nhead = 4
dim_feedforward=2048
dropout=0.1
activation="relu"
layer_norm_eps=1e-5
batch_first=False
pre_norm=False
device=None
dtype=None
recompute_attn=False
        
factory_kwargs = {'device': device, 'dtype': dtype}

self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, **factory_kwargs)

# Implementation of Feedforward model
linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
dropout_cha = Dropout(dropout)
linear2 = Linear(dim_feedforward, d_model, **factory_kwargs)

norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
dropout1 = Dropout(dropout)
dropout2 = Dropout(dropout)
pre_norm = pre_norm
recompute_attn = recompute_attn

activation = _get_activation_fn(activation)


torch.manual_seed(1)
src = torch.rand(10, 32, 512)

src_mask = None
src_key_padding_mask = None

src_ = src

src2 = self_attn(src_, src_, src_, attn_mask=src_mask,
                                key_padding_mask=src_key_padding_mask)[0]

src = src + dropout1(src2)

src = norm1(src)

src_ = src

src2 = linear2(dropout_cha(activation(linear1(src_))))
src = src + dropout2(src2)

src = norm2(src)

src_out_trial3 = src

In [76]:
print(src_out_trial3.shape)
print(src_out_trial3[0,0,0:3]) # tensor([ 0.7857, -1.1753,  0.0188])

torch.Size([10, 32, 512])
tensor([ 0.7857, -1.1753,  0.0188], grad_fn=<SliceBackward>)


# Globally

There's a function `train()` in train.py

One of its arguments is encoder_generator - a class (Linear/MLP/Positional from encoders.py)

Then in this function train() we create an object of class encoder_generator: `encoder` = encoder_generator(dl.num_features, emsize)

pvz.: 

* encoder_generator = encoders.Linear
* encoder = encoders.Linear(num_features=[default 512], emzise=[depends on the prior])

With this object encoder we can encode a given dataset x, pvz give 100 datapoints with 20 features in each of them (then emsize=20) and if we do x_encoded=encoder.forward(x) then we get an encoded dataset x_encoded with 100 datapoints where each datapoint now has 512 elements in it

Then this function train() sets model = TransformerModel() - an object of class `TransformerModel()` from transformer.py



There's a class `TransformerModel()` in transformer.py

Within this class there's a function `forward()` which uses arguments `encoder` and `y_encoder` which are the arguments for this class

Then funciton forward() basically performs encoding of source datapoints (x,y):

* x_src = self.encoder(x_src)
* y_src = self.y_encoder(y_src.unsqueeze(-1) if len(y_src.shape) < len(x_src.shape) else y_src)