# Transformer


In [4]:
## Includes all libraries
import numpy as np
import scipy as sp
import pandas as pd
from utilities import * 
import matplotlib.pyplot as plt
import torch as th
import Layer as nn

-> Linear Layer

Dropout Layer

Activation Layers

In [7]:
class RElu:
    """Rectified linear unit: element-wise ``max(x, 0)``.

    Args:
        inplace: When True the input tensor itself is overwritten with the
            result and returned; when False (default) a new tensor is
            returned and the input is left untouched.
    """

    def __init__(self,inplace:bool = False):
        self.inplace=inplace

    def __call__(self, input: th.Tensor):
        """Return ``input`` with every negative entry replaced by zero.

        In-place mode follows PyTorch's usual restrictions on in-place
        operations (e.g. leaf tensors that require grad cannot be modified).
        """
        # clamp is used instead of multiplying by a 0/1 mask because the
        # mask form turns -inf into NaN (-inf * 0).
        if self.inplace:
            return input.clamp_(min=0)
        return th.clamp(input, min=0)

class Sigmoid:
    """Logistic sigmoid: element-wise ``1 / (1 + exp(-x))``.

    Args:
        inplace: When True the input tensor itself is overwritten with the
            result and returned; when False (default) a new tensor is
            returned and the input is left untouched.
    """

    def __init__(self,inplace:bool = False):
        self.inplace=inplace

    def __call__(self, input: th.Tensor):
        """Return the sigmoid of every element of ``input``.

        In-place mode requires a floating-point tensor, because the result
        is written back into the input's own storage.
        """
        if self.inplace:
            # sigmoid_ mutates the caller's tensor, which is what the
            # ``inplace`` flag promises; rebinding a local name did not.
            return input.sigmoid_()
        return th.sigmoid(input)

class LRElu:
    """Leaky ReLU: ``x`` where ``x > 0``, ``alpha * x`` elsewhere.

    Args:
        inplace: When True the input tensor itself is overwritten with the
            result and returned; when False (default) a new tensor is
            returned and the input is left untouched.
    """

    def __init__(self,inplace:bool = False):
        self.inplace=inplace

    def __call__(self, input: th.Tensor,alpha:float = 0.01):
        """Apply leaky ReLU with negative-side slope ``alpha``.

        Args:
            input: Tensor to transform.
            alpha: Slope used for non-positive entries (default 0.01).

        Returns:
            The transformed tensor; in in-place mode this is ``input``.
        """
        # An explicit sign test is used instead of max(x, alpha * x), which
        # only matches the definition when 0 <= alpha <= 1.
        result = th.where(input > 0, input, input * alpha)
        if self.inplace:
            # Write back into the caller's storage and return it; the
            # previous in-place branch returned None.
            input.copy_(result)
            return input
        return result

class Elu:
    """Exponential linear unit: ``x`` where ``x > 0``, else ``alpha * (exp(x) - 1)``.

    Args:
        inplace: When True the input tensor itself is overwritten with the
            result and returned; when False (default) a new tensor is
            returned and the input is left untouched.
    """

    def __init__(self,inplace:bool = False):
        self.inplace=inplace

    def __call__(self, input: th.Tensor,alpha:float = 1.0):
        """Apply ELU with saturation scale ``alpha``.

        Args:
            input: Tensor to transform.
            alpha: Multiplier of the exponential branch (default 1.0).

        Returns:
            The transformed tensor; in in-place mode this is ``input``.
        """
        # max(x, alpha * (exp(x) - 1)) is wrong for positive x because
        # exp(x) - 1 >= x, so the branch is selected explicitly.
        # Clamping before expm1 keeps the unused positive side from
        # overflowing to inf.
        negative_branch = th.expm1(th.clamp(input, max=0)) * alpha
        result = th.where(input > 0, input, negative_branch)
        if self.inplace:
            # Write back into the caller's storage and return it; the
            # previous in-place branch returned None.
            input.copy_(result)
            return input
        return result

class HardSigmoid:
    """Piecewise-linear sigmoid approximation.

    Evaluates to 0 for ``x <= -3``, 1 for ``x >= 3`` and ``x / 6 + 0.5``
    in between.

    Args:
        inplace: When True the input tensor itself is overwritten with the
            result and returned; when False (default) a new tensor is
            returned and the input is left untouched.
    """

    def __init__(self,inplace:bool = False):
        self.inplace=inplace

    def __call__(self, input: th.Tensor):
        """Return the hard sigmoid of every element of ``input``.

        In-place mode requires a floating-point tensor, because the result
        is written back into the input's own storage.
        """
        # Clamping the linear segment to [0, 1] yields all three pieces.
        # The earlier mask for the upper piece (x >= 3 AND x <= -3) could
        # never be true, so inputs above 3 came out as 0.
        if self.inplace:
            return input.div_(6).add_(0.5).clamp_(0, 1)
        return th.clamp(input / 6 + 0.5, 0, 1)

Various Types of Attention Mechanisms

In [11]:
class ScaledDotProductAttention:
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d)) V``.

    Args:
        drop: Value forwarded to ``nn.Dropout``; the resulting layer is
            applied to the attention weights.
    """

    def __init__(self,drop):
        self.dropout=nn.Dropout(drop)

    def __call__(self,q:th.Tensor,k:th.Tensor,v:th.Tensor,valid_lens=None):
        """Attend from queries ``q`` over key/value pairs ``k``/``v``.

        Args:
            q: 3-D tensor of queries (``th.bmm`` requires 3-D operands);
                its last dimension ``d`` sets the scaling factor.
            k: 3-D tensor of keys whose last dimension matches ``q``'s.
            v: 3-D tensor of values, one row per key.
            valid_lens: Passed unchanged to ``masked_softmax``.
                NOTE(review): presumably per-sequence valid lengths used to
                mask padding; confirm against the helper's definition.

        Returns:
            The batched product of the (dropped-out) attention weights with
            ``v``. The weights are also stored on ``self.attention_weights``.
        """
        d = q.shape[-1]
        # ``d`` is a plain Python int, so th.sqrt (tensor-only) would raise
        # a TypeError; take the square root with ordinary arithmetic.
        scores = th.bmm(q, k.transpose(1, 2)) / (d ** 0.5)
        self.attention_weights = masked_softmax(scores, valid_lens)
        return th.bmm(self.dropout(self.attention_weights), v)

class AdditiveAttention:
    """Additive attention: scores from ``w_v^T tanh(W_q q + W_k k)``.

    Args:
        num_hiddens: Output width of the query and key projections.
        drop: Value forwarded to ``nn.Dropout``; the resulting layer is
            applied to the attention weights.
    """

    def __init__(self,num_hiddens,drop):
        self.linearq=nn.LazyLinear(num_hiddens,bias=False)
        self.lineark=nn.LazyLinear(num_hiddens,bias=False)
        self.linearv=nn.LazyLinear(1,bias=False)
        self.dropout=nn.Dropout(drop)

    def __call__(self,q:th.Tensor,k:th.Tensor,v:th.Tensor,valid_lens:th.Tensor=None):
        """Attend from queries ``q`` over key/value pairs ``k``/``v``.

        Args:
            q: Query tensor; projected by ``self.linearq``.
            k: Key tensor; projected by ``self.lineark``.
            v: 3-D value tensor (``th.bmm`` requires 3-D operands).
            valid_lens: Passed unchanged to ``masked_softmax``.
                NOTE(review): presumably per-sequence valid lengths used to
                mask padding; confirm against the helper's definition.

        Returns:
            The batched product of the (dropped-out) attention weights with
            ``v``. The weights are also stored on ``self.attention_weights``.
        """
        # Use the layers created in __init__; the names linear1/linear2/
        # linear3 used here before were never defined on the instance.
        queries, keys = self.linearq(q), self.lineark(k)
        # Insert singleton axes so that every query is summed with every key
        # through broadcasting.
        features = th.tanh(queries.unsqueeze(2) + keys.unsqueeze(1))
        # linearv maps the hidden axis to width 1; drop that trailing axis
        # to get one score per (query, key) pair.
        scores = self.linearv(features).squeeze(-1)
        self.attention_weights = masked_softmax(scores, valid_lens)
        return th.bmm(self.dropout(self.attention_weights), v)

Multi-Headed Attention

In [12]:
class MultiHeadAttention:
    """Multi-head attention built on ``ScaledDotProductAttention``.

    Queries, keys and values are linearly projected, split into
    ``num_heads`` heads that are folded into the batch axis, attended in
    parallel, then concatenated and projected once more.

    Args:
        num_heads: Number of attention heads; the projected width must be
            divisible by it for the head split to be even.
        num_hiddens: Output width of each of the four linear projections.
        drop: Dropout setting forwarded to ``ScaledDotProductAttention``.
        bias: Forwarded as the ``bias`` argument of every projection layer.
        **kwargs: Accepted for call-site compatibility; not used.
    """

    def __init__(self,num_heads,num_hiddens,drop,bias=False,**kwargs):
        self.num_heads=num_heads
        self.attention=ScaledDotProductAttention(drop)
        # ``bias`` is passed by keyword, matching how AdditiveAttention
        # constructs its layers, so it cannot land in a different
        # positional slot.
        self.linearq=nn.LazyLinear(num_hiddens,bias=bias)
        self.lineark=nn.LazyLinear(num_hiddens,bias=bias)
        self.linearv=nn.LazyLinear(num_hiddens,bias=bias)
        self.linearo=nn.LazyLinear(num_hiddens,bias=bias)

    def transpose_qkv(self,X):
        """Split the last axis into heads and fold the heads into the batch.

        Maps ``(batch, seq, hidden)`` to
        ``(batch * num_heads, seq, hidden / num_heads)``.
        """
        X = X.reshape(X.shape[0],X.shape[1],self.num_heads,-1)
        X = X.permute(0,2,1,3)
        # After the permute the axes are (batch, heads, seq, head_dim); the
        # trailing size must be head_dim (axis 3), not the head count.
        return X.reshape(-1,X.shape[2],X.shape[3])

    def transpose_output(self,X):
        """Undo ``transpose_qkv``.

        Maps ``(batch * num_heads, seq, head_dim)`` back to
        ``(batch, seq, num_heads * head_dim)``.
        """
        X = X.reshape(-1,self.num_heads,X.shape[1],X.shape[2])
        X = X.permute(0,2,1,3)
        return X.reshape(X.shape[0],X.shape[1],-1)

    def __call__(self,q:th.Tensor,k:th.Tensor,v:th.Tensor,valid_lens=None):
        """Run multi-head attention.

        Args:
            q: Query tensor, ``(batch, n_queries, features)``.
            k: Key tensor, ``(batch, n_keys, features)``.
            v: Value tensor, ``(batch, n_keys, features)``.
            valid_lens: Optional tensor forwarded to the inner attention
                after being repeated once per head along axis 0.

        Returns:
            The output projection of the concatenated per-head results.
        """
        queries=self.transpose_qkv(self.linearq(q))
        key=self.transpose_qkv(self.lineark(k))
        values=self.transpose_qkv(self.linearv(v))

        if valid_lens is not None:
            # Every head of a sample shares that sample's valid length.
            valid_lens = th.repeat_interleave(
                valid_lens, repeats=self.num_heads, dim=0)
        # Despite its name, this attribute holds the attention *output*;
        # the weights themselves live on ``self.attention.attention_weights``.
        # The attribute name is kept for existing readers.
        self.attentionweights=self.attention(queries,key,values,valid_lens)
        output_concat=self.transpose_output(self.attentionweights)
        return self.linearo(output_concat)



Layer Normalization

In [None]:
class LayerNorm:
    """Layer-normalization module; only its constructor is visible here.

    The constructor records the configuration and creates the initial
    scale (``gamma``) and shift (``beta``) tensors.
    """
    def __init__(self,normalized_shape, eps=1e-05, elementwise_affine: bool=True, bias: bool=True):
        """Store the configuration and create the scale/shift tensors.

        Args:
            normalized_shape: Shape of ``gamma`` and ``beta``. It must be
                iterable, because it is passed to ``tuple()``.
            eps: Stored on the instance. Presumably the numerical-stability
                term added to the variance; confirm where it is used.
            elementwise_affine: Stored on the instance; not consulted in
                this constructor.
            bias: NOTE(review): accepted but not used in the visible code.
        """
        # tuple() needs an iterable, so a bare int raises TypeError here.
        self.shape=tuple(normalized_shape)
        self.eps=eps
        self.elementwise_affine=elementwise_affine
        # Scale starts at one and shift at zero. Both are plain tensors
        # created unconditionally, regardless of ``elementwise_affine``.
        self.gamma=th.ones(normalized_shape)
        self.beta=th.zeros(normalized_shape)
        