In [1]:
import os
import math
import datetime
import time

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
from torch.utils import data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from torchvision import transforms as T
from torchvision.datasets import ImageFolder
from torch.autograd import Variable
from torchvision.utils import save_image
from torch.backends import cudnn


In [19]:
class ScaledDPAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(dim_key)) V``.

    Args:
        dim_key: Size of the last axis of the queries and keys.
        dim_val: Size of the last axis of the values.
        masked: When True, a causal mask is applied so that query position
            ``i`` cannot attend to key positions ``j > i``.
    """

    def __init__(self, dim_key, dim_val, masked=False):
        super().__init__()
        self.dim_key = dim_key
        self.dim_val = dim_val
        self.masked = masked
        # The dot products are summed over the key dimension, so that is the
        # dimension that determines the scaling factor (not dim_val).
        self.scale = math.sqrt(self.dim_key)

    def forward(self, q, k, v):
        """Attend from ``q`` over ``k`` and return the weighted sum of ``v``.

        Args:
            q: Tensor of shape ``(..., len_q, dim_key)``.
            k: Tensor of shape ``(..., len_k, dim_key)``.
            v: Tensor of shape ``(..., len_k, dim_val)``.

        Returns:
            Tensor of shape ``(..., len_q, dim_val)``.
        """
        # (..., len_q, dim_key) @ (..., dim_key, len_k) -> (..., len_q, len_k)
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale
        if self.masked:
            len_q, len_k = scores.shape[-2:]
            # True strictly above the diagonal, i.e. at "future" key positions.
            # Built from the actual sequence lengths and on the scores' device.
            future = torch.ones(len_q, len_k, device=scores.device).triu(1).bool()
            scores = scores.masked_fill(future, float('-inf'))
        # Normalise over the key axis so each query's weights sum to one.
        weights = torch.softmax(scores, dim=-1)
        return torch.matmul(weights, v)

class MultiHeadAttention(nn.Module):
    """Multi-head attention built from ``h`` independent projection/attention heads.

    Each head projects the inputs with its own bias-free linear layers, runs
    ``ScaledDPAttention`` on the projections, and the concatenated head outputs
    are mapped back to ``dim_model`` by a final bias-free linear layer.

    Args:
        dim_model: Size of the last axis of ``Q``, ``K`` and ``V`` and of the output.
        dim_key: Per-head size of the projected queries and keys.
        dim_val: Per-head size of the projected values.
        h: Number of attention heads.
        masked: Forwarded to every ``ScaledDPAttention`` head. Defaults to
            False so existing four-argument calls keep working.
    """

    def __init__(self, dim_model, dim_key, dim_val, h, masked=False):
        super().__init__()
        self.h = h
        self.dim_model = dim_model
        self.dim_key = dim_key
        self.dim_val = dim_val
        self.masked = masked

        # nn.ModuleList (rather than plain Python lists) registers the per-head
        # layers as sub-modules so their weights appear in .parameters(),
        # follow .to(device) and are saved in the state dict.
        self.q_layers = nn.ModuleList(
            [nn.Linear(in_features=dim_model, out_features=dim_key, bias=False) for _ in range(h)]
        )
        self.k_layers = nn.ModuleList(
            [nn.Linear(in_features=dim_model, out_features=dim_key, bias=False) for _ in range(h)]
        )
        self.v_layers = nn.ModuleList(
            [nn.Linear(in_features=dim_model, out_features=dim_val, bias=False) for _ in range(h)]
        )
        self.attention_layers = nn.ModuleList(
            [ScaledDPAttention(dim_key, dim_val, masked) for _ in range(h)]
        )
        self.linear = nn.Linear(in_features=h * dim_val, out_features=dim_model, bias=False)

    def forward(self, Q, K, V):
        """Run every head on ``(Q, K, V)`` and merge the results.

        Args:
            Q: Query tensor whose last axis has size ``dim_model``.
            K: Key tensor whose last axis has size ``dim_model``.
            V: Value tensor whose last axis has size ``dim_model``.

        Returns:
            Tensor with the same leading axes as ``Q`` and a last axis of size
            ``dim_model``.
        """
        heads = zip(self.q_layers, self.k_layers, self.v_layers, self.attention_layers)
        outs = [
            attend(project_q(Q), project_k(K), project_v(V))
            for project_q, project_k, project_v, attend in heads
        ]
        # Concatenate along the feature axis: h tensors of (..., dim_val)
        # become one tensor of (..., h * dim_val).
        output = torch.cat(outs, dim=-1)
        return self.linear(output)

In [None]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding.

    Feature ``i`` of position ``pos`` is ``sin(pos / 10000**(2*(i//2)/n_dim))``
    for even ``i`` and ``cos`` of the same angle for odd ``i``.

    Args:
        n_dim: Number of features per position.
    """

    def __init__(self, n_dim):
        # nn.Module must be initialised before the instance can be called.
        super().__init__()
        self.n_dim = n_dim

    def get_pos(self, pos):
        """Return the ``n_dim`` un-squashed angles for a single position ``pos``."""
        return [pos / (10000 ** (2 * (i // 2) / self.n_dim)) for i in range(self.n_dim)]

    def forward(self, batch_size, embedding_length):
        """Build the encoding table for a batch.

        Args:
            batch_size: Number of identical copies of the table to stack.
            embedding_length: Number of positions to encode.

        Returns:
            ``numpy.ndarray`` of dtype float64 and shape
            ``(batch_size, embedding_length, n_dim)``. Convert it with
            ``torch.from_numpy(...)`` (and cast to the model dtype) before
            adding it to a tensor.
        """
        positions = np.arange(embedding_length, dtype=np.float64)[:, np.newaxis]
        feature_idx = np.arange(self.n_dim)[np.newaxis, :]
        # Broadcasting gives the full (embedding_length, n_dim) angle table;
        # this also handles embedding_length == 0 without special-casing.
        code = positions / np.power(10000.0, 2 * (feature_idx // 2) / self.n_dim)

        position_encoding = np.zeros((embedding_length, self.n_dim))
        position_encoding[:, 0::2] = np.sin(code[:, 0::2])
        position_encoding[:, 1::2] = np.cos(code[:, 1::2])
        # Repeat along a new leading batch axis instead of stacking rows, so
        # the result lines up with (batch, length, n_dim) inputs.
        return np.tile(position_encoding, (batch_size, 1, 1))

In [None]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention and a feed-forward network, each
    followed by a residual connection and layer normalisation.

    Args:
        dim_model: Size of the last axis of the input and output.
        dim_key: Per-head key/query size passed to ``MultiHeadAttention``.
        dim_val: Per-head value size passed to ``MultiHeadAttention``.
        dim_hidden: Width of the hidden layer of the feed-forward network.
        h: Number of attention heads.
    """

    def __init__(self, dim_model, dim_key, dim_val, dim_hidden, h=8):
        super().__init__()
        self.attention = MultiHeadAttention(dim_model, dim_key, dim_val, h)
        self.FFN = nn.Sequential(
            nn.Linear(in_features=dim_model, out_features=dim_hidden),
            nn.ReLU(),
            nn.Linear(in_features=dim_hidden, out_features=dim_model),
        )
        # Each sublayer gets its own LayerNorm so the two normalisations do
        # not share learnable scale/shift parameters.
        self.norm_attention = nn.LayerNorm(dim_model)
        self.norm_ffn = nn.LayerNorm(dim_model)

    def forward(self, X):
        """Apply the layer to ``X``.

        Args:
            X: Tensor whose last axis has size ``dim_model``.

        Returns:
            Tensor with the same shape as ``X``.
        """
        attended = self.attention(Q=X, K=X, V=X)
        attended = self.norm_attention(attended + X)
        # Named `transformed` rather than `F` to avoid shadowing the
        # torch.nn.functional alias imported at the top of the notebook.
        transformed = self.FFN(attended)
        return self.norm_ffn(transformed + attended)
    


In [None]:
class DecoderLayer(nn.Module):
    """One decoder layer: masked self-attention, attention over the encoder
    features, and a feed-forward network, each followed by a residual
    connection and layer normalisation.

    Input and output are both ``(tensor, encoder_features)`` tuples so that
    several layers can be chained inside ``nn.Sequential``.

    Args:
        dim_model: Size of the last axis of the input and output.
        dim_key: Per-head key/query size passed to ``MultiHeadAttention``.
        dim_val: Per-head value size passed to ``MultiHeadAttention``.
        dim_hidden: Width of the hidden layer of the feed-forward network.
        h: Number of attention heads.
    """

    def __init__(self, dim_model, dim_key, dim_val, dim_hidden, h=8):
        super().__init__()
        self.masked_attention = MultiHeadAttention(dim_model, dim_key, dim_val, h, masked=True)
        self.attention = MultiHeadAttention(dim_model, dim_key, dim_val, h, masked=False)
        self.FFN = nn.Sequential(
            nn.Linear(in_features=dim_model, out_features=dim_hidden),
            nn.ReLU(),
            nn.Linear(in_features=dim_hidden, out_features=dim_model),
        )
        # One LayerNorm per sublayer so the three normalisations do not share
        # learnable scale/shift parameters.
        self.norm_masked_attention = nn.LayerNorm(dim_model)
        self.norm_attention = nn.LayerNorm(dim_model)
        self.norm_ffn = nn.LayerNorm(dim_model)

    def forward(self, inputs):  # (X, encoder_feature)
        """Apply the layer.

        Args:
            inputs: Tuple ``(X, features)`` where ``X`` is the decoder-side
                tensor and ``features`` is the encoder output; both have a
                last axis of size ``dim_model``.

        Returns:
            Tuple ``(out, features)``: ``out`` has the same shape as ``X`` and
            ``features`` is passed through unchanged for the next layer.
        """
        X, features = inputs
        masked_A = self.masked_attention(Q=X, K=X, V=X)
        masked_A = self.norm_masked_attention(masked_A + X)

        # Queries come from the decoder; keys and values from the encoder.
        A = self.attention(Q=masked_A, K=features, V=features)
        A = self.norm_attention(A + masked_A)

        # Named `out` rather than `F` to avoid shadowing the
        # torch.nn.functional alias imported at the top of the notebook.
        out = self.FFN(A)
        out = self.norm_ffn(out + A)
        return (out, features)

In [3]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer made of ``N`` encoder and ``N`` decoder layers.

    Args:
        dim_model: Size of the last axis of the encoder and decoder inputs.
        dim_key: Per-head key/query size.
        dim_val: Per-head value size.
        dim_hidden: Hidden width of each layer's feed-forward network.
        N: Number of encoder layers and of decoder layers.
        h: Number of attention heads per attention block.
        dim_out: Size of the last axis of the output distribution (for example
            a vocabulary size). Defaults to ``dim_model`` when omitted.
    """

    def __init__(self, dim_model, dim_key, dim_val, dim_hidden, N=6, h=8, dim_out=None):
        super().__init__()
        if dim_out is None:
            dim_out = dim_model

        encoder_layers = [
            EncoderLayer(dim_model, dim_key, dim_val, dim_hidden, h) for _ in range(N)
        ]
        decoder_layers = [
            DecoderLayer(dim_model, dim_key, dim_val, dim_hidden, h) for _ in range(N)
        ]

        self.Encoder = nn.Sequential(*encoder_layers)
        self.Decoder = nn.Sequential(*decoder_layers)
        # Projects the decoder output onto the output classes.
        self.FC = nn.Linear(in_features=dim_model, out_features=dim_out)
        # Normalise over the class axis (last), not over the sequence axis.
        self.sm = nn.Softmax(dim=-1)

    def forward(self, inputs, outputs):
        """Encode ``inputs``, decode ``outputs`` against them, and return
        per-position probabilities.

        Args:
            inputs: Encoder-side tensor whose last axis has size ``dim_model``.
            outputs: Decoder-side tensor whose last axis has size ``dim_model``.

        Returns:
            Tensor with the leading axes of ``outputs`` and a last axis of size
            ``dim_out`` whose entries sum to one along that axis.
        """
        features = self.Encoder(inputs)
        # The decoder stack passes (tensor, features) tuples between layers;
        # only the tensor goes on to the output projection.
        decoded, _ = self.Decoder((outputs, features))
        return self.sm(self.FC(decoded))

NameError: name 'nn' is not defined