In [1]:
import os
train_path = '../../raw_train_artifact'
test_path = '../../raw_test_artifact'
embedding_path = '../../embedding_artifact'
input_path = '../../input_artifact'
input_split_path = '../../input_artifact/input_split'
model_path = '../../model_artifact'
output_path = '../../output_artifact'

In [2]:
import sys
import gc
gc.enable()
import time
import re

import numpy as np
import pandas as pd
pd.set_option('display.max_columns',120)
pd.set_option('display.max_rows',2000)
pd.set_option('precision',5)
pd.set_option('float_format', '{:.5f}'.format)

import tqdm
import joblib
import json

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score
from gensim.models import Word2Vec
import torch
from torch import nn
import torch.nn.functional as F

In [3]:
import logging

log_path = '[1.1]Hugging Face Transformer Experiment.log'
    
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s %(levelname)-s: %(message)s', datefmt='%H:%M:%S')

fh = logging.FileHandler(log_path)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)

sh = logging.StreamHandler(sys.stdout)
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
logger.addHandler(sh)

logger.info(f'Restart notebook\n==========================\n{time.ctime()}\n==========================')

17:13:22 INFO: Restart notebook
Tue Jun  9 17:13:22 2020


In [4]:
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# logger.info('Device in Use: {}'.format(DEVICE))
# torch.cuda.empty_cache()
# t = torch.cuda.get_device_properties(DEVICE).total_memory/1024**3
# c = torch.cuda.memory_cached(DEVICE)/1024**3
# a = torch.cuda.memory_allocated(DEVICE)/1024**3
# logger.info('CUDA Memory: Total {:.2f} GB, Cached {:.2f} GB, Allocated {:.2f} GB'.format(t,c,a))

In [5]:
def get_torch_module_num_of_parameter(model):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    return params

## LSTM EXP

In [7]:
lstm_1 = nn.LSTM(input_size=2, hidden_size=2, bidirectional=True, batch_first=True)
lstm_2 = nn.LSTM(input_size=4, hidden_size=4)

In [8]:
a = [torch.tensor([[1,1],[2,2],[3,3]]).float(), torch.tensor([[1,1],[2,2]]).float()]
b = torch.nn.utils.rnn.pad_sequence(a, batch_first=True)
inp = torch.nn.utils.rnn.pack_padded_sequence(b, batch_first=True, lengths=[3,2], enforce_sorted=False)

In [9]:
out, (h, c) = lstm_1(inp)

#nx, _ = lstm_2(out, (h,c))

In [21]:
h.view(1, 2, 2, 2)[:,1,:,:].shape

torch.Size([1, 2, 2])

In [17]:
h

tensor([[[-0.7098,  0.6678],
         [-0.4610,  0.5054]],

        [[ 0.2380,  0.2383],
         [ 0.2325,  0.2292]]], grad_fn=<IndexSelectBackward>)

In [14]:
h.permute(1,0,2).reshape(2,4)

tensor([[-0.7098,  0.6678,  0.2380,  0.2383],
        [-0.4610,  0.5054,  0.2325,  0.2292]], grad_fn=<UnsafeViewBackward>)

In [15]:
c.permute(1,0,2).reshape(2,4)

tensor([[-1.0232,  0.8452,  0.7446,  0.5546],
        [-0.5838,  0.6243,  0.7042,  0.5381]], grad_fn=<UnsafeViewBackward>)

In [12]:
nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0]

tensor([[[-0.1469,  0.2804,  0.2380,  0.2383],
         [-0.4610,  0.5054,  0.2307,  0.2592],
         [-0.7098,  0.6678,  0.2113,  0.2379]],

        [[-0.1469,  0.2804,  0.2325,  0.2292],
         [-0.4610,  0.5054,  0.2089,  0.2109],
         [ 0.0000,  0.0000,  0.0000,  0.0000]]], grad_fn=<IndexSelectBackward>)

In [32]:
nn.utils.rnn.pad_packed_sequence(nx, batch_first=True)[0]

tensor([[[-0.0913, -0.1721],
         [ 0.1276, -0.2379],
         [ 0.1945, -0.2649]],

        [[-0.0187, -0.1936],
         [ 0.1482, -0.2489],
         [ 0.0000,  0.0000]],

        [[ 0.0867, -0.2293],
         [ 0.1775, -0.2689],
         [ 0.1964, -0.3072]]], grad_fn=<IndexSelectBackward>)

## Multi-Seq GNMT

In [14]:
import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

class Res_LSTM_Layer(nn.Module):
	"""
	Multi-layer unidirectional LSTM with residual connection.
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(Res_LSTM_Layer, self).__init__(**kwargs)
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		for index in range(n_layer):
			setattr(self, 'lstm_{}'.format(index), nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True, bias=True))
			setattr(self, 'dropout_{}'.format(index), nn.Dropout(p=dropout))

	def forward(self, inp, inp_len):
		_, total_length, _ = inp.shape
		for index in range(self.n_layer):
			out = nn.utils.rnn.pack_padded_sequence(inp, batch_first=True, lengths=inp_len, enforce_sorted=False)
			out, _ = getattr(self, 'lstm_{}'.format(index))(out)
			out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0]
			inp = getattr(self, 'dropout_{}'.format(index))(torch.add(out, inp))
		return inp

class GNMT_Encoder_Layer(nn.Module):
	"""
	Google Neural Machine Translation - Encoder
	"""
	def __init__(self, input_size, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(GNMT_Encoder_Layer, self).__init__(**kwargs)
		assert n_layer >= 3

		self.input_size = input_size
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		self.l1_bilstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True, bias=True, bidirectional=True)
		self.l1_dropout = nn.Dropout(p=dropout)
		self.l2_lstm = nn.LSTM(input_size=hidden_size*2, hidden_size=hidden_size, bias=True)
		self.l2_dropout = nn.Dropout(p=dropout)
		self.res_lstm = Res_LSTM_Layer(n_layer-2, hidden_size, dropout=dropout)

	def forward(self, inp, inp_len):
		batch_size, total_length, _ = inp.shape
		inp = nn.utils.rnn.pack_padded_sequence(inp, batch_first=True, lengths=inp_len, enforce_sorted=False)
		out, (h, c) = self.l1_bilstm(inp)
		backward_hidden_state = h.view(1, 2, batch_size, self.hidden_size)[:,1,:,:].squeeze(0)              # (num_direction, batch_size, enc_hidden_size)
		backward_cell_state = c.view(1, 2, batch_size, self.hidden_size)[:,1,:,:].squeeze(0)                # (num_direction, batch_size, enc_hidden_size)
		out = self.l1_dropout(nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0])
		out = nn.utils.rnn.pack_padded_sequence(out, batch_first=True, lengths=inp_len, enforce_sorted=False)
		out, _ = self.l2_lstm(out)
		out = self.l2_dropout(nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0])
		out = self.res_lstm(out, inp_len)
		return out, backward_hidden_state, backward_cell_state

class Additive_Attention_Layer(nn.Module):
	"""
	Additive attention used in GNMT
	"""
	def __init__(self, hidden_size, **kwargs):
		super(Additive_Attention_Layer, self).__init__(**kwargs)
		self.hidden_size = hidden_size

		self.W = nn.Linear(hidden_size*2, hidden_size)
		self.tanh = nn.Tanh()
		self.V = nn.Parameter(torch.Tensor(1, hidden_size))
		self.softmax = nn.Softmax(dim=2)

		nn.init.normal_(self.V, 0, 0.1)

	def forward(self, query, values, mask):
		"""
		: query:  (batch_size, hidden_size)
		: values: (batch_size, seq_len, hidden_size)
		: mask:   (batch_size, seq_len)
		"""
		batch_size, seq_len, hidden_size = values.shape

		query = query.unsqueeze(1).expand(-1, seq_len, -1)
		score = self.tanh(self.W(torch.cat((query, values), dim=2)))                              # (batch_size, seq_len, hidden_size)
		score = torch.bmm(self.V.squeeze(1).expand(batch_size, -1, -1), score.permute(0,2,1))     # (batch_size, 1, seq_len)
		score = self.softmax(torch.add(score, mask.unsqueeze(1)))                                 # (batch_size, 1, seq_len)
		context = torch.bmm(score, values).squeeze(1)                                             # (batch_size, hidden_size)

		return context

class Res_Attn_LSTM_Layer(nn.Module):
	"""
	Multi-layer unidirectional LSTM with residual connection and attention.
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(Res_Attn_LSTM_Layer, self).__init__(**kwargs)
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		for index in range(n_layer):
			setattr(self, 'lstm_{}'.format(index), nn.LSTM(input_size=2*hidden_size, hidden_size=hidden_size, batch_first=True, bias=True))
			setattr(self, 'dropout_{}'.format(index), nn.Dropout(p=dropout))

	def forward(self, hidden_states, context_vectors, inp_len):
		_, total_length, _ = hidden_states.shape
		for index in range(self.n_layer):
			out = nn.utils.rnn.pack_padded_sequence(torch.cat((hidden_states, context_vectors), dim=2), batch_first=True, lengths=inp_len, enforce_sorted=False)
			out, _ = getattr(self, 'lstm_{}'.format(index))(out)
			out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0]
			hidden_states = getattr(self, 'dropout_{}'.format(index))(torch.add(out, hidden_states))
		return hidden_states

class GNMT_Decoder_Layer(nn.Module):
	"""
	Google Neural Machine Translation - Decoder
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, device=None, **kwargs):
		super(GNMT_Decoder_Layer, self).__init__(**kwargs)
		assert n_layer>=3

		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout
		self.device = device if device else torch.device('cpu')

		self.attention_calc = Additive_Attention_Layer(hidden_size)
		self.l1_lstm_cell = nn.LSTMCell(input_size=2*hidden_size, hidden_size=hidden_size, bias=True)
		self.l1_dropout = nn.Dropout(p=dropout)
		self.l2_lstm = nn.LSTM(input_size=2*hidden_size, hidden_size=hidden_size, batch_first=True, bias=True)
		self.l2_dropout = nn.Dropout(p=dropout)
		self.res_attn_lstm = Res_Attn_LSTM_Layer(n_layer-2, hidden_size, dropout=dropout)

	def get_attention_mask(self, inp_len, batch_size, seq_len):
		mask = np.ones((batch_size, seq_len))
		for index, l in enumerate(inp_len):
			mask[index,:l] = 0
		mask *= -1e9
		return torch.from_numpy(mask).float().to(self.device)

	def forward(self, enc_hidden_states, backward_hidden_state, backward_cell_state, inp_len):
		batch_size, seq_len, _ = enc_hidden_states.shape
		attention_mask = self.get_attention_mask(inp_len, batch_size, seq_len)
		enc_hidden_states = enc_hidden_states.permute(1,0,2)                                                                                          # (seq_len, batch_size, hidden_size)
		decoder_hidden_states_buf =  []
		decoder_context_vectors_buf = []
		decoder_h, decoder_c = backward_hidden_state, backward_cell_state
		for step in range(seq_len):
			inp = enc_hidden_states[step]                        
			context_vector = self.attention_calc(inp, enc_hidden_states.permute(1,0,2), attention_mask)                                               # (batch_size, hidden_size)
			decoder_context_vectors_buf.append(context_vector)
			inp = torch.cat((inp, context_vector), dim=1)                                                                                                    # (batch_size, 2*hidden_size)
			decoder_h, decoder_c = self.l1_lstm_cell(inp, (decoder_c, decoder_h))
			decoder_hidden_states_buf.append(decoder_h)
		decoder_context_vectors = torch.stack(decoder_context_vectors_buf, dim=1)                                                                     # (batch_size, seq_len, hidden_size)
		decoder_hidden_states = torch.stack(decoder_hidden_states_buf, dim=1)                                                                         # (batch_size, seq_len, hidden_size)
		decoder_hidden_states = self.l1_dropout(torch.cat((decoder_hidden_states, decoder_context_vectors), dim=2))                                   # (batch_size, seq_len, 2*hidden_size)
		decoder_hidden_states = nn.utils.rnn.pack_padded_sequence(decoder_hidden_states, batch_first=True, lengths=inp_len, enforce_sorted=False)
		decoder_hidden_states, _ = self.l2_lstm(decoder_hidden_states)
		decoder_hidden_states = nn.utils.rnn.pad_packed_sequence(decoder_hidden_states, batch_first=True, total_length=seq_len)[0]
		decoder_hidden_states = self.l2_dropout(decoder_hidden_states)
		decoder_hidden_states = self.res_attn_lstm(decoder_hidden_states, decoder_context_vectors, inp_len)                                                    # (batch_size, seq_len, hidden_size)
		return decoder_hidden_states

class GNMT_Extraction_Layer(nn.Module):
	"""
	Seq2Seq feature extration layer based on Google Neural Machine Translation.
	"""
	def __init__(self, embed_size, hidden_size, n_enc_layer, n_dec_layer, device=None, dropout=0.1, **kwargs):
		super(GNMT_Extraction_Layer, self).__init__(**kwargs)
		self.embed_size = embed_size
		self.hidden_size = hidden_size
		self.n_enc_layer = n_enc_layer
		self.n_dec_layer = n_dec_layer
		self.device = device if device else torch.device('cpu')
		self.dropout = dropout

		self.encoder = GNMT_Encoder_Layer(embed_size, n_enc_layer, hidden_size)
		self.decoder = GNMT_Decoder_Layer(n_dec_layer, hidden_size, device=self.device)

	def forward(self, inp, inp_len):
		encoder_hidden_states, backward_hidden_state, backward_cell_state = self.encoder(inp, inp_len)
		decoder_hidden_states = self.decoder(encoder_hidden_states, backward_hidden_state, backward_cell_state, inp_len)
		return decoder_hidden_states

class MLP_Classification_Layer(nn.Module):
	"""
	Multilayer Perception Classification Layer
	- Layer 1: Linear + Batchnorm + ReLU + Dropout
	- Layer 2: Linear + Batchnorm + ReLU + Dropout
	- Layer 3: Linear
	"""
	def __init__(self, inp_size, out_size, dropout=0.4, **kwargs):
		super(MLP_Classification_Layer, self).__init__(**kwargs)
		self.inp_size = inp_size
		self.out_size = out_size
		self.dropout = dropout
		
		self.mlp_1 = nn.Linear(inp_size, 2048)
		self.batchnorm_1 = nn.BatchNorm1d(2048)
		self.mlp_dropout_1 = nn.Dropout(p=dropout)
		self.mlp_2 = nn.Linear(2048, 1024)
		self.batchnorm_2 = nn.BatchNorm1d(1024)
		self.mlp_dropout_2 = nn.Dropout(p=dropout)
		self.mlp_3 = nn.Linear(1024, out_size)
		
	def forward(self, inp):
		mlp_out = self.mlp_1(inp)                                                         # (batch_size, 1024)
		mlp_out = self.mlp_dropout_1(F.relu(self.batchnorm_1(mlp_out)))                   # (batch_size, 1024)
		mlp_out = self.mlp_2(mlp_out)                                                     # (batch_size, 512)
		mlp_out = self.mlp_dropout_2(F.relu(self.batchnorm_2(mlp_out)))                   # (batch_size, 512)
		mlp_out = self.mlp_3(mlp_out)                                                     # (batch_size, out_size)
		return mlp_out   

class Multi_Seq_GNMT_Classifier(nn.Module):
	"""
	Use GNMT for Seq2Seq feature extraction, apply max pooling & pick last state, then use multilayer perception for classification
	- Multi sequence input version
	"""
	def __init__(self, out_size, embed_size, hidden_size, n_enc_layer, n_dec_layer, device=None, rnn_dropout=0.1, dnn_dropout=0.4, **kwargs):
		super(Multi_Seq_GNMT_Classifier, self).__init__()
		assert isinstance(embed_size, list) and isinstance(hidden_size, list) and len(embed_size)==len(hidden_size)
		self.out_size = out_size
		self.embed_size = embed_size
		self.hidden_size = hidden_size
		self.n_enc_layer = n_enc_layer
		self.n_dec_layer = n_dec_layer
		self.device = device if device else torch.device('cpu')
		self.rnn_dropout = rnn_dropout
		self.dnn_dropout = dnn_dropout

		self.n_extraction = len(embed_size)
		self.mlp_inp_size = sum(map(lambda x:2*x, hidden_size))

		for index, (e_size, h_size) in enumerate(zip(embed_size, hidden_size)):
			setattr(self, 'GNMT_layer_{}'.format(index), GNMT_Extraction_Layer(e_size, h_size, n_enc_layer, n_dec_layer, device=device, dropout=rnn_dropout))
		self.mlp_layer = MLP_Classification_Layer(self.mlp_inp_size, out_size, dropout=dnn_dropout)

	def forward(self, *args):
		assert len(args)==self.n_extraction+1
		out_buf, inp_len = [], args[-1]
		for index, inp in enumerate(args[:-1]):
			inp = getattr(self, 'GNMT_layer_{}'.format(index))(inp, inp_len)  
			max_pool_buf, last_buf = [], []
			for batch_idx, l in enumerate(inp_len):
				max_pool_buf.append(torch.max(inp[batch_idx,:l], dim=0)[0])
				last_buf.append(inp[batch_idx, l-1])
			out_buf.append(torch.cat((torch.stack(max_pool_buf, dim=0), torch.stack(last_buf, dim=0)), dim=1)) 
		out = torch.cat(out_buf, dim=1)
		out = self.mlp_layer(out)                                                     
		return out

In [20]:
model = Multi_Seq_GNMT_Classifier(10,[256,256,128,128],[256,256,256,256],8,8)
inp_1 = torch.ones(5, 10, 256).float()
inp_2 = torch.ones(5, 10, 128).float()
out = model(*[inp_1, inp_1, inp_2, inp_2, np.arange(1,6)])

In [21]:
out.shape

torch.Size([5, 10])

In [22]:
get_torch_module_num_of_parameter(model)

51541002

## GNMT

In [29]:
class LSTM_Attention_Extraction_Layer(nn.Module):
	"""
	LSTM attention feature extration layer, idea from Google Neural Machine Translation
	- Layer 1: BiLSTM + Dropout + Layernorm
	- Layer 2: LSTM with Residual Connection + Dropout + Layernorm
	- Layer 3: LSTM + Batchnorm + ReLU + Dropout
	"""
	def __init__(self, embed_size, enc_hidden_size, dec_hidden_size, rnn_dropout=0.1, dnn_dropout=0.4, **kwargs):
		super(LSTM_Attention_Extraction_Layer, self).__init__(**kwargs)
		self.embed_size = embed_size
		self.enc_hidden_size = enc_hidden_size
		self.dec_hidden_size = dec_hidden_size
		self.rnn_dropout = rnn_dropout
		self.dnn_dropout = dnn_dropout

		self.enc_bilstm_1 = nn.LSTM(input_size=embed_size, hidden_size=enc_hidden_size, batch_first=True, bias=True, bidirectional=True)
		self.rnn_dropout_1 = nn.Dropout(p=rnn_dropout)
		self.enc_lstm_2 = nn.LSTM(input_size=enc_hidden_size*2, hidden_size=enc_hidden_size, bias=True)
		self.rnn_dropout_2 = nn.Dropout(p=rnn_dropout)
		self.enc_lstm_3 = nn.LSTM(input_size=enc_hidden_size, hidden_size=enc_hidden_size, bias=True)
		self.rnn_dropout_3 = nn.Dropout(p=rnn_dropout)
		self.enc_lstm_4 = nn.LSTM(input_size=enc_hidden_size, hidden_size=enc_hidden_size, bias=True)
		self.rnn_dropout_4 = nn.Dropout(p=rnn_dropout)


	def forward(self, inp_embed, inp_last_idx):
		batch_size = inp_embed.shape[0]
		inp_embed = torch.nn.utils.rnn.pack_padded_sequence(inp_embed, batch_first=True, lengths=inp_last_idx+1, enforce_sorted=False)
		out, (h, c) = self.enc_bilstm_1(inp_embed)
		backward_hidden_state = h.view(1, 2, batch_size, self.enc_hidden_size)[:,1,:,:]                                                # (num_direction, batch_size, enc_hidden_size)
		backward_cell_state = c.view(1, 2, batch_size, self.enc_hidden_size)[:,1,:,:]                                                  # (num_direction, batch_size, enc_hidden_size)
		out_unpacked = self.rnn_dropout_1(nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0])                                  # (batch_size, seq_len, 2*enc_hidden_size)
		out = torch.nn.utils.rnn.pack_padded_sequence(out_unpacked, batch_first=True, lengths=inp_last_idx+1, enforce_sorted=False)
		out, _ = self.enc_lstm_2(out)
		out_unpacked = self.rnn_dropout_2(nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0])                                  # (batch_size, seq_len, enc_hidden_size)
		out = torch.nn.utils.rnn.pack_padded_sequence(out_unpacked, batch_first=True, lengths=inp_last_idx+1, enforce_sorted=False)
		out, _ = self.enc_lstm_3(out)
		out_unpacked = self.rnn_dropout_3(torch.add(nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0], out_unpacked))         # (batch_size, seq_len, enc_hidden_size)
		return out_unpacked  

In [31]:
model = LSTM_Attention_Extraction_Layer(128, 128, 128)
inp = torch.ones(10, 10, 128).float()
model(inp, np.arange(10)).shape

torch.Size([10, 10, 128])

In [76]:
class Additive_Attention(nn.Module):
	"""
	Additive attention used in GNMT
	"""
	def __init__(self, hidden_size, **kwargs):
		super(Additive_Attention, self).__init__(**kwargs)
		self.hidden_size = hidden_size

		self.W = nn.Linear(hidden_size*2, hidden_size)
		self.tanh = nn.Tanh()
		self.V = nn.Parameter(torch.Tensor(1, hidden_size))
		self.softmax = nn.Softmax(dim=2)

		nn.init.normal_(self.V, 0, 0.1)

	def forward(self, query, values, mask):
		"""
		: query:  (batch_size, hidden_size)
		: values: (batch_size, seq_len, hidden_size)
		: mask:   (batch_size, seq_len)
		"""
		batch_size, seq_len, hidden_size = values.shape

		query = query.unsqueeze(1).expand(-1, seq_len, -1)
		score = self.tanh(self.W(torch.cat((query, values), dim=2)))                              # (batch_size, seq_len, hidden_size)
		score = torch.bmm(self.V.squeeze(1).expand(batch_size, -1, -1), score.permute(0,2,1))     # (batch_size, 1, seq_len)
		score = self.softmax(torch.add(score, mask.unsqueeze(1)))                                 # (batch_size, 1, seq_len)
		context = torch.bmm(score, values).squeeze(1)                                             # (batch_size, hidden_size)

		return context

In [86]:
p1 = np.random.standard_normal((2, 10, 10))

model = Additive_Attention(10)
query = torch.from_numpy(np.random.standard_normal((2, 10))).float()
values_1 = torch.from_numpy(np.concatenate([p1, np.random.standard_normal((2, 10, 10))], axis=1)).float()
values_2 = torch.from_numpy(np.concatenate([p1, np.random.standard_normal((2, 10, 10))], axis=1)).float()
mask = torch.from_numpy(np.concatenate([np.zeros((2,9)), np.ones((2,11))], axis=1)).float()*-1e9
s1 = model(query, values_1, mask)    
s2 = model(query, values_2, mask)

In [158]:
help(torch.cat)

Help on built-in function cat:

cat(...)
    cat(tensors, dim=0, out=None) -> Tensor
    
    Concatenates the given sequence of :attr:`seq` tensors in the given dimension.
    All tensors must either have the same shape (except in the concatenating
    dimension) or be empty.
    
    :func:`torch.cat` can be seen as an inverse operation for :func:`torch.split`
    and :func:`torch.chunk`.
    
    :func:`torch.cat` can be best understood via examples.
    
    Args:
        tensors (sequence of Tensors): any python sequence of tensors of the same type.
            Non-empty tensors provided must have the same shape, except in the
            cat dimension.
        dim (int, optional): the dimension over which the tensors are concatenated
        out (Tensor, optional): the output tensor.
    
    Example::
    
        >>> x = torch.randn(2, 3)
        >>> x
        tensor([[ 0.6580, -1.0969, -0.4614],
                [-0.1034, -0.5790,  0.1497]])
        >>> torch.cat((x, x, x), 0)


In [157]:
get_attention_mask(np.arange(1,6),5,10).shape

torch.Size([5, 10])

In [88]:
s1.shape

torch.Size([2, 10])

In [144]:
class Res_LSTM_Layer(nn.Module):
	"""
	Multi-layer unidirectional with residual connection.
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(Res_LSTM_Layer, self).__init__(**kwargs)
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		for index in range(n_layer):
			setattr(self, 'lstm_{}'.format(index), nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True, bias=True))
			setattr(self, 'dropout_{}'.format(index), nn.Dropout(p=dropout))

	def forward(self, inp, inp_len):
		_, total_length, _ = inp.shape
		for index in range(self.n_layer):
			out = nn.utils.rnn.pack_padded_sequence(inp, batch_first=True, lengths=inp_len, enforce_sorted=False)
			out, _ = getattr(self, 'lstm_{}'.format(index))(out)
			out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0]
			inp = getattr(self, 'dropout_{}'.format(index))(torch.add(out, inp))
		return inp

class GNMT_Encoder_Layer(nn.Module):
	"""
	Google Neural Machine Translation - Encoder
	"""
	def __init__(self, input_size, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(GNMT_Encoder_Layer, self).__init__(**kwargs)
		assert n_layer >= 3

		self.input_size = input_size
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		self.l1_bilstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True, bias=True, bidirectional=True)
		self.l1_dropout = nn.Dropout(p=dropout)
		self.l2_lstm = nn.LSTM(input_size=input_size*2, hidden_size=hidden_size, bias=True)
		self.l2_dropout = nn.Dropout(p=dropout)
		self.res_lstm = Res_LSTM_Layer(n_layer-2, hidden_size, dropout=dropout)

	def forward(self, inp, inp_len):
		batch_size, total_length, _ = inp.shape
		inp = nn.utils.rnn.pack_padded_sequence(inp, batch_first=True, lengths=inp_len, enforce_sorted=False)
		out, (h, c) = self.l1_bilstm(inp)
		backward_hidden_state = h.view(1, 2, batch_size, self.hidden_size)[:,1,:,:].squeeze(0)              # (num_direction, batch_size, enc_hidden_size)
		backward_cell_state = c.view(1, 2, batch_size, self.hidden_size)[:,1,:,:].squeeze(0)                # (num_direction, batch_size, enc_hidden_size)
		out = self.l1_dropout(nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0])
		out = nn.utils.rnn.pack_padded_sequence(out, batch_first=True, lengths=inp_len, enforce_sorted=False)
		out, _ = self.l2_lstm(out)
		out = self.l2_dropout(nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0])
		out = self.res_lstm(out, inp_len)
		return out, backward_hidden_state, backward_cell_state

In [145]:
model = Res_LSTM_Layer(1,128)
inp = torch.ones(5, 10, 128).float()
out = model(inp, np.arange(1,6))

In [146]:
out.shape

torch.Size([5, 10, 128])

In [147]:
model = GNMT_Encoder_Layer(128, 6, 128)
inp = torch.ones(5, 10, 128).float()
out = model(inp, np.arange(1,6))

In [150]:
out[0].shape

torch.Size([5, 10, 128])

In [154]:
out[0].permute(1,0,2)[0].shape

torch.Size([5, 128])

In [177]:
class Additive_Attention_Layer(nn.Module):
	"""
	Additive attention used in GNMT
	"""
	def __init__(self, hidden_size, **kwargs):
		super(Additive_Attention_Layer, self).__init__(**kwargs)
		self.hidden_size = hidden_size

		self.W = nn.Linear(hidden_size*2, hidden_size)
		self.tanh = nn.Tanh()
		self.V = nn.Parameter(torch.Tensor(1, hidden_size))
		self.softmax = nn.Softmax(dim=2)

		nn.init.normal_(self.V, 0, 0.1)

	def forward(self, query, values, mask):
		"""
		: query:  (batch_size, hidden_size)
		: values: (batch_size, seq_len, hidden_size)
		: mask:   (batch_size, seq_len)
		"""
		batch_size, seq_len, hidden_size = values.shape

		query = query.unsqueeze(1).expand(-1, seq_len, -1)
		score = self.tanh(self.W(torch.cat((query, values), dim=2)))                              # (batch_size, seq_len, hidden_size)
		score = torch.bmm(self.V.squeeze(1).expand(batch_size, -1, -1), score.permute(0,2,1))     # (batch_size, 1, seq_len)
		score = self.softmax(torch.add(score, mask.unsqueeze(1)))                                 # (batch_size, 1, seq_len)
		context = torch.bmm(score, values).squeeze(1)                                             # (batch_size, hidden_size)

		return context

In [188]:
class Res_Attn_LSTM_Layer(nn.Module):
	"""
	Multi-layer unidirectional LSTM with residual connection and attention.
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(Res_Attn_LSTM_Layer, self).__init__(**kwargs)
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		for index in range(n_layer):
			setattr(self, 'lstm_{}'.format(index), nn.LSTM(input_size=2*hidden_size, hidden_size=hidden_size, batch_first=True, bias=True))
			setattr(self, 'dropout_{}'.format(index), nn.Dropout(p=dropout))

	def forward(self, hidden_states, context_vectors, inp_len):
		_, total_length, _ = hidden_states.shape
		for index in range(self.n_layer):
			out = nn.utils.rnn.pack_padded_sequence(torch.cat((hidden_states, context_vectors), dim=2), batch_first=True, lengths=inp_len, enforce_sorted=False)
			out, _ = getattr(self, 'lstm_{}'.format(index))(out)
			out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0]
			hidden_states = getattr(self, 'dropout_{}'.format(index))(torch.add(out, hidden_states))
		return hidden_states

In [197]:
model = Res_Attn_LSTM_Layer(3, 128)
inp = torch.ones(5,10,128).float()
con = torch.ones(5,10,128).float()
out = model(inp, con, np.arange(1,6))

In [198]:
get_torch_module_num_of_parameter(model)

592896

In [201]:
class Additive_Attention_Layer(nn.Module):
	"""
	Additive attention used in GNMT
	"""
	def __init__(self, hidden_size, **kwargs):
		super(Additive_Attention_Layer, self).__init__(**kwargs)
		self.hidden_size = hidden_size

		self.W = nn.Linear(hidden_size*2, hidden_size)
		self.tanh = nn.Tanh()
		self.V = nn.Parameter(torch.Tensor(1, hidden_size))
		self.softmax = nn.Softmax(dim=2)

		nn.init.normal_(self.V, 0, 0.1)

	def forward(self, query, values, mask):
		"""
		: query:  (batch_size, hidden_size)
		: values: (batch_size, seq_len, hidden_size)
		: mask:   (batch_size, seq_len)
		"""
		print(query.shape, values.shape, mask.shape)
		batch_size, seq_len, hidden_size = values.shape

		query = query.unsqueeze(1).expand(-1, seq_len, -1)
		score = self.tanh(self.W(torch.cat((query, values), dim=2)))                              # (batch_size, seq_len, hidden_size)
		score = torch.bmm(self.V.squeeze(1).expand(batch_size, -1, -1), score.permute(0,2,1))     # (batch_size, 1, seq_len)
		score = self.softmax(torch.add(score, mask.unsqueeze(1)))                                 # (batch_size, 1, seq_len)
		context = torch.bmm(score, values).squeeze(1)                                             # (batch_size, hidden_size)

		return context

class Res_Attn_LSTM_Layer(nn.Module):
	"""
	Multi-layer unidirectional LSTM with residual connection and attention.
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(Res_Attn_LSTM_Layer, self).__init__(**kwargs)
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		for index in range(n_layer):
			setattr(self, 'lstm_{}'.format(index), nn.LSTM(input_size=2*hidden_size, hidden_size=hidden_size, batch_first=True, bias=True))
			setattr(self, 'dropout_{}'.format(index), nn.Dropout(p=dropout))

	def forward(self, hidden_states, context_vectors, inp_len):
		_, total_length, _ = hidden_states.shape
		for index in range(self.n_layer):
			out = nn.utils.rnn.pack_padded_sequence(torch.cat((hidden_states, context_vectors), dim=2), batch_first=True, lengths=inp_len, enforce_sorted=False)
			out, _ = getattr(self, 'lstm_{}'.format(index))(out)
			out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0]
			hidden_states = getattr(self, 'dropout_{}'.format(index))(torch.add(out, hidden_states))
		return hidden_states

class GNMT_Decoder_Layer(nn.Module):
	"""
	Google Neural Machine Translation - Decoder
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, device=torch.device('cpu'), **kwargs):
		super(GNMT_Decoder_Layer, self).__init__(**kwargs)
		assert n_layer>=3

		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout
		self.device = device

		self.attention_calc = Additive_Attention_Layer(hidden_size)
		self.l1_lstm_cell = nn.LSTMCell(input_size=2*hidden_size, hidden_size=hidden_size, bias=True)
		self.l1_dropout = nn.Dropout(p=dropout)
		self.l2_lstm = nn.LSTM(input_size=2*hidden_size, hidden_size=hidden_size, batch_first=True, bias=True)
		self.l2_dropout = nn.Dropout(p=dropout)
		self.res_attn_lstm = Res_Attn_LSTM_Layer(n_layer-2, hidden_size, dropout=dropout)

	def get_attention_mask(self, inp_len, batch_size, seq_len):
		mask = np.ones((batch_size, seq_len))
		for index, l in enumerate(inp_len):
			mask[index,:l] = 0
		mask *= -1e9
		return torch.from_numpy(mask).float().to(self.device)

	def forward(self, enc_hidden_states, backward_hidden_state, backward_cell_state, inp_len):
		batch_size, seq_len, _ = enc_hidden_states.shape
		attention_mask = self.get_attention_mask(inp_len, batch_size, seq_len)
		enc_hidden_states = enc_hidden_states.permute(1,0,2)                                                                                          # (seq_len, batch_size, hidden_size)
		decoder_hidden_states_buf =  []
		decoder_context_vectors_buf = []
		decoder_h, decoder_c = backward_hidden_state, backward_cell_state
		for step in range(seq_len):
			inp = enc_hidden_states[step]                        
			context_vector = self.attention_calc(inp, enc_hidden_states.permute(1,0,2), attention_mask)                                               # (batch_size, hidden_size)
			decoder_context_vectors_buf.append(context_vector)
			inp = torch.cat((inp, context_vector), dim=1)                                                                                                    # (batch_size, 2*hidden_size)
			decoder_h, decoder_c = self.l1_lstm_cell(inp, (decoder_c, decoder_h))
			decoder_hidden_states_buf.append(decoder_h)
		decoder_context_vectors = torch.stack(decoder_context_vectors_buf, dim=1)                                                                     # (batch_size, seq_len, hidden_size)
		decoder_hidden_states = torch.stack(decoder_hidden_states_buf, dim=1)                                                                         # (batch_size, seq_len, hidden_size)
		decoder_hidden_states = self.l1_dropout(torch.cat((decoder_hidden_states, decoder_context_vectors), dim=2))                                   # (batch_size, seq_len, 2*hidden_size)
		decoder_hidden_states = nn.utils.rnn.pack_padded_sequence(decoder_hidden_states, batch_first=True, lengths=inp_len, enforce_sorted=False)
		decoder_hidden_states, _ = self.l2_lstm(decoder_hidden_states)
		decoder_hidden_states = nn.utils.rnn.pad_packed_sequence(decoder_hidden_states, batch_first=True, total_length=seq_len)[0]
		decoder_hidden_states = self.l2_dropout(decoder_hidden_states)
		decoder_hidden_states = self.res_attn_lstm(decoder_hidden_states, decoder_context_vectors, inp_len)                                                    # (batch_size, seq_len, hidden_size)
		return decoder_hidden_states

In [202]:
model = GNMT_Decoder_Layer(6, 128)
inp = torch.ones(5, 10, 128).float()
h = torch.ones(5,128).float()
c = torch.ones(5,128).float()
out = model(inp, h, c, np.arange(1,6))

torch.Size([5, 128]) torch.Size([5, 10, 128]) torch.Size([5, 10])
torch.Size([5, 128]) torch.Size([5, 10, 128]) torch.Size([5, 10])
torch.Size([5, 128]) torch.Size([5, 10, 128]) torch.Size([5, 10])
torch.Size([5, 128]) torch.Size([5, 10, 128]) torch.Size([5, 10])
torch.Size([5, 128]) torch.Size([5, 10, 128]) torch.Size([5, 10])
torch.Size([5, 128]) torch.Size([5, 10, 128]) torch.Size([5, 10])
torch.Size([5, 128]) torch.Size([5, 10, 128]) torch.Size([5, 10])
torch.Size([5, 128]) torch.Size([5, 10, 128]) torch.Size([5, 10])
torch.Size([5, 128]) torch.Size([5, 10, 128]) torch.Size([5, 10])
torch.Size([5, 128]) torch.Size([5, 10, 128]) torch.Size([5, 10])


In [203]:
out.shape

torch.Size([5, 10, 128])

In [204]:
get_torch_module_num_of_parameter(model)

1218816

In [212]:
import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

class Res_LSTM_Layer(nn.Module):
	"""
	Multi-layer unidirectional LSTM with residual connection.
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(Res_LSTM_Layer, self).__init__(**kwargs)
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		for index in range(n_layer):
			setattr(self, 'lstm_{}'.format(index), nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True, bias=True))
			setattr(self, 'dropout_{}'.format(index), nn.Dropout(p=dropout))

	def forward(self, inp, inp_len):
		_, total_length, _ = inp.shape
		for index in range(self.n_layer):
			out = nn.utils.rnn.pack_padded_sequence(inp, batch_first=True, lengths=inp_len, enforce_sorted=False)
			out, _ = getattr(self, 'lstm_{}'.format(index))(out)
			out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0]
			inp = getattr(self, 'dropout_{}'.format(index))(torch.add(out, inp))
		return inp

class GNMT_Encoder_Layer(nn.Module):
	"""
	Google Neural Machine Translation - Encoder
	"""
	def __init__(self, input_size, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(GNMT_Encoder_Layer, self).__init__(**kwargs)
		assert n_layer >= 3

		self.input_size = input_size
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		self.l1_bilstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True, bias=True, bidirectional=True)
		self.l1_dropout = nn.Dropout(p=dropout)
		self.l2_lstm = nn.LSTM(input_size=hidden_size*2, hidden_size=hidden_size, bias=True)
		self.l2_dropout = nn.Dropout(p=dropout)
		self.res_lstm = Res_LSTM_Layer(n_layer-2, hidden_size, dropout=dropout)

	def forward(self, inp, inp_len):
		batch_size, total_length, _ = inp.shape
		inp = nn.utils.rnn.pack_padded_sequence(inp, batch_first=True, lengths=inp_len, enforce_sorted=False)
		out, (h, c) = self.l1_bilstm(inp)
		backward_hidden_state = h.view(1, 2, batch_size, self.hidden_size)[:,1,:,:].squeeze(0)              # (num_direction, batch_size, enc_hidden_size)
		backward_cell_state = c.view(1, 2, batch_size, self.hidden_size)[:,1,:,:].squeeze(0)                # (num_direction, batch_size, enc_hidden_size)
		out = self.l1_dropout(nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0])
		out = nn.utils.rnn.pack_padded_sequence(out, batch_first=True, lengths=inp_len, enforce_sorted=False)
		out, _ = self.l2_lstm(out)
		out = self.l2_dropout(nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0])
		out = self.res_lstm(out, inp_len)
		return out, backward_hidden_state, backward_cell_state

class Additive_Attention_Layer(nn.Module):
	"""
	Additive attention used in GNMT
	"""
	def __init__(self, hidden_size, **kwargs):
		super(Additive_Attention_Layer, self).__init__(**kwargs)
		self.hidden_size = hidden_size

		self.W = nn.Linear(hidden_size*2, hidden_size)
		self.tanh = nn.Tanh()
		self.V = nn.Parameter(torch.Tensor(1, hidden_size))
		self.softmax = nn.Softmax(dim=2)

		nn.init.normal_(self.V, 0, 0.1)

	def forward(self, query, values, mask):
		"""
		: query:  (batch_size, hidden_size)
		: values: (batch_size, seq_len, hidden_size)
		: mask:   (batch_size, seq_len)
		"""
		batch_size, seq_len, hidden_size = values.shape

		query = query.unsqueeze(1).expand(-1, seq_len, -1)
		score = self.tanh(self.W(torch.cat((query, values), dim=2)))                              # (batch_size, seq_len, hidden_size)
		score = torch.bmm(self.V.squeeze(1).expand(batch_size, -1, -1), score.permute(0,2,1))     # (batch_size, 1, seq_len)
		score = self.softmax(torch.add(score, mask.unsqueeze(1)))                                 # (batch_size, 1, seq_len)
		context = torch.bmm(score, values).squeeze(1)                                             # (batch_size, hidden_size)

		return context

class Res_Attn_LSTM_Layer(nn.Module):
	"""
	Multi-layer unidirectional LSTM with residual connection and attention.
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(Res_Attn_LSTM_Layer, self).__init__(**kwargs)
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		for index in range(n_layer):
			setattr(self, 'lstm_{}'.format(index), nn.LSTM(input_size=2*hidden_size, hidden_size=hidden_size, batch_first=True, bias=True))
			setattr(self, 'dropout_{}'.format(index), nn.Dropout(p=dropout))

	def forward(self, hidden_states, context_vectors, inp_len):
		_, total_length, _ = hidden_states.shape
		for index in range(self.n_layer):
			out = nn.utils.rnn.pack_padded_sequence(torch.cat((hidden_states, context_vectors), dim=2), batch_first=True, lengths=inp_len, enforce_sorted=False)
			out, _ = getattr(self, 'lstm_{}'.format(index))(out)
			out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0]
			hidden_states = getattr(self, 'dropout_{}'.format(index))(torch.add(out, hidden_states))
		return hidden_states

class GNMT_Decoder_Layer(nn.Module):
	"""
	Google Neural Machine Translation - Decoder
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, device=None, **kwargs):
		super(GNMT_Decoder_Layer, self).__init__(**kwargs)
		assert n_layer>=3

		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout
		self.device = device if device else torch.device('cpu')

		self.attention_calc = Additive_Attention_Layer(hidden_size)
		self.l1_lstm_cell = nn.LSTMCell(input_size=2*hidden_size, hidden_size=hidden_size, bias=True)
		self.l1_dropout = nn.Dropout(p=dropout)
		self.l2_lstm = nn.LSTM(input_size=2*hidden_size, hidden_size=hidden_size, batch_first=True, bias=True)
		self.l2_dropout = nn.Dropout(p=dropout)
		self.res_attn_lstm = Res_Attn_LSTM_Layer(n_layer-2, hidden_size, dropout=dropout)

	def get_attention_mask(self, inp_len, batch_size, seq_len):
		mask = np.ones((batch_size, seq_len))
		for index, l in enumerate(inp_len):
			mask[index,:l] = 0
		mask *= -1e9
		return torch.from_numpy(mask).float().to(self.device)

	def forward(self, enc_hidden_states, backward_hidden_state, backward_cell_state, inp_len):
		batch_size, seq_len, _ = enc_hidden_states.shape
		attention_mask = self.get_attention_mask(inp_len, batch_size, seq_len)
		enc_hidden_states = enc_hidden_states.permute(1,0,2)                                                                                          # (seq_len, batch_size, hidden_size)
		decoder_hidden_states_buf =  []
		decoder_context_vectors_buf = []
		decoder_h, decoder_c = backward_hidden_state, backward_cell_state
		for step in range(seq_len):
			inp = enc_hidden_states[step]                        
			context_vector = self.attention_calc(inp, enc_hidden_states.permute(1,0,2), attention_mask)                                               # (batch_size, hidden_size)
			decoder_context_vectors_buf.append(context_vector)
			inp = torch.cat((inp, context_vector), dim=1)                                                                                                    # (batch_size, 2*hidden_size)
			decoder_h, decoder_c = self.l1_lstm_cell(inp, (decoder_c, decoder_h))
			decoder_hidden_states_buf.append(decoder_h)
		decoder_context_vectors = torch.stack(decoder_context_vectors_buf, dim=1)                                                                     # (batch_size, seq_len, hidden_size)
		decoder_hidden_states = torch.stack(decoder_hidden_states_buf, dim=1)                                                                         # (batch_size, seq_len, hidden_size)
		decoder_hidden_states = self.l1_dropout(torch.cat((decoder_hidden_states, decoder_context_vectors), dim=2))                                   # (batch_size, seq_len, 2*hidden_size)
		decoder_hidden_states = nn.utils.rnn.pack_padded_sequence(decoder_hidden_states, batch_first=True, lengths=inp_len, enforce_sorted=False)
		decoder_hidden_states, _ = self.l2_lstm(decoder_hidden_states)
		decoder_hidden_states = nn.utils.rnn.pad_packed_sequence(decoder_hidden_states, batch_first=True, total_length=seq_len)[0]
		decoder_hidden_states = self.l2_dropout(decoder_hidden_states)
		decoder_hidden_states = self.res_attn_lstm(decoder_hidden_states, decoder_context_vectors, inp_len)                                                    # (batch_size, seq_len, hidden_size)
		return decoder_hidden_states

class GNMT_Extraction_Layer(nn.Module):
	"""
	Seq2Seq feature extration layer based on Google Neural Machine Translation.
	"""
	def __init__(self, embed_size, hidden_size, n_enc_layer, n_dec_layer, device=None, dropout=0.1, **kwargs):
		super(GNMT_Extraction_Layer, self).__init__(**kwargs)
		self.embed_size = embed_size
		self.hidden_size = hidden_size
		self.n_enc_layer = n_enc_layer
		self.n_dec_layer = n_dec_layer
		self.device = device if device else torch.device('cpu')
		self.dropout = dropout

		self.encoder = GNMT_Encoder_Layer(embed_size, n_enc_layer, hidden_size)
		self.decoder = GNMT_Decoder_Layer(n_dec_layer, hidden_size, device=self.device)

	def forward(self, inp, inp_len):
		encoder_hidden_states, backward_hidden_state, backward_cell_state = self.encoder(inp, inp_len)
		decoder_hidden_states = self.decoder(encoder_hidden_states, backward_hidden_state, backward_cell_state, inp_len)
		return decoder_hidden_states

In [216]:
model = GNMT_Extraction_Layer(128, 128, 8,8)
inp = torch.ones(5, 10, 128).float()
out = model(inp, np.arange(1,6))

In [217]:
out.shape

torch.Size([5, 10, 128])

In [218]:
get_torch_module_num_of_parameter(model)

2868480

In [236]:
import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

class Res_LSTM_Layer(nn.Module):
	"""
	Multi-layer unidirectional LSTM with residual connection.
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(Res_LSTM_Layer, self).__init__(**kwargs)
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		for index in range(n_layer):
			setattr(self, 'lstm_{}'.format(index), nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True, bias=True))
			setattr(self, 'dropout_{}'.format(index), nn.Dropout(p=dropout))

	def forward(self, inp, inp_len):
		_, total_length, _ = inp.shape
		for index in range(self.n_layer):
			out = nn.utils.rnn.pack_padded_sequence(inp, batch_first=True, lengths=inp_len, enforce_sorted=False)
			out, _ = getattr(self, 'lstm_{}'.format(index))(out)
			out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0]
			inp = getattr(self, 'dropout_{}'.format(index))(torch.add(out, inp))
		return inp

class GNMT_Encoder_Layer(nn.Module):
	"""
	Google Neural Machine Translation - Encoder
	"""
	def __init__(self, input_size, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(GNMT_Encoder_Layer, self).__init__(**kwargs)
		assert n_layer >= 3

		self.input_size = input_size
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		self.l1_bilstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True, bias=True, bidirectional=True)
		self.l1_dropout = nn.Dropout(p=dropout)
		self.l2_lstm = nn.LSTM(input_size=hidden_size*2, hidden_size=hidden_size, bias=True)
		self.l2_dropout = nn.Dropout(p=dropout)
		self.res_lstm = Res_LSTM_Layer(n_layer-2, hidden_size, dropout=dropout)

	def forward(self, inp, inp_len):
		batch_size, total_length, _ = inp.shape
		inp = nn.utils.rnn.pack_padded_sequence(inp, batch_first=True, lengths=inp_len, enforce_sorted=False)
		out, (h, c) = self.l1_bilstm(inp)
		backward_hidden_state = h.view(1, 2, batch_size, self.hidden_size)[:,1,:,:].squeeze(0)              # (num_direction, batch_size, enc_hidden_size)
		backward_cell_state = c.view(1, 2, batch_size, self.hidden_size)[:,1,:,:].squeeze(0)                # (num_direction, batch_size, enc_hidden_size)
		out = self.l1_dropout(nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0])
		out = nn.utils.rnn.pack_padded_sequence(out, batch_first=True, lengths=inp_len, enforce_sorted=False)
		out, _ = self.l2_lstm(out)
		out = self.l2_dropout(nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0])
		out = self.res_lstm(out, inp_len)
		return out, backward_hidden_state, backward_cell_state

class Additive_Attention_Layer(nn.Module):
	"""
	Additive attention used in GNMT
	"""
	def __init__(self, hidden_size, **kwargs):
		super(Additive_Attention_Layer, self).__init__(**kwargs)
		self.hidden_size = hidden_size

		self.W = nn.Linear(hidden_size*2, hidden_size)
		self.tanh = nn.Tanh()
		self.V = nn.Parameter(torch.Tensor(1, hidden_size))
		self.softmax = nn.Softmax(dim=2)

		nn.init.normal_(self.V, 0, 0.1)

	def forward(self, query, values, mask):
		"""
		: query:  (batch_size, hidden_size)
		: values: (batch_size, seq_len, hidden_size)
		: mask:   (batch_size, seq_len)
		"""
		batch_size, seq_len, hidden_size = values.shape

		query = query.unsqueeze(1).expand(-1, seq_len, -1)
		score = self.tanh(self.W(torch.cat((query, values), dim=2)))                              # (batch_size, seq_len, hidden_size)
		score = torch.bmm(self.V.squeeze(1).expand(batch_size, -1, -1), score.permute(0,2,1))     # (batch_size, 1, seq_len)
		score = self.softmax(torch.add(score, mask.unsqueeze(1)))                                 # (batch_size, 1, seq_len)
		context = torch.bmm(score, values).squeeze(1)                                             # (batch_size, hidden_size)

		return context

class Res_Attn_LSTM_Layer(nn.Module):
	"""
	Multi-layer unidirectional LSTM with residual connection and attention.
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, **kwargs):
		super(Res_Attn_LSTM_Layer, self).__init__(**kwargs)
		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout

		for index in range(n_layer):
			setattr(self, 'lstm_{}'.format(index), nn.LSTM(input_size=2*hidden_size, hidden_size=hidden_size, batch_first=True, bias=True))
			setattr(self, 'dropout_{}'.format(index), nn.Dropout(p=dropout))

	def forward(self, hidden_states, context_vectors, inp_len):
		_, total_length, _ = hidden_states.shape
		for index in range(self.n_layer):
			out = nn.utils.rnn.pack_padded_sequence(torch.cat((hidden_states, context_vectors), dim=2), batch_first=True, lengths=inp_len, enforce_sorted=False)
			out, _ = getattr(self, 'lstm_{}'.format(index))(out)
			out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=total_length)[0]
			hidden_states = getattr(self, 'dropout_{}'.format(index))(torch.add(out, hidden_states))
		return hidden_states

class GNMT_Decoder_Layer(nn.Module):
	"""
	Google Neural Machine Translation - Decoder
	"""
	def __init__(self, n_layer, hidden_size, dropout=0.1, device=None, **kwargs):
		super(GNMT_Decoder_Layer, self).__init__(**kwargs)
		assert n_layer>=3

		self.n_layer = n_layer
		self.hidden_size = hidden_size
		self.dropout = dropout
		self.device = device if device else torch.device('cpu')

		self.attention_calc = Additive_Attention_Layer(hidden_size)
		self.l1_lstm_cell = nn.LSTMCell(input_size=2*hidden_size, hidden_size=hidden_size, bias=True)
		self.l1_dropout = nn.Dropout(p=dropout)
		self.l2_lstm = nn.LSTM(input_size=2*hidden_size, hidden_size=hidden_size, batch_first=True, bias=True)
		self.l2_dropout = nn.Dropout(p=dropout)
		self.res_attn_lstm = Res_Attn_LSTM_Layer(n_layer-2, hidden_size, dropout=dropout)

	def get_attention_mask(self, inp_len, batch_size, seq_len):
		mask = np.ones((batch_size, seq_len))
		for index, l in enumerate(inp_len):
			mask[index,:l] = 0
		mask *= -1e9
		return torch.from_numpy(mask).float().to(self.device)

	def forward(self, enc_hidden_states, backward_hidden_state, backward_cell_state, inp_len):
		batch_size, seq_len, _ = enc_hidden_states.shape
		attention_mask = self.get_attention_mask(inp_len, batch_size, seq_len)
		enc_hidden_states = enc_hidden_states.permute(1,0,2)                                                                                          # (seq_len, batch_size, hidden_size)
		decoder_hidden_states_buf =  []
		decoder_context_vectors_buf = []
		decoder_h, decoder_c = backward_hidden_state, backward_cell_state
		for step in range(seq_len):
			inp = enc_hidden_states[step]                        
			context_vector = self.attention_calc(inp, enc_hidden_states.permute(1,0,2), attention_mask)                                               # (batch_size, hidden_size)
			decoder_context_vectors_buf.append(context_vector)
			inp = torch.cat((inp, context_vector), dim=1)                                                                                                    # (batch_size, 2*hidden_size)
			decoder_h, decoder_c = self.l1_lstm_cell(inp, (decoder_c, decoder_h))
			decoder_hidden_states_buf.append(decoder_h)
		decoder_context_vectors = torch.stack(decoder_context_vectors_buf, dim=1)                                                                     # (batch_size, seq_len, hidden_size)
		decoder_hidden_states = torch.stack(decoder_hidden_states_buf, dim=1)                                                                         # (batch_size, seq_len, hidden_size)
		decoder_hidden_states = self.l1_dropout(torch.cat((decoder_hidden_states, decoder_context_vectors), dim=2))                                   # (batch_size, seq_len, 2*hidden_size)
		decoder_hidden_states = nn.utils.rnn.pack_padded_sequence(decoder_hidden_states, batch_first=True, lengths=inp_len, enforce_sorted=False)
		decoder_hidden_states, _ = self.l2_lstm(decoder_hidden_states)
		decoder_hidden_states = nn.utils.rnn.pad_packed_sequence(decoder_hidden_states, batch_first=True, total_length=seq_len)[0]
		decoder_hidden_states = self.l2_dropout(decoder_hidden_states)
		decoder_hidden_states = self.res_attn_lstm(decoder_hidden_states, decoder_context_vectors, inp_len)                                                    # (batch_size, seq_len, hidden_size)
		return decoder_hidden_states

class GNMT_Extraction_Layer(nn.Module):
	"""
	Seq2Seq feature extration layer based on Google Neural Machine Translation.
	"""
	def __init__(self, embed_size, hidden_size, n_enc_layer, n_dec_layer, device=None, dropout=0.1, **kwargs):
		super(GNMT_Extraction_Layer, self).__init__(**kwargs)
		self.embed_size = embed_size
		self.hidden_size = hidden_size
		self.n_enc_layer = n_enc_layer
		self.n_dec_layer = n_dec_layer
		self.device = device if device else torch.device('cpu')
		self.dropout = dropout

		self.encoder = GNMT_Encoder_Layer(embed_size, n_enc_layer, hidden_size)
		self.decoder = GNMT_Decoder_Layer(n_dec_layer, hidden_size, device=self.device)

	def forward(self, inp, inp_len):
		encoder_hidden_states, backward_hidden_state, backward_cell_state = self.encoder(inp, inp_len)
		decoder_hidden_states = self.decoder(encoder_hidden_states, backward_hidden_state, backward_cell_state, inp_len)
		return decoder_hidden_states

class MLP_Classification_Layer(nn.Module):
	"""
	Multilayer Perception Classification Layer
	- Layer 1: Linear + Batchnorm + ReLU + Dropout
	- Layer 2: Linear + Batchnorm + ReLU + Dropout
	- Layer 3: Linear
	"""
	def __init__(self, inp_size, out_size, dropout=0.4, **kwargs):
		super(MLP_Classification_Layer, self).__init__(**kwargs)
		self.inp_size = inp_size
		self.out_size = out_size
		self.dropout = dropout
		
		self.mlp_1 = nn.Linear(inp_size, 1024)
		self.batchnorm_1 = nn.BatchNorm1d(1024)
		self.mlp_dropout_1 = nn.Dropout(p=dropout)
		self.mlp_2 = nn.Linear(1024, 512)
		self.batchnorm_2 = nn.BatchNorm1d(512)
		self.mlp_dropout_2 = nn.Dropout(p=dropout)
		self.mlp_3 = nn.Linear(512, out_size)
		
	def forward(self, inp):
		mlp_out = self.mlp_1(inp)                                                         # (batch_size, 1024)
		mlp_out = self.mlp_dropout_1(F.relu(self.batchnorm_1(mlp_out)))                   # (batch_size, 1024)
		mlp_out = self.mlp_2(mlp_out)                                                     # (batch_size, 512)
		mlp_out = self.mlp_dropout_2(F.relu(self.batchnorm_2(mlp_out)))                   # (batch_size, 512)
		mlp_out = self.mlp_3(mlp_out)                                                     # (batch_size, out_size)
		return mlp_out   

class GNMT_Classifier(nn.Module):
	"""
	Use GNMT for Seq2Seq feature extraction, apply max pooling & pick last state, then use multilayer perception for classification
	"""
	def __init__(self, out_size, embed_size, hidden_size, n_enc_layer, n_dec_layer, device=None, rnn_dropout=0.1, dnn_dropout=0.4, **kwargs):
		super(GNMT_Classifier, self).__init__()
		self.out_size = out_size
		self.embed_size = embed_size
		self.hidden_size = hidden_size
		self.n_enc_layer = n_enc_layer
		self.n_dec_layer = n_dec_layer
		self.device = device if device else torch.device('cpu')
		self.rnn_dropout = rnn_dropout
		self.dnn_dropout = dnn_dropout

		self.GNMT_layer = GNMT_Extraction_Layer(embed_size, hidden_size, n_enc_layer, n_dec_layer, device=device, dropout=rnn_dropout)
		self.mlp_layer = MLP_Classification_Layer(hidden_size*2, out_size, dropout=dnn_dropout)

	def forward(self, inp, inp_len):
		inp = self.GNMT_layer(inp, inp_len)                                                        # (batch_size, seq_len, hidden_size)
		max_pool_buf, last_buf = [], []
		for batch_idx, l in enumerate(inp_len):
			max_pool_buf.append(torch.max(inp[batch_idx,:l], dim=0)[0])
			last_buf.append(inp[batch_idx, l-1])
		out = torch.cat((torch.stack(max_pool_buf, dim=0), torch.stack(last_buf, dim=0)), dim=1)   # (batch_size, hidden_size*2)
		out = self.mlp_layer(out)                                                                 # (batch_size, out_size)
		return out

In [243]:
model = GNMT_Classifier(10,256,256,8,8)
inp = torch.ones(5, 10, 256).float()
out = model(inp, np.arange(1,6))

In [244]:
get_torch_module_num_of_parameter(model)

12496906

## Transformer Classifier

In [None]:
from transformers.modeling_bert import BertConfig, BertEncoder, BertAttention,BertSelfAttention,BertLayer,BertPooler

In [22]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class Positional_Encoding_Layer(nn.Module):
	"""
	Positional encoding using sine and cosine as described in "Attention is all you need".
	"""
	def __init__(self, d_model, max_seq_len=512, dropout=0.1):
		"""
		Formula:
		| PE(pos,2i) = sin(pos/10000**(2*i/d_model))
		| PE(pos,2i+1) = cos(pos/10000**(2*i/d_model))
		"""
		super(Positional_Encoding_Layer, self).__init__()
		self.d_model = d_model
		self.dropout = dropout
		self.max_seq_len = max_seq_len

		self.dropout_layer = nn.Dropout(p=dropout)
		pe = torch.zeros(max_seq_len, d_model)                                                       # (max_seq_len, d_model)
		position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)                      # (max_seq_len, 1)
		div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))     # (d_model/2)
		pe[:, 0::2] = torch.sin(position * div_term)
		pe[:, 1::2] = torch.cos(position * div_term) 
		pe = pe.unsqueeze(0).transpose(0, 1)                                                         # (max_seq_len, 1, d_model)
		self.register_buffer('pe', pe)

	def forward(self, inp):
		inp = inp + self.pe[:inp.size(0), :]                                                         # (n_step, batch_size, d_model)
		return self.dropout_layer(inp)

class Transformer_Encoder_Extraction_Layer(nn.Module):
	"""
	Transformer encoder as described in "Attention is all you need".
	"""
	def __init__(self, n_enc_layer, embed_size, n_head, intermediate_size, max_seq_len=512, dropout=0.1, **kwargs):
		super(Transformer_Encoder_Extraction_Layer, self).__init__(**kwargs)
		assert embed_size%n_head==0

		self.n_enc_layer = n_enc_layer
		self.embed_size = embed_size
		self.max_seq_len = max_seq_len
		self.n_head = n_head
		self.intermediate_size = intermediate_size
		self.dropout = dropout

		self.positional_encoder = Positional_Encoding_Layer(embed_size, max_seq_len=max_seq_len)
		transformer_encoder_layer = TransformerEncoderLayer(embed_size, n_head, dim_feedforward=intermediate_size, dropout=dropout)
		self.transformer_encoder = TransformerEncoder(transformer_encoder_layer, n_enc_layer)

		self._init_weights()

	def _init_weights(self):
		for p in self.parameters():
			if p.dim() > 1:
				xavier_uniform_(p)

	def forward(self, inp, inp_padding_mask=None):
		inp = inp * np.sqrt(self.embed_size)                                           # (batch_size, n_step, embed_size)
		inp = self.positional_encoder(inp.permute(1, 0, 2))                            # (n_step, batch_size, embed_size)
		out = self.transformer_encoder(inp, src_key_padding_mask=inp_padding_mask)     # (n_step, batch_size, embed_size)
		return out.permute(1, 0, 2)

class MLP_Classification_Layer(nn.Module):
	"""
	Multilayer Perception Classification Layer
	- Layer 1: Linear + Batchnorm + ReLU + Dropout
	- Layer 2: Linear + Batchnorm + ReLU + Dropout
	- Layer 3: Linear
	"""
	def __init__(self, inp_size, out_size, dropout=0.4, **kwargs):
		super(MLP_Classification_Layer, self).__init__(**kwargs)
		self.inp_size = inp_size
		self.out_size = out_size
		self.dropout = dropout
		
		self.mlp_1 = nn.Linear(inp_size, 1024)
		self.batchnorm_1 = nn.BatchNorm1d(1024)
		self.mlp_dropout_1 = nn.Dropout(p=dropout)
		self.mlp_2 = nn.Linear(1024, 512)
		self.batchnorm_2 = nn.BatchNorm1d(512)
		self.mlp_dropout_2 = nn.Dropout(p=dropout)
		self.mlp_3 = nn.Linear(512, out_size)
	
	def _init_weights(self):
		initrange = 0.1
		self.mlp_1.weight.data.uniform_(-initrange, initrange)
		self.mlp_1.bias.data.zero_()
		self.mlp_2.weight.data.uniform_(-initrange, initrange)
		self.mlp_2.bias.data.zero_()
		self.mlp_3.weight.data.uniform_(-initrange, initrange)
		self.mlp_3.bias.data.zero_()

	def forward(self, inp):
		mlp_out = self.mlp_1(inp)                                                         # (batch_size, 1024)
		mlp_out = self.mlp_dropout_1(F.relu(self.batchnorm_1(mlp_out)))                   # (batch_size, 1024)
		mlp_out = self.mlp_2(mlp_out)                                                     # (batch_size, 512)
		mlp_out = self.mlp_dropout_2(F.relu(self.batchnorm_2(mlp_out)))                   # (batch_size, 512)
		mlp_out = self.mlp_3(mlp_out)                                                     # (batch_size, out_size)
		return mlp_out   
	
class Transformer_Encoder_Classifier(nn.Module):
	"""
	Transformer Encoder + Multilayer Perception for Classification
	"""
	def __init__(self, embed_size, out_size, n_enc_layer, n_head, intermediate_size, device, transformer_dropout=0.1, mlp_dropout=0.4, **kwargs):
		super(Transformer_Encoder_Classifier, self).__init__(**kwargs)
		
		self.embed_size = embed_size
		self.out_size = out_size
		self.n_enc_layer = n_enc_layer
		self.n_head = n_head
		self.intermediate_size = intermediate_size
		self.device = device
		self.transformer_dropout = transformer_dropout
		self.mlp_dropout = mlp_dropout

		self.encoder_layer = Transformer_Encoder_Extraction_Layer(n_enc_layer, embed_size, n_head, intermediate_size, dropout=transformer_dropout)
		self.classification_layer = MLP_Classification_Layer(embed_size, out_size, dropout=mlp_dropout)

	def get_padding_mask(self, batch_size, seq_len, inp_last_idx):
		padding_mask = np.ones((batch_size, seq_len))
		for index, last_idx in enumerate(inp_last_idx):
			padding_mask[index,:last_idx+1] = 0
		return torch.from_numpy(padding_mask).bool().to(self.device)

	def forward(self, inp_embed, inp_last_idx):
		assert inp_embed.shape[0] == inp_last_idx.shape[0]
		batch_size = inp_embed.shape[0]
		seq_len = inp_embed.shape[1]
		inp_padding_mask = self.get_padding_mask(batch_size, seq_len, inp_last_idx)
		out = self.encoder_layer(inp_embed, inp_padding_mask=inp_padding_mask)               # (batch_size, n_step, embed_size)
		pooled_buf = []
		for index, last_idx in enumerate(inp_last_idx):
			pooled_buf.append(torch.max(out[index,:last_idx+1,:], dim=0)[0])
		out = torch.stack(pooled_buf)                                                        # (batch_size, embed_size)
		out = self.classification_layer(out)                                                 # (batch_size, out_size)
		return out

In [23]:
model = Transformer_Encoder_Classifier(256, 2, 6, 8, 2048, torch.device('cpu'))

In [24]:
p = np.random.standard_normal((100,6,256))
inp = torch.from_numpy(np.concatenate([p, np.zeros((100, 4, 256))], axis=1)).float()
inp_last_idx = np.array([5 for _ in range(100)])

In [25]:
model(inp, inp_last_idx)

tensor([[-2.4194e-01,  2.8054e-01],
        [ 2.4362e-02,  8.0539e-01],
        [-9.2631e-01,  2.7887e-01],
        [-8.7398e-01, -2.1272e-01],
        [-5.1911e-01,  6.4198e-01],
        [ 3.7743e-01,  4.2349e-01],
        [ 3.1638e-02,  1.1861e-01],
        [-3.1806e-01,  2.7993e-01],
        [ 8.0730e-01, -7.3080e-01],
        [-5.1810e-01, -3.3092e-01],
        [-1.1688e-02,  3.2414e-01],
        [-1.6236e-01, -5.7130e-01],
        [ 5.4187e-01, -7.0140e-01],
        [-1.1773e+00, -8.6084e-02],
        [-2.9260e-01,  1.8474e-01],
        [ 5.6447e-01,  3.4316e-01],
        [-1.6931e-01,  6.9761e-01],
        [-5.2122e-01, -5.3940e-01],
        [ 1.4284e+00, -3.9874e-02],
        [-5.8163e-01, -1.2760e-01],
        [ 1.0157e+00, -6.7378e-01],
        [ 9.3978e-01, -4.7628e-01],
        [ 2.1258e-01,  1.0124e+00],
        [ 6.5808e-01, -3.0539e-02],
        [ 3.1233e-01,  4.3065e-01],
        [ 7.2707e-01,  1.5000e-01],
        [ 4.3200e-01, -3.0156e-01],
        [ 2.6709e-01,  6.545

In [None]:
torch.ones(1,10).shape

In [218]:
torch.ones(1,10).squeeze(0).shape

torch.Size([10])

In [211]:
torch.stack([torch.mean(torch.ones(1,10,20), dim=1).squeeze(0), torch.mean(torch.ones(1,10,20), dim=1).squeeze(0)]).shape

torch.Size([2, 20])

## Transformer

In [8]:
torch.arange(0, 10, dtype=torch.float).unsqueeze(1).shape

torch.Size([10, 1])

In [14]:
d_model = 13
torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)).shape

torch.Size([7])

In [179]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class Positional_Encoding_Layer(nn.Module):
	"""
	Positional encoding using sine and cosine as described in "Attention is all you need".
	"""
	def __init__(self, d_model, max_seq_len=128, dropout=0.1):
		"""
		Formula:
		| PE(pos,2i) = sin(pos/10000**(2*i/d_model))
		| PE(pos,2i+1) = cos(pos/10000**(2*i/d_model))
		"""
		super(Positional_Encoding_Layer, self).__init__()
		self.d_model = d_model
		self.dropout = dropout
		self.max_seq_len = max_seq_len

		self.dropout_layer = nn.Dropout(p=dropout)
		pe = torch.zeros(max_seq_len, d_model)                                                       # (max_seq_len, d_model)
		position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)                      # (max_seq_len, 1)
		div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))     # (d_model/2)
		pe[:, 0::2] = torch.sin(position * div_term)
		pe[:, 1::2] = torch.cos(position * div_term) 
		pe = pe.unsqueeze(0).transpose(0, 1)                                                         # (max_seq_len, 1, d_model)
		self.register_buffer('pe', pe)

	def forward(self, inp):
		inp = inp + self.pe[:inp.size(0), :]                                                         # (n_step, batch_size, d_model)
		return self.dropout_layer(inp)

class Transformer_Encoder_Extraction_Layer(nn.Module):
	"""
	Transformer encoder as described in "Attention is all you need".
	"""
	def __init__(self, n_enc_layer, embed_size, n_head, intermediate_size, max_seq_len=512, dropout=0.1, **kwargs):
		super(Transformer_Encoder_Extraction_Layer, self).__init__(**kwargs)
		assert embed_size%n_head==0

		self.n_enc_layer = n_enc_layer
		self.embed_size = embed_size
		self.max_seq_len = max_seq_len
		self.n_head = n_head
		self.intermediate_size = intermediate_size
		self.dropout = dropout

		self.positional_encoder = Positional_Encoding_Layer(embed_size, max_seq_len=max_seq_len)
		transformer_encoder_layer = TransformerEncoderLayer(embed_size, n_head, dim_feedforward=intermediate_size, dropout=dropout)
		self.transformer_encoder = TransformerEncoder(transformer_encoder_layer, n_enc_layer)

		self._init_weights()

	def _init_weights(self):
		for p in self.parameters():
			if p.dim() > 1:
				xavier_uniform_(p)

	def forward(self, inp, inp_padding_mask=None):
		inp = inp * np.sqrt(self.embed_size)                                           # (batch_size, n_step, embed_size)
		inp = self.positional_encoder(inp.permute(1, 0, 2))                            # (n_step, batch_size, embed_size)
		out = self.transformer_encoder(inp, src_key_padding_mask=inp_padding_mask)     # (n_step, batch_size, embed_size)
		return out.permute(1, 0, 2)

In [180]:
def get_torch_module_num_of_parameter(model):
	"""
	Get # of parameters in a torch module.
	"""
	model_parameters = filter(lambda p: p.requires_grad, model.parameters())
	params = sum([np.prod(p.size()) for p in model_parameters])
	return params

In [183]:
model = Transformer_Encoder_Extraction_Layer(6, 256, 8, 2048)
get_torch_module_num_of_parameter(model)

7890432

In [184]:
p = np.random.standard_normal((100,6,256))
inp_1 = torch.from_numpy(np.concatenate([p, np.zeros((100, 4, 256))], axis=1)).float()
inp_2 = torch.from_numpy(np.concatenate([p, np.ones((100, 4, 256))], axis=1)).float()
pad_mask = torch.from_numpy(np.concatenate([np.zeros((100, 6)), np.ones((100,4))], axis=1)).bool()

In [185]:
def get_padding_mask(batch_size, seq_len, inp_last_idx):
    padding_mask = np.ones((batch_size, seq_len))
    for index, last_idx in enumerate(inp_last_idx):
        padding_mask[index,:last_idx+1] = 0
    return torch.from_numpy(padding_mask).bool()

In [186]:
get_padding_mask(3,10,[5,6,7])

tensor([[False, False, False, False, False, False,  True,  True,  True,  True],
        [False, False, False, False, False, False, False,  True,  True,  True],
        [False, False, False, False, False, False, False, False,  True,  True]])

In [187]:
a

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [188]:
model.eval()
out_1 = model(inp_1, inp_padding_mask=pad_mask)
out_2 = model(inp_2, inp_padding_mask=pad_mask)

## BERT

In [150]:
from transformers import BertModel, BertConfig

In [151]:
configuration = BertConfig(vocab_size=1, hidden_size=4, num_hidden_layers=8, num_attention_heads=2, intermediate_size=2048)
model = BertModel(configuration)

In [152]:
inp = torch.from_numpy(np.concatenate([np.random.standard_normal((1, 5, 4)), np.zeros((1, 5, 4))], axis=1)).float()
mask = torch.from_numpy(np.concatenate([np.zeros((1, 5)), np.zeros((1, 5))], axis=1)).float()

In [153]:
inp

tensor([[[-0.4829,  0.3867,  1.2085,  1.0879],
         [-0.4869, -2.1700,  0.7577, -0.3682],
         [-1.4087, -0.4176, -0.6754,  0.0763],
         [-0.1583, -0.2029,  0.6745, -0.6823],
         [ 0.4552,  0.5747, -0.3314, -1.9143],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000]]])

In [154]:
out= model(inputs_embeds=inp, attention_mask=mask)

In [155]:
inp.shape

torch.Size([1, 10, 4])

In [156]:
out[0]

tensor([[[-1.5516, -0.2037,  0.9508,  0.8045],
         [ 0.1553, -1.5379,  1.2636,  0.1190],
         [-0.6950,  0.0802, -0.9813,  1.5961],
         [-0.0377, -0.2082,  1.5238, -1.2779],
         [ 0.7606,  0.8213,  0.0738, -1.6557],
         [ 0.2063,  0.3378,  1.0872, -1.6313],
         [ 0.4727,  0.4431,  0.7993, -1.7151],
         [ 0.6379,  0.8671,  0.1718, -1.6768],
         [ 0.5959,  0.3670,  0.7528, -1.7157],
         [ 1.2472,  0.6829, -0.7249, -1.2052]]],
       grad_fn=<NativeLayerNormBackward>)

## Experiment with torch padding

In [62]:
from transformers import AlbertModel, AlbertConfig

In [63]:
configuration = AlbertConfig()
model = AlbertModel(configuration)

In [64]:
get_torch_module_num_of_parameter(model)

222595584

In [41]:
inp = torch.from_numpy(np.concatenate([np.ones((10, 5, 128)), np.zeros((10, 5, 128))], axis=1)).float()

In [44]:
out= model(inputs_embeds=inp)

In [45]:
out[0].shape

torch.Size([10, 10, 4096])

In [48]:
out[0][1].detach().numpy()

array([[-1.0415586 , -0.700448  , -0.23963146, ..., -0.34111258,
        -0.88970816,  2.0453863 ],
       [-1.0419453 , -0.7001726 , -0.24012905, ..., -0.34073353,
        -0.8900057 ,  2.0456576 ],
       [-1.0417546 , -0.7000878 , -0.24050954, ..., -0.34136513,
        -0.8902673 ,  2.0448096 ],
       ...,
       [-1.0415502 , -0.70019853, -0.23981182, ..., -0.34107703,
        -0.8901159 ,  2.0457187 ],
       [-1.0413502 , -0.6995464 , -0.240383  , ..., -0.34042382,
        -0.8905616 ,  2.0456984 ],
       [-1.0413843 , -0.70032   , -0.23944165, ..., -0.34181798,
        -0.89004594,  2.0447192 ]], dtype=float32)