In [1]:
import argparse
import pprint

import torch, gc
import torch.nn as nn
from torch import optim

from data_loader import DataLoader
import data_loader
import trainer

from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import timeit

from models.transformer import Transformer
import model_util as mu

using data from torchtext.legacy


In [2]:
def get_model(input_size, output_size,
    hidden_size=32,
    n_splits=8,
    n_layers=4,
    dropout=0.0,
    use_transformer=True):
	"""Build the Transformer model used by this notebook.

	Args:
		input_size: Source vocabulary size.
		output_size: Target vocabulary size.
		hidden_size: Forwarded as the second positional argument of
			Transformer (the Transformer doesn't need word_vec_size).
		n_splits: Number of heads in multi-head attention.
		n_layers: Used for both the number of encoder blocks and the
			number of decoder blocks.
		dropout: Dropout rate on each block.
		use_transformer: Accepted for backward compatibility only. Both
			branches of the previous implementation built exactly the same
			Transformer, so this flag has no effect on the result.

	Returns:
		The constructed Transformer instance.
	"""
	# The former if/else on use_transformer held two identical copies of this
	# call; a single construction keeps the behaviour and removes the risk of
	# the copies drifting apart.
	return Transformer(
		input_size,				# Source vocabulary size
		hidden_size,			# Transformer doesn't need word_vec_size
		output_size,			# Target vocabulary size
		n_splits=n_splits,		# Number of heads in multi-head attention
		n_enc_blocks=n_layers,	# Number of encoder blocks
		n_dec_blocks=n_layers,	# Number of decoder blocks
		dropout_p=dropout,		# Dropout rate on each block
	)


def get_crit(output_size, pad_index):
	"""Create the summed NLL criterion that ignores the PAD class.

	Args:
		output_size: Number of target classes (length of the weight vector).
		pad_index: Index of the PAD class, whose loss weight is set to zero.

	Returns:
		An nn.NLLLoss with per-class weights and reduction='sum'. It expects
		log-probabilities as input, which is why NLL is used rather than
		cross-entropy.
	"""
	# Every class weighs 1 by default; PAD is zeroed so it adds no loss.
	class_weights = torch.ones(output_size)
	class_weights[pad_index] = 0.

	print('\n Loss function: Negative Log-Likelihood with log-probability (NLLLoss)')
	return nn.NLLLoss(weight=class_weights, reduction='sum')


def get_optimizer(model,
    use_adam=True,
    use_transformer=True,
    lr=0.0001,):
	"""Create the Adam optimizer for `model`.

	Args:
		model: Object whose `parameters()` are handed to the optimizer.
		use_adam: When False, 'Optimizer: Adam' is printed and Adam with
			betas=(.9, .98) is returned regardless of `use_transformer`.
		use_transformer: With use_adam=True, selects betas=(.9, .98);
			otherwise (rnn based seq2seq) Adam's default betas are used.
		lr: Learning rate.

	Returns:
		A torch.optim.Adam instance.
	"""
	# NOTE(review): use_adam=False still yields Adam (and is the only path
	# that prints). Looks like a leftover from a non-Adam branch; confirm
	# the intent before relying on this flag.
	if not use_adam:
		print('Optimizer: Adam')
		return optim.Adam(model.parameters(), lr=lr, betas=(.9, .98))

	if not use_transformer:
		# Case of rnn based seq2seq: keep Adam's default betas.
		return optim.Adam(model.parameters(), lr=lr)

	return optim.Adam(model.parameters(), lr=lr, betas=(.9, .98))

In [3]:
# Build the project DataLoader for the en -> ko corpus.
# The first two arguments are presumably the train / valid corpus path
# prefixes (language suffix appended by the loader) - TODO confirm.
loader = DataLoader(
        'corpus.shuf.train.tok.bpe',
        'corpus.shuf.valid.tok.bpe',
        ('en', 'ko'),                           # Source and target language.
        batch_size=32,
        device=-1,                              # Lazy loading
        max_length=25,                          # Longer sequences will be excluded.
        dsl=False,                              # Turn off Dual-supervised Learning mode.
    )

In [4]:
# Vocabulary sizes drive the model's input/output dimensions.
input_size = len(loader.src.vocab)
output_size = len(loader.tgt.vocab)

print('\ninput_size: ', input_size)
print('output_size: ', output_size)


input_size:  24088
output_size:  43711


In [5]:
# Instantiate the model from the vocabulary sizes computed above and print
# its module tree.
# NOTE(review): the saved output below reports 64-wide layers, which looks
# inconsistent with hidden_size=128 here - presumably the output is stale;
# restart the kernel and re-run to confirm.
model = get_model(input_size, output_size,
    hidden_size=128,
    n_splits=8,
    n_layers=4,
    dropout=0.0,
    use_transformer=True)
print('\n', model)


 Transformer(
  (emb_enc): Embedding(24088, 64)
  (emb_dec): Embedding(43711, 64)
  (emb_dropout): Dropout(p=0.0, inplace=False)
  (encoder): MySequential(
    (0): EncoderBlock(
      (attn): MultiHead(
        (Q_linear): Linear(in_features=64, out_features=64, bias=False)
        (K_linear): Linear(in_features=64, out_features=64, bias=False)
        (V_linear): Linear(in_features=64, out_features=64, bias=False)
        (linear): Linear(in_features=64, out_features=64, bias=False)
        (attn): Attention(
          (softmax): Softmax(dim=-1)
        )
      )
      (attn_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (attn_dropout): Dropout(p=0.0, inplace=False)
      (fc): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): ReLU()
        (2): Linear(in_features=256, out_features=64, bias=True)
      )
      (fc_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (fc_dropout): Dropout(p=0.0, inplace=False)
 

In [6]:
# Loss criterion over the target vocabulary; the PAD index gets zero weight.
crit = get_crit(output_size, data_loader.PAD)


 Loss function: Negative Log-Likelihood with log-probability (NLLLoss)


In [7]:
# Restoring previously saved weights is currently disabled:
# if model_weight is not None:
    # model.load_state_dict(model_weight)

# Use GPU 0 when CUDA is available; -1 means "stay on the CPU".
device_num = 0 if torch.cuda.is_available() else -1
print('\nUsing device number: 0' if device_num == 0 else '\nUsing device number: -1')

# Release Python garbage and cached CUDA memory before moving the model.
gc.collect()
torch.cuda.empty_cache()

# Move the model and the criterion to the chosen GPU, if there is one.
if device_num >= 0:
    model.cuda(device_num)
    crit.cuda(device_num)


Using device number: 0


In [8]:
# Adam optimizer with an explicit learning rate (overrides the 0.0001 default).
optimizer = get_optimizer(model, lr=0.003)

In [9]:
# Restoring a saved optimizer state is currently disabled:
# if opt_weight is not None and config.use_adam:
    # optimizer.load_state_dict(opt_weight)

# No learning-rate scheduler is configured; None is passed to trainer.train.
# The spelling matches the `lr_schedular` keyword used in that call.
lr_schedular = None

In [10]:
# Name of this experiment group; used for the TensorBoard directory and as
# the prefix of the run title.
overall_title = 'local1'

# NOTE(review): timestamp is computed but not used in any cell visible here.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# TensorBoard events are written under ./tensorboard/<overall_title>/tests
writer = SummaryWriter('./tensorboard/'+overall_title+'/tests')

# Title of this particular run, passed to trainer.train.
title = overall_title + '_02'

In [11]:
# Wall-clock timer around the whole training run.
start_time = timeit.default_timer()

# Run training for 20 epochs with the objects built in the cells above.
trainer.train(
    model,
    crit,
    optimizer,
    train_loader=loader.train_iter,
    valid_loader=loader.valid_iter,
    src_vocab=loader.src.vocab,
    tgt_vocab=loader.tgt.vocab,
    n_epochs=20,
    lr_schedular=lr_schedular,
    writer=writer,
    title=title,
)

# Despite its name, end_time holds the elapsed duration in minutes,
# not a point in time.
end_time = (timeit.default_timer() - start_time) / 60.0

Start training...
 Epoch  |  Train Loss  | Train Acc  | Val Loss | Val Acc | Elapsed
--------------------------------------------------------------------------------
   1    |   6.044504   | 39.004591  | 3.652802 | 46.25  | 13.09 
   2    |   3.637714   | 59.758172  | 3.117372 | 52.50  | 12.95 
   3    |   3.190355   | 62.954866  | 3.247136 | 52.08  | 13.26 
   4    |   3.097017   | 63.014154  | 3.522850 | 50.42  | 13.13 
   5    |   3.020030   | 62.703817  | 3.542941 | 51.25  | 13.04 
   6    |   2.980572   | 62.695145  | 3.468065 | 55.00  | 13.03 
   7    |   2.926913   | 62.994757  | 3.554330 | 53.33  | 12.73 
   8    |   2.826470   | 63.440163  | 3.932808 | 53.33  | 13.06 
   9    |   2.742568   | 63.735462  | 4.061507 | 52.50  | 13.04 
  10    |   2.635356   | 64.642559  | 3.624754 | 54.17  | 12.94 
  11    |   2.552841   | 65.033947  | 3.171433 | 52.50  | 12.93 
  12    |   2.475214   | 65.624768  | 3.644714 | 51.67  | 12.97 
  13    |   2.391760   | 66.151868  | 3.488067 | 55.42

In [12]:
# Display the elapsed training time in minutes (set by the training cell).
end_time

4.329987438216661

In [13]:
# mu.saveModel(overall_title, title, model)
# mu.graphModel(train_dataloader, model, writer, device)

In [14]:
# model = mu.getModel(overall_title, title)
# print(model)