In [1]:
import argparse
import pprint

import torch, gc
import torch.nn as nn
from torch import optim

from data_loader import DataLoader
import data_loader
import trainer
import tester
from models.transformer import Transformer
import model_util as mu

from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import timeit

using data from torchtext.legacy


In [2]:
def get_model(input_size, output_size,
    hidden_size=32,
    n_splits=8,
    n_layers=4,
    dropout=0.2,
    use_transformer=True):
    """Build the Transformer translation model.

    Args:
        input_size: Source vocabulary size.
        output_size: Target vocabulary size.
        hidden_size: Hidden size handed to the Transformer (it takes no
            separate word-vector size).
        n_splits: Number of heads in multi-head attention.
        n_layers: Number of blocks, used for both the encoder and the decoder.
        dropout: Dropout rate applied on each block.
        use_transformer: Accepted for backward compatibility only. Both
            values have always built the same Transformer, because no other
            architecture is wired in here.

    Returns:
        The constructed Transformer instance.
    """
    # NOTE(review): the previous if/else on `use_transformer` contained two
    # identical constructions; they are merged here. Add a real alternative
    # (e.g. an RNN seq2seq) before giving the flag meaning again.
    return Transformer(
        input_size,                 # Source vocabulary size
        hidden_size,                # Transformer doesn't need word_vec_size
        output_size,                # Target vocabulary size
        n_splits=n_splits,          # Number of heads in multi-head attention
        n_enc_blocks=n_layers,      # Number of encoder blocks
        n_dec_blocks=n_layers,      # Number of decoder blocks
        dropout_p=dropout,          # Dropout rate on each block
    )


def get_crit(output_size, pad_index):
    """Create the summed NLL criterion that gives PAD targets zero weight.

    Args:
        output_size: Number of target classes; length of the class-weight vector.
        pad_index: Class index of the PAD token, whose weight is set to 0.

    Returns:
        An nn.NLLLoss configured with the per-class weights and reduction='sum'.
    """
    # Every class contributes with weight 1 except PAD, which must not add
    # anything to the loss.
    class_weights = torch.ones(output_size)
    class_weights[pad_index] = 0.0

    # NLLLoss works on log-probabilities, so it is used here in place of
    # cross-entropy.
    print('\n Loss function: Negative Log-Likelihood with log-probability (NLLLoss)')
    return nn.NLLLoss(weight=class_weights, reduction='sum')


def get_optimizer(model,
    use_adam=True,
    use_transformer=True,
    lr=0.0001,):
    """Create the optimizer for `model`.

    Args:
        model: Module whose parameters() are optimized.
        use_adam: When True (default) use Adam; when False fall back to
            plain SGD.
        use_transformer: Only relevant with Adam. True selects
            betas=(.9, .98); False keeps Adam's default betas (the RNN-based
            seq2seq case).
        lr: Learning rate.

    Returns:
        A torch.optim optimizer bound to model.parameters().
    """
    if not use_adam:
        # This branch used to build Adam as well (and announce it), which
        # turned `use_adam=False` into a silent no-op. Honour the flag.
        print('Optimizer: SGD')
        return optim.SGD(model.parameters(), lr=lr)

    if use_transformer:
        return optim.Adam(model.parameters(), lr=lr, betas=(.9, .98))

    # case of rnn based seq2seq
    return optim.Adam(model.parameters(), lr=lr)

In [3]:
# --- Run identity: combined into the saved-model title and the TensorBoard path ---
research_subject = 'local_medium1'
research_num = '01'

# --- Corpus files and filtering (all handed to DataLoader) ---
lang = ('en', 'ko')                         # passed as `exts`
train_fn = 'corpus.shuf.train.tok.bpe'
valid_fn = 'corpus.shuf.valid.tok.bpe'
test_fn = 'corpus.shuf.test.tok.bpe'
max_length = 20                             # passed as `max_length`

# --- Model shape (forwarded to get_model) ---
hidden_size = 128
n_layers = 4                                # used for both encoder and decoder blocks
n_splits = 8                                # heads in multi-head attention
dropout = 0.0

# --- Optimisation ---
batch_size = 64
lr = 0.0003
n_epochs = 30

In [4]:
# Build the project DataLoader over the train/valid/test corpora. The cells
# below read its iterators (train_iter / valid_iter / test_iter) and its
# source/target vocabularies (src.vocab / tgt.vocab).
loader = DataLoader(
        train_fn=train_fn,
        valid_fn=valid_fn,
        test_fn=test_fn,
        exts=lang,
        batch_size=batch_size,
        device=-1,                                      # Lazy loading
        max_length=max_length,                          # Longer sequences will be excluded.
        dsl=False,                                      # Turn off Dual-supervised Learning mode.
    )

In [5]:
# The vocabulary sizes fix the model's input and output dimensions.
input_size = len(loader.src.vocab)
output_size = len(loader.tgt.vocab)

print('\ninput_size: ', input_size)
print('output_size: ', output_size)


input_size:  69459
output_size:  154233


In [6]:
# Instantiate the model from the vocabulary sizes and the config-cell
# hyper-parameters; get_model returns a Transformer.
model = get_model(input_size, output_size,
    hidden_size=hidden_size,
    n_splits=n_splits,
    n_layers=n_layers,
    dropout=dropout,
    use_transformer=True)

In [7]:
# Summed NLL loss over the target vocabulary; the PAD index from the
# data_loader module gets zero weight inside get_crit.
crit = get_crit(output_size, data_loader.PAD)


 Loss function: Negative Log-Likelihood with log-probability (NLLLoss)


In [8]:
# if model_weight is not None:
    # model.load_state_dict(model_weight)

# Use CUDA device 0 when a GPU is available; -1 selects the CPU path below.
# The message is derived from device_num so the two can never disagree.
device_num = 0 if torch.cuda.is_available() else -1
print('\nUsing device number:', device_num)

# Clear memory cache: collect Python garbage first, then release unoccupied
# cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# Pass model to GPU device if it is necessary. The criterion is moved too,
# since it carries the class-weight tensor.
if device_num >= 0:
    model.cuda(device_num)
    crit.cuda(device_num)


Using device number: 0


In [9]:
# get_optimizer defaults apply (use_adam=True, use_transformer=True), so this
# is Adam with betas=(.9, .98) at the configured learning rate.
optimizer = get_optimizer(model, lr=lr)

In [10]:
# if opt_weight is not None and config.use_adam:
    # optimizer.load_state_dict(opt_weight)

# No learning-rate scheduler is used; None is forwarded to trainer.train and
# tester.test. The spelling "schedular" matches the keyword those calls use.
lr_schedular = None

In [11]:
# Run title = "<research_subject>_<research_num>"; it is passed to
# trainer.train and to the model_util save/load helpers.
subject_title = research_subject
title = subject_title + '_' + research_num

# NOTE(review): timestamp is computed but not used by any visible cell, and
# the log directory below depends only on subject_title, so every run of the
# same subject writes into the same TensorBoard directory — confirm whether
# the timestamp was meant to be part of the path.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('../tensorboard/'+subject_title+'/tests')

In [12]:
# Wall-clock timer around the whole training run.
start_time = timeit.default_timer()

# Train with the train/valid iterators and vocabularies from `loader`;
# progress is logged through the TensorBoard `writer` under `title`.
trainer.train(
    model,
    crit,
    optimizer,
    train_loader=loader.train_iter,
    valid_loader=loader.valid_iter,
    src_vocab=loader.src.vocab,
    tgt_vocab=loader.tgt.vocab,
    n_epochs=n_epochs,
    lr_schedular=lr_schedular,
    writer=writer,
    title=title,
)

# NOTE: despite its name, end_time is the elapsed training time in minutes
# (timer difference in seconds divided by 60), not a point in time.
end_time = (timeit.default_timer() - start_time) / 60.0

Start training...
 Epoch  |  Train Loss  | Train Acc  | Val Loss | Val Acc | Elapsed
--------------------------------------------------------------------------------
   1    |   6.644376   | 39.431375  | 4.332067 | 35.08  | 53.05 
   2    |   3.876657   | 61.164154  | 3.201270 | 40.47  | 52.07 
   3    |   3.048784   | 68.533197  | 2.686681 | 42.97  | 52.75 
   4    |   2.629587   | 72.256897  | 2.451242 | 43.91  | 52.82 
   5    |   2.396824   | 74.402395  | 2.387501 | 45.16  | 54.95 
   6    |   2.282867   | 75.562616  | 2.362514 | 45.47  | 53.80 
   7    |   2.241073   | 76.227784  | 2.311289 | 45.39  | 52.68 
   8    |   2.228202   | 76.647878  | 2.289201 | 45.39  | 53.93 
   9    |   2.231472   | 76.853735  | 2.296090 | 45.55  | 52.55 
  10    |   2.242944   | 76.913748  | 2.308893 | 45.55  | 52.86 
  11    |   2.259382   | 76.888497  | 2.327871 | 45.62  | 52.28 
  12    |   2.277931   | 76.818067  | 2.345445 | 45.47  | 52.65 
  13    |   2.297559   | 76.727367  | 2.366982 | 45.47

In [13]:
# end_time is the elapsed training duration in minutes computed in the
# training cell.
print('training time taken: ', end_time)

training time taken:  26.480890219683335


In [14]:
# Hand the trained model to the model_util helper, identified by subject and
# run title (presumably writes a checkpoint — see model_util for the location).
mu.saveModel(subject_title, title, model)
# mu.graphModel(train_dataloader, model, writer, device)

In [15]:
# Rebind `model` to whatever model_util returns for the same subject/title
# passed to saveModel — presumably the checkpoint just saved; confirm in
# model_util (including which device the returned model lives on).
model = mu.getModel(subject_title, title)

In [16]:
# Evaluate on the test iterator with the same criterion and vocabularies used
# for training; tester.test returns the (loss, accuracy) pair unpacked here.
test_loss, test_acc = tester.test(
    model,
    crit,
    test_loader=loader.test_iter,
    src_vocab=loader.src.vocab,
    tgt_vocab=loader.tgt.vocab,
    lr_schedular=lr_schedular,
)


Using device number: 0


In [17]:
# Report the test metrics returned by tester.test.
print('test loss: ', test_loss)
print('test_acc: ', test_acc)

test loss:  2.876065254211426
test_acc:  47.61904761904762
