In [2]:
!pip install numpy==1.17.4 -qq
!pip install nltk==3.4.5 -qq
!pip install torchtext==0.4.0 -qq
!pip install scikit_learn==0.23.2 -qq
!pip install spacy==2.3.5 -qq
!pip install textblob==0.15.3 -qq
!pip install torch==1.6.0 -qq
!pip install torchvision==0.7.0 -qq
!pip install tqdm -qq
!pip install underthesea -qq

# Import package and prepare dataset

In [3]:
import nltk
nltk.download('wordnet')

import os
import math
import random
import argparse
from pathlib import Path
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import BucketIterator

from torchtext.data import Field, Example, Dataset
import re
from sklearn.model_selection import train_test_split
import pandas as pd
import random
import numpy as np
import torch.nn.functional as F

from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
import time
from torchtext.data import BucketIterator
import gc

path = '/kaggle/working/' #colab: /content/ or bla bla...
import sys
sys.argv=['']
del sys

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def parse_args():
    """Add arguments to parser"""
    parser = argparse.ArgumentParser(description='Verbalization dataset baseline models.')
    parser.add_argument('--model', default=RNN_NAME, type=str,
                        choices=[RNN_NAME], help='model to train the dataset')
    parser.add_argument('--input', default=QUESTION, type=str,
                        choices=[QUESTION], help='use question as input')
    parser.add_argument('--attention', default=ATTENTION_1, type=str,
                        choices=[ATTENTION_1], help='attention layer for rnn model')
    args = parser.parse_args()
    return args

def set_SEED():
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True

In [5]:
"""Constants for the baseline models"""
SEED = 42
QUESTION = 'question'

RNN_NAME = 'rnn'
#CNN_NAME = 'cnn'
#TRANSFORMER_NAME = 'transformer'

ATTENTION_1 = 'bahdanau'
#ATTENTION_2 = 'luong'

GPU = 'gpu'
CPU = 'cpu'
CUDA = 'cuda'

CHECKPOINT_PATH = '/model/'

ANSWER_TOKEN = '<ans>'
ENTITY_TOKEN = '<ent>'
EOS_TOKEN = '<eos>'
SOS_TOKEN = '<sos>'
PAD_TOKEN = '<pad>'

SRC_NAME = 'src'
TRG_NAME = 'trg'

In [6]:
class Checkpoint(object):
    """Checkpoint class"""
    @staticmethod
    def save(model, path):
        """Save model using name"""
        name = f'{model.name}.pt'
        torch.save(model.state_dict(), path+name)

    @staticmethod
    def load(model,path, name):
        """Load model using name"""
        #name = f'{model.name}.pt'
        model.load_state_dict(torch.load(path+name))
        return model

In [7]:
from underthesea import word_tokenize

class VerbalDataset(object):
    """VerbalDataset class"""
#     TOKENIZE_SEQ = lambda self, x: x.replace("?", " ?").\
#                                      replace(".", " .").\
#                                      replace(",", " ,").\
#                                      replace("'", " '").\
#                                      split()
                                         
    def __init__(self,train,test):
        self.train = train
        self.test = test
        self.train_data = None
        self.valid_data = None
        self.test_data = None
        self.src_field = None
        self.trg_field = None

    def _extract_question_answer2(self, train, test):
        return [[data['question'], [data['verbalized_answer'], data['verbalized_answer_2'],data['verbalized_answer_3'],data['verbalized_answer_4'],data['verbalized_answer_5'],data['verbalized_answer_6'],data['verbalized_answer_7'],data['verbalized_answer_8']]] for data in train], \
                [[data['question'], [data['verbalized_answer'], data['verbalized_answer_2'],data['verbalized_answer_3'],data['verbalized_answer_4'],data['verbalized_answer_5'],data['verbalized_answer_6'],data['verbalized_answer_7'],data['verbalized_answer_8']]] for data in test]

    def _extract_question_answer(self, train, test):
        return [[data['Question'], data['Answer']] for data in train], \
                [[data['Question'], data['Answer']] for data in test]


    def _make_torchtext_dataset(self, data, fields):
        examples = [Example.fromlist(i, fields) for i in data]
        return Dataset(examples, fields)

    def load_data_and_fields(self, ):
        """
        Load verbalization data
        Create source and target fields
        """
        train, test, val = [], [], []
        
        train = self.train
        test = self.test

        # split test data to val-test
        test, val = train_test_split(test, test_size=0.5, shuffle=False)

        # create fields
        self.src_field = Field(tokenize=word_tokenize,
                               init_token=SOS_TOKEN,
                               eos_token=EOS_TOKEN,
                               lower=True,
                               include_lengths=True,
                               batch_first=True)
        
        self.trg_field = Field(tokenize=word_tokenize,
                               init_token=SOS_TOKEN,
                               eos_token=EOS_TOKEN,
                               lower=True,
                               batch_first=True)

        fields_tuple = [(SRC_NAME, self.src_field), (TRG_NAME, self.trg_field)]

        # create toechtext datasets
        self.train_data = self._make_torchtext_dataset(train, fields_tuple)
        self.valid_data = self._make_torchtext_dataset(val, fields_tuple)
        self.test_data = self._make_torchtext_dataset(test, fields_tuple)

        # build vocabularies
        self.src_field.build_vocab(self.train_data, min_freq=2)
        self.trg_field.build_vocab(self.train_data, min_freq=2)
        print("i am field tuple",fields_tuple)

    def get_data(self):
        """Return train, validation and test data objects"""
        return self.train_data, self.valid_data, self.test_data

    def get_fields(self):
        """Return source and target field objects"""
        return self.src_field, self.trg_field

    def get_vocabs(self):
        """Return source and target vocabularies"""
        #print('self, trg field vocab: ', self.trg_field.vocab)
        return self.src_field.vocab, self.trg_field.vocab

In [8]:
set_SEED()
args = parse_args()

df = pd.read_csv('../input/covidqa/UIT-ViCOVID-QA-single.csv')
df = df[['Question','Answer']]
train, test = train_test_split(df.values,train_size=0.814,random_state=42)

dataset = VerbalDataset(train,test)

dataset.load_data_and_fields()
src_vocab, trg_vocab = dataset.get_vocabs()
train_data, valid_data, test_data = dataset.get_data()

i am field tuple [('src', <torchtext.data.field.Field object at 0x7ff6bf213250>), ('trg', <torchtext.data.field.Field object at 0x7ff6bc6e2510>)]


In [9]:
print('--------------------------------')
print(f"Training data: {len(train_data.examples)}")
print(f"Evaluation data: {len(valid_data.examples)}")
print(f"Testing data: {len(test_data.examples)}")
print('--------------------------------')
print(f'Question example: {train_data.examples[2].src}\n')
print(f'Answer example: {train_data.examples[2].trg}')
print('--------------------------------')
print(f"Unique tokens in questions vocabulary: {len(src_vocab)}")
print(f"Unique tokens in answers vocabulary: {len(trg_vocab)}")
print('--------------------------------')

--------------------------------
Training data: 3500
Evaluation data: 400
Testing data: 400
--------------------------------
Question example: ['tôi', 'đã', 'từng', 'bị', 'bệnh', 'basedow', '.', 'hôm', '15/6/2021', ',', 'tôi', 'có', 'đi', 'khám', 'tại', 'bệnh viện', ',', 'các', 'kết quả', 'xét nghiệm', 'và', 'kết luận', 'của', 'bác sĩ', 'là', 'tôi', 'bị', 'bình', 'giáp', 'và', 'chỉ', 'còn', 'rối loạn', 'mỡ', 'máu', '.', 'hiện', 'bác sĩ', 'cho', 'tôi', 'uống', 'thuốc', '.', 'tôi', 'có thể', 'tiêm', 'vaccine', 'covid-19', 'được', 'không', '?', 'xin', 'bác sĩ', 'tư vấn', 'giúp', 'tôi', '.']

Answer example: ['chào', 'anh', '/', 'chị', ',', 'trong', 'trường hợp', 'của', 'anh', '/', 'chị', 'nếu', 'trình', 'trạng', 'bệnh', 'basedow', 'đã', 'ổn', '(', 'xét nghiệm', 'kết quả', 'bình', 'giáp', ')', ',', 'chỉ', 'còn', 'rối loạn', 'mỡ', 'máu', ',', 'nếu', 'sức khỏe', 'của', 'anh', '/', 'chị', 'khi', 'khám', 'sàng lọc', 'cho', 'các', 'chỉ số', 'về', 'huyết áp', ',', 'mạch', ',', 'nhịp', 'tim', '..

# Sequence-to-Sequence Model

### Layers

In [10]:
def RNN(cell_name):
    if cell_name.lower() == 'lstm':
        return LSTM
    elif cell_name.lower() == 'gru':
        return GRU
    else:
        raise ValueError(f"Unsupported RNN Cell: {cell_name}")

def Embedding(num_embeddings, embedding_dim, padding_idx):
    """Embedding layer"""
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.uniform_(m.weight, -0.1, 0.1)
    nn.init.constant_(m.weight[padding_idx], 0)
    return m

def Linear(in_features, out_features, bias=True):
    """Linear layer"""
    m = nn.Linear(in_features, out_features, bias=bias)
    m.weight.data.uniform_(-0.1, 0.1)
    if bias:
        m.bias.data.uniform_(-0.1, 0.1)
    return m

def LSTM(input_size, hidden_size, **kwargs):
    """LSTM layer"""
    m = nn.LSTM(input_size, hidden_size, **kwargs)
    for name, param in m.named_parameters():
        if 'weight' in name or 'bias' in name:
            param.data.uniform_(-0.1, 0.1)
    return m

def GRU(input_size, hidden_size, **kwargs):
    """GRU layer"""
    m = nn.GRU(input_size, hidden_size, **kwargs)
    for name, param in m.named_parameters():
        if 'weight' in name or 'bias' in name:
            param.data.uniform_(-0.1, 0.1)
    return m

### Encoder

In [24]:
class Encoder(nn.Module):
    """Encoder"""
    def __init__(self, vocabulary, device, embed_dim=256, hidden_size=512,
                 num_layers=2, dropout=0.5, bidirectional=True, cell_name='gru'):
        super().__init__()
        input_dim = len(vocabulary)
        self.vocabulary = vocabulary
        self.pad_id = vocabulary.stoi[PAD_TOKEN]
        self.embed_dim= embed_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = dropout
        self.bidirectional = bidirectional
        self.cell_name = cell_name
        self.device = device

        self.embed_tokens = Embedding(input_dim, self.embed_dim, self.pad_id)

        self.rnn_cell = RNN(cell_name)
        self.rnn = self.rnn_cell(
            input_size=self.embed_dim,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            dropout=self.dropout if self.num_layers > 1 else 0.,
            bidirectional=self.bidirectional
        )
        self.linear_out = nn.Linear(hidden_size * 2, hidden_size)
       
    def forward(self, src_tokens, **kwargs):
        """
        Forward Encoder
        Args:
            src_tokens (LongTensor): (batch, src_len)
            src_lengths (LongTensor): (batch)
        Returns:
            x (LongTensor): (src_len, batch, hidden_size * num_directions)
            hidden (LongTensor): (batch, enc_hid_dim)
        """
        src_lengths = kwargs.get('src_lengths', '')
        src_tokens = src_tokens.t()

        x = self.embed_tokens(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training) # (src_len, batch, embed_dim)

        packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths)

        packed_outputs, hidden = self.rnn(packed_x) # hidden: (n_layers * num_directions, batch, hidden_size)

        x, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)
        x = F.dropout(x, p=self.dropout, training=self.training)

        # input hidden for decoder is the final encoder hidden state
        # since rnn is bidirectional get last forward and backward hidden state
        last_forward = hidden[-2, :, :]
        last_backward = hidden[-1, :, :]
        hidden = torch.cat((last_forward, last_backward), dim=1)

        hidden = torch.tanh(self.linear_out(hidden)) # (batch, enc_hid_dim)

        return x, hidden

### Attention

In [17]:
class Attention(nn.Module):
    """Attention"""
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.linear = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))

    def forward(self, hidden, encoder_outputs, mask):
        """
        Forward Attention Layer
        Args:
            hidden (LongTensor): (batch, dec_hid_dim)
            encoder_outputs (LongTensor): (src_len, batch, enc_hid_dim * 2)
            mask (LongTensor): (batch, src_len)
        Returns:
            attention (LongTensor): (batch, src_len)
        """

        batch = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]

        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1) # (batch, src_len, dec_hid_dim)

        encoder_outputs = encoder_outputs.permute(1, 0, 2) # (batch, src_len, enc_hid_dim * 2)

        energy = torch.tanh(self.linear(torch.cat((hidden, encoder_outputs), dim=2))) # (batch, src_len, dec_hid_dim)
        energy = energy.permute(0, 2, 1) # (batch, dec_hid_dim, src_len)

        v = self.v.repeat(batch, 1).unsqueeze(1) # (batch, 1, dec_hid_dim)

        attention = torch.bmm(v, energy).squeeze(1)

        attention = attention.masked_fill(mask == 0, float('-inf'))

        return F.softmax(attention, dim=1)

### Decoder

In [18]:
class Decoder(nn.Module):
    """Decoder"""
    def __init__(self, vocabulary, device, embed_dim=256, hidden_size=512,
                 num_layers=2, dropout=0.5, max_positions=50, cell_name='gru'):
        super().__init__()
        self.vocabulary = vocabulary
        self.pad_id = vocabulary.stoi[PAD_TOKEN]
        self.sos_idx = vocabulary.stoi[SOS_TOKEN]
        self.eos_idx = vocabulary.stoi[EOS_TOKEN]
        
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.need_attn = True
        self.output_dim = len(vocabulary)

        self.dropout = dropout
        self.max_positions = max_positions
        self.device = device
        self.cell_name = cell_name
        
        # suppose encoder and decoder have same hidden size
        self.attention = Attention(self.hidden_size, self.hidden_size)
        self.embed_tokens = Embedding(self.output_dim, self.embed_dim, self.pad_id)

        self.rnn_cell = RNN(cell_name)
        self.rnn = self.rnn_cell(
            input_size=(hidden_size * 2) + embed_dim,
            hidden_size=hidden_size,
        )

        self.linear_out = Linear(
            in_features=(hidden_size * 2) + hidden_size + embed_dim,
            out_features=self.output_dim
        )
        
    def _decoder_step(self, input, hidden, encoder_outputs, mask):
        input = input.unsqueeze(0) # (1, batch)

        x = self.embed_tokens(input) # (1, batch, emb_dim)
        x = F.dropout(x, p=self.dropout, training=self.training)

        attn = self.attention(hidden, encoder_outputs, mask) # (batch, src_len)
        attn = F.dropout(attn, p=self.dropout, training=self.training)

        attn = attn.unsqueeze(1) # (batch, 1, src_len)

        encoder_outputs = encoder_outputs.permute(1, 0, 2) # (batch, src_len, 2 * enc_hid_dim)

        weighted = torch.bmm(attn, encoder_outputs) # (batch, 1, 2 * enc_hid_dim)

        weighted = weighted.permute(1, 0, 2) # (1, batch, 2 * enc_hid_dim)

        rnn_input = torch.cat((x, weighted), dim=2) # (1, batch, 2 * enc_hid_dim + embed_dim)

        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        # output: (1, batch, dec_hid_dim)
        # hidden: (1, batch, dec_hid_dim)

        x = x.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)

        x = torch.cat((output, weighted, x), dim=1)
        output = self.linear_out(x) # (batch, output_dim)

        return output, hidden.squeeze(0), attn.squeeze(1)

    def forward(self, trg_tokens, encoder_out, **kwargs):
        """
        Forward Decoder
        Args:
            trg_tokens (LongTensor): (trg_len, batch)
            Tuple (encoder_out):
                encoder_out (LongTensor): (src_len, batch, 2 * hidden_size)
                hidden (LongTensor): (batch, enc_hid_dim)
            src_tokens (LongTensor): (src_len, batch)
        Returns:
            outputs (LongTensor): (max_len, batch, output_dim)
            attentions (LongTensor): (max_len, batch, src_len)
        """
        encoder_out, hidden = encoder_out
        src_tokens = kwargs.get('src_tokens', '')
        teacher_ratio = kwargs.get('teacher_forcing_ratio', '')
        src_tokens = src_tokens.t()
        batch = src_tokens.shape[1]

        if trg_tokens is None:
            teacher_ratio = 0.
            inference = True
            trg_tokens = torch.zeros((self.max_positions, batch)).long().\
                                                                  fill_(self.sos_idx).\
                                                                  to(self.device)
        else:
            trg_tokens = trg_tokens.t()
            inference = False

        max_len = trg_tokens.shape[0]

        # initialize tensors to store the outputs and attentions
        outputs = torch.zeros(max_len, batch, self.output_dim).to(self.device)
        attentions = torch.zeros(max_len, batch, src_tokens.shape[0]).to(self.device)

        # prepare decoder input(<sos> token)
        input = trg_tokens[0, :]

        mask = (src_tokens != self.pad_id).permute(1, 0) # (batch, src_len)

        for i in range(1, max_len):

            # forward through decoder using inout, encoder hidden, encoder outputs and mask
            # get predictions, hidden state and attentions
            output, hidden, attention = self._decoder_step(input, hidden, encoder_out, mask)

            # save predictions for position i
            outputs[i] = output

            # save attention for position i
            attentions[i] = attention

            # if teacher forcing
            #   use actual next token as input for next position
            # else
            #   use highest predicted token
            input = trg_tokens[i] if random.random() < teacher_ratio else output.argmax(1)

            # if inference is enabled and highest predicted token is <eos> then stop
            # and return everything till position i
            if inference and input.item() == self.eos_idx:
                return outputs[:i] # , attentions[:i]

        return outputs # , attentions

### Seq2Seq

In [19]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, name):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.name = name

    def forward(self, src_tokens, src_lengths, trg_tokens, teacher_forcing_ratio=0.5):
        encoder_out = self.encoder(src_tokens, 
                                   src_lengths=src_lengths)
        
        decoder_out = self.decoder(trg_tokens, encoder_out,
                                   src_tokens=src_tokens,
                                   teacher_forcing_ratio=teacher_forcing_ratio)
        return decoder_out

In [25]:
DEVICE = torch.device(CUDA if torch.cuda.is_available() else CPU)
encoder = Encoder(src_vocab, DEVICE)
decoder = Decoder(trg_vocab, DEVICE)
model = Seq2Seq(encoder, decoder, args.model).to(DEVICE)

parameters_num = sum(p.numel() for p in model.parameters() if p.requires_grad)

print('--------------------------------')
print(f'Model: {args.model}')
print(f'Model input: {args.input}')
#if args.model == RNN_NAME:
print(f'Attention: {args.attention}')
print(f'The model has {parameters_num:,} trainable parameters')
print('--------------------------------')

--------------------------------
Model: rnn
Model input: question
Attention: bahdanau
The model has 22,321,161 trainable parameters
--------------------------------


# Model Training

### Evaluator

In [21]:
class Evaluator(object):
    """Evaluator class"""
    def __init__(self, criterion):
        self.criterion = criterion

    def evaluate(self, model, iterator, teacher_ratio=1.0):
        model.eval()
        epoch_loss = 0
        with torch.no_grad():
            for _, batch in enumerate(iterator):
                src, src_len = batch.src
                trg = batch.trg
                input_trg = trg if model.name == RNN_NAME else trg[:, :-1]
                output = model(src, src_len, input_trg, teacher_ratio)
                trg = trg.t() if model.name == RNN_NAME else trg[:, 1:]
                output = output.contiguous().view(-1, output.shape[-1])
                trg = trg.contiguous().view(-1)
                # output: (batch_size * trg_len) x output_dim
                # trg: (batch_size * trg_len)
                loss = self.criterion(output, trg)
                epoch_loss += loss.item()
        return epoch_loss / len(iterator)

### Trainer

In [22]:
class Trainer(object):
    """Trainer Class"""
    def __init__(self, optimizer, criterion, batch_size, device):
        self.optimizer = optimizer
        self.criterion = criterion
        self.batch_size = batch_size
        self.device = device
        self.evaluator = Evaluator(criterion=self.criterion)

    def _train_batch(self, model, iterator, teacher_ratio, clip):
        model.train()
        epoch_loss = 0
        for _, batch in enumerate(tqdm_notebook(iterator)):
            src, src_len = batch.src
            trg = batch.trg
            self.optimizer.zero_grad()
            input_trg = trg if model.name == RNN_NAME else trg[:, :-1]
            output = model(src, src_len, input_trg, teacher_ratio)
            trg = trg.t() if model.name == RNN_NAME else trg[:, 1:]
            output = output.contiguous().view(-1, output.shape[-1])
            trg = trg.contiguous().view(-1)
            # output: (batch_size * trg_len) x output_dim
            # trg: (batch_size * trg_len)
            torch.cuda.empty_cache()
            loss = self.criterion(output, trg)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            self.optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / len(iterator)

    def _get_iterators(self, train_data, valid_data, model_name):
        return BucketIterator.splits((train_data, valid_data),
                                     batch_size=self.batch_size,
                                     sort_within_batch=True if model_name == RNN_NAME else \
                                                       False,
                                     sort_key=lambda x: len(x.src),
                                     device=self.device)

    def _epoch_time(self, start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def _log_epoch(self, train_loss, valid_loss, epoch, start_time, end_time):
        minutes, seconds = self._epoch_time(start_time, end_time)
        print(f'Epoch: {epoch+1:02} | Time: {minutes}m {seconds}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {np.exp(valid_loss):7.3f}')

    def _train_epoches(self, model, train_data, valid_data,  path_, num_of_epochs, teacher_ratio, clip):
        best_valid_loss = float('inf')
        # pylint: disable=unbalanced-tuple-unpacking
        train_iterator, valid_iterator = self._get_iterators(train_data, valid_data, model.name)
        for epoch in range(num_of_epochs):
            start_time = time.time()
            train_loss = self._train_batch(model, train_iterator, teacher_ratio, clip)
            valid_loss = self.evaluator.evaluate(model, valid_iterator, teacher_ratio)
            end_time = time.time()
            self._log_epoch(train_loss, valid_loss, epoch, start_time, end_time)
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                Checkpoint.save(model,path_)

    def train(self, model, train_data, valid_data, path_, num_of_epochs=20, teacher_ratio=1.0, clip=1):
        """Train model"""
        self._train_epoches(model, train_data, valid_data, path_, num_of_epochs, teacher_ratio, clip)

### Train model

In [26]:
# create optimizer
optimizer = optim.Adam(model.parameters(),lr=0.001)
# define criterion
criterion = nn.CrossEntropyLoss(ignore_index=trg_vocab.stoi[PAD_TOKEN])
# batch_size
batch_size = 12

trainer = Trainer(optimizer, criterion, batch_size, DEVICE)
trainer.train(model, train_data, valid_data, path, num_of_epochs=30)

  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 01 | Time: 5m 27s
	Train Loss: 5.158 | Train PPL: 173.737
	 Val. Loss: 4.056 |  Val. PPL:  57.763


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 02 | Time: 5m 29s
	Train Loss: 3.825 | Train PPL:  45.822
	 Val. Loss: 3.546 |  Val. PPL:  34.674


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 03 | Time: 5m 25s
	Train Loss: 3.371 | Train PPL:  29.118
	 Val. Loss: 3.320 |  Val. PPL:  27.652


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 04 | Time: 5m 25s
	Train Loss: 3.085 | Train PPL:  21.874
	 Val. Loss: 3.185 |  Val. PPL:  24.179


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 05 | Time: 5m 19s
	Train Loss: 2.860 | Train PPL:  17.470
	 Val. Loss: 3.123 |  Val. PPL:  22.711


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 06 | Time: 5m 19s
	Train Loss: 2.686 | Train PPL:  14.677
	 Val. Loss: 3.079 |  Val. PPL:  21.736


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 07 | Time: 5m 19s
	Train Loss: 2.532 | Train PPL:  12.576
	 Val. Loss: 3.050 |  Val. PPL:  21.114


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 08 | Time: 5m 23s
	Train Loss: 2.421 | Train PPL:  11.252
	 Val. Loss: 3.045 |  Val. PPL:  21.011


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 09 | Time: 5m 22s
	Train Loss: 2.300 | Train PPL:   9.971
	 Val. Loss: 3.036 |  Val. PPL:  20.826


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 10 | Time: 5m 24s
	Train Loss: 2.208 | Train PPL:   9.096
	 Val. Loss: 3.029 |  Val. PPL:  20.676


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 11 | Time: 5m 18s
	Train Loss: 2.114 | Train PPL:   8.280
	 Val. Loss: 3.037 |  Val. PPL:  20.842


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 12 | Time: 5m 22s
	Train Loss: 2.041 | Train PPL:   7.695
	 Val. Loss: 3.067 |  Val. PPL:  21.469


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 13 | Time: 5m 18s
	Train Loss: 1.970 | Train PPL:   7.170
	 Val. Loss: 3.074 |  Val. PPL:  21.627


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 14 | Time: 5m 23s
	Train Loss: 1.907 | Train PPL:   6.730
	 Val. Loss: 3.088 |  Val. PPL:  21.936


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 15 | Time: 5m 17s
	Train Loss: 1.838 | Train PPL:   6.284
	 Val. Loss: 3.107 |  Val. PPL:  22.358


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 16 | Time: 5m 21s
	Train Loss: 1.784 | Train PPL:   5.951
	 Val. Loss: 3.120 |  Val. PPL:  22.640


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 17 | Time: 5m 19s
	Train Loss: 1.728 | Train PPL:   5.629
	 Val. Loss: 3.161 |  Val. PPL:  23.588


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 18 | Time: 5m 20s
	Train Loss: 1.683 | Train PPL:   5.384
	 Val. Loss: 3.167 |  Val. PPL:  23.737


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 19 | Time: 5m 20s
	Train Loss: 1.637 | Train PPL:   5.142
	 Val. Loss: 3.205 |  Val. PPL:  24.654


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 20 | Time: 5m 17s
	Train Loss: 1.589 | Train PPL:   4.901
	 Val. Loss: 3.217 |  Val. PPL:  24.956


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 21 | Time: 5m 19s
	Train Loss: 1.549 | Train PPL:   4.706
	 Val. Loss: 3.251 |  Val. PPL:  25.805


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 22 | Time: 5m 19s
	Train Loss: 1.508 | Train PPL:   4.517
	 Val. Loss: 3.265 |  Val. PPL:  26.171


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 23 | Time: 5m 22s
	Train Loss: 1.476 | Train PPL:   4.376
	 Val. Loss: 3.302 |  Val. PPL:  27.178


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 24 | Time: 5m 17s
	Train Loss: 1.443 | Train PPL:   4.234
	 Val. Loss: 3.317 |  Val. PPL:  27.574


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 25 | Time: 5m 17s
	Train Loss: 1.406 | Train PPL:   4.078
	 Val. Loss: 3.345 |  Val. PPL:  28.371


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 26 | Time: 5m 21s
	Train Loss: 1.374 | Train PPL:   3.951
	 Val. Loss: 3.373 |  Val. PPL:  29.179


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 27 | Time: 5m 19s
	Train Loss: 1.343 | Train PPL:   3.831
	 Val. Loss: 3.393 |  Val. PPL:  29.758


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 28 | Time: 5m 21s
	Train Loss: 1.320 | Train PPL:   3.743
	 Val. Loss: 3.424 |  Val. PPL:  30.706


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 29 | Time: 5m 19s
	Train Loss: 1.294 | Train PPL:   3.646
	 Val. Loss: 3.432 |  Val. PPL:  30.944


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch: 30 | Time: 5m 20s
	Train Loss: 1.276 | Train PPL:   3.582
	 Val. Loss: 3.473 |  Val. PPL:  32.229


# Predict and Evaluation

### BleuScorer (gồm BLEU và METEOR)

In [27]:
import nltk

class BleuScorer(object):
    """Blue scorer class"""
    def __init__(self):
        self.results = []
        self.results_meteor = []
        self.score = 0
        self.meteor_score = 0
        self.instances = 0
        self.meteor_instances = 0

    def example_score(self, reference, hypothesis):
        """Calculate blue score for one example"""
        return nltk.translate.bleu_score.sentence_bleu([reference], hypothesis,weights=(1,0,0,0)) #unigram
    
    def example_score_meteor(self, reference, hypothesis):
        """Calculate blue score for one example"""
        return nltk.translate.meteor_score.single_meteor_score(reference,hypothesis)

    def data_score(self, data, predictor, path):
        """Score complete list of data"""
        results_prelim = []
        for example in tqdm_notebook(data):
            #i = 1
            src = [t.lower() for t in example.src]
            reference = [t.lower() for t in example.trg]
            # loop through example.src and calculate all hypothesis(max. 8) 
            #and calculate blue score average of all hypothesis
            hypothesis = predictor.predict(example.src)
            blue_score = self.example_score(reference, hypothesis)
            meteor_score = self.example_score_meteor(' '.join(reference), ' '.join(hypothesis))
            #print('Blue Score: ',blue_score)
            results_prelim.append({
                'question': '"' + str(src) + '"',
                'reference': reference,
                'hypothesis': hypothesis,
                'blue_score': blue_score,
                'meteor_score': meteor_score
            })
        #print('List length before aggregation',len(results_prelim))

        results = [max((v for v in results_prelim if v['question'] == x), key=lambda y:y['blue_score']) for x in set(v['question'] for v in results_prelim)] 

        with open(path+'result_output.txt', 'w') as f:
            for elem in results:
                f.write("%s\n" % elem)
                self.results.append(elem)
                self.score += elem['blue_score']
                self.meteor_score += elem['meteor_score']
                self.instances += 1
        return self.score / self.instances, self.meteor_score / self.instances

    def average_score(self):
        """Return bleu average score"""
        return self.score / self.instances
    
    def data_meteor_score(self, data, predictor, path):
        """Score complete list of data"""
        results_prelim = []
        for example in data:
            src = [t.lower() for t in example.src]
            reference = [t.lower() for t in example.trg]
            hypothesis = predictor.predict(example.src)
            meteor_score = self.example_score_meteor(' '.join(reference), ' '.join(hypothesis))
            results_prelim.append({
                'question': '"' + str(src) + '"',
                'reference': reference,
                'hypothesis': hypothesis,
                'meteor_score': meteor_score
            })
        results_meteor = [max((v for v in results_prelim if v['question'] == x), key=lambda y:y['meteor_score']) for x in set(v['question'] for v in results_prelim)] 

        with open(path+'result_meteor_output.txt', 'w') as f:
            for elem in results_meteor:
                f.write("%s\n" % elem)
                self.results_meteor.append(elem)
                self.meteor_score += elem['meteor_score']
                self.meteor_instances += 1
        return self.meteor_score/self.meteor_instances
    
    def average_meteor_score(self):
        """Return meteor average score"""
        return self.meteor_score/self.instances

    def reset(self):
        """Reset object properties"""
        self.results = []
        self.results_meteor = []
        self.score = 0
        self.meteor_score = 0
        self.instances = 0
        self.meteor_instances = 0

### Predictor

In [28]:
class Predictor(object):
    """Predictor class"""
    def __init__(self, model, src_vocab, trg_vocab, device):
        self.model = model
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.device = device

    def _predict_step(self, tokens):
        self.model.eval()
        tokenized_sentence = [SOS_TOKEN] + [t.lower() for t in tokens] + [EOS_TOKEN]
        numericalized = [self.src_vocab.stoi[token] for token in tokenized_sentence]
        src_tensor = torch.LongTensor(numericalized).unsqueeze(0).to(self.device)

        with torch.no_grad():
            encoder_out = self.model.encoder(src_tensor)

        outputs = [self.trg_vocab.stoi[SOS_TOKEN]]

        # cnn positional embedding gives assertion error for tensor
        # of size > max_positions-1, we predict tokens for max_positions-2
        # to avoid the error
        for _ in range(self.model.decoder.max_positions-2):
            trg_tensor = torch.LongTensor(outputs).unsqueeze(0).to(self.device)

            with torch.no_grad():
                output = self.model.decoder(trg_tensor, encoder_out, src_tokens=src_tensor)

            prediction = output.argmax(2)[:, -1].item()

            if prediction == self.trg_vocab.stoi[EOS_TOKEN]:
                break

            outputs.append(prediction)

        translation = [self.trg_vocab.itos[i] for i in outputs]

        return translation[1:] # , attention

    def _predict_rnn_step(self, tokens):
        self.model.eval()
        with torch.no_grad():
            tokenized_sentence = [SOS_TOKEN] + [t.lower() for t in tokens] + [EOS_TOKEN]
            numericalized = [self.src_vocab.stoi[t] for t in tokenized_sentence]

            src_len = torch.LongTensor([len(numericalized)]).to(self.device)
            tensor = torch.LongTensor(numericalized).unsqueeze(1).to(self.device)

            translation_tensor_logits = self.model(tensor.t(), src_len, None)

            translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
            translation = [self.trg_vocab.itos[t] for t in translation_tensor]

        return translation[1:] # , attention

    def predict(self, tokens):
        """Perform prediction on given tokens"""
        return self._predict_rnn_step(tokens) if self.model.name == RNN_NAME else \
                self._predict_step(tokens)

### Predict and Evaluate

In [29]:
model = Checkpoint.load(model,path,'./rnn.pt') # chọn path và tên model cho phù hợp

valid_iterator, test_iterator = BucketIterator.splits(
                                    (valid_data, test_data),
                                    batch_size=12,
                                    sort_within_batch=True, #if args.model == RNN_NAME else False,
                                    sort_key=lambda x: len(x.src),
                                    device=DEVICE)

# evaluate model
valid_loss = trainer.evaluator.evaluate(model, valid_iterator)
test_loss = trainer.evaluator.evaluate(model, test_iterator)

# calculate blue score for valid and test data
predictor = Predictor(model, src_vocab, trg_vocab, DEVICE)

valid_scorer = BleuScorer()
test_scorer = BleuScorer()
train_scorer = BleuScorer()
#bleu score
train_scorer.data_score(train_data.examples, predictor,path)
valid_scorer.data_score(valid_data.examples, predictor,path)
test_scorer.data_score(test_data.examples, predictor,path)

  0%|          | 0/3500 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

(0.1458248373974774, 0.16883767768741276)

In [30]:
print(f'| Train Data Average BLEU score {train_scorer.average_score()*100} |')
print(f'| Train Data Average METEOR score {train_scorer.average_meteor_score()*100} |\n')
print(f'| Val. Loss: {valid_loss:.3f} | Test PPL: {math.exp(valid_loss):7.3f} |')
print(f'| Val. Data Average BLEU score {valid_scorer.average_score()*100} |')
print(f'| Val. Data Average METEOR score {valid_scorer.average_meteor_score()*100} |\n')
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')
print(f'| Test Data Average BLEU score {test_scorer.average_score()*100} |')
print(f'| Test Data Average METEOR score {test_scorer.average_meteor_score()*100} |')

| Train Data Average BLEU score 17.013512578806537 |
| Train Data Average METEOR score 19.98303362328471 |

| Val. Loss: 3.020 | Test PPL:  20.484 |
| Val. Data Average BLEU score 14.551902694694713 |
| Val. Data Average METEOR score 17.282456672258338 |

| Test Loss: 3.051 | Test PPL:  21.146 |
| Test Data Average BLEU score 14.58248373974774 |
| Test Data Average METEOR score 16.883767768741276 |


In [31]:
i = 5
ref = [t.lower() for t in test_data.examples[i].trg]
source = [t.lower() for t in test_data.examples[i].src]
hyp = predictor.predict(source)

print(" ".join(test_data.examples[i].src),'\n')
print(" ".join(ref),'\n')
print(" ".join(hyp),'\n')
print('BLEU unigram:',nltk.translate.bleu_score.sentence_bleu([ref],hyp,weights=(1,0,0,0))) #unigram
print('METEOR:',nltk.translate.meteor_score.single_meteor_score(" ".join(ref)," ".join(hyp)))


vào ngày 3/5 tôi có tiêm mũi đầu vaccine astrazeneca và mũi 2 vào ngày 16/6 đến ngày 23/6 tôi phát hiện mình có thai 5 tuần . tôi không biết có ảnh hưởng tới thai không ? 

chào chị , khuyến cáo của tổ chức y tế thế giới ( who ) vẫn tiêm vaccine covid-19 cho phụ nữ mang thai và đang cho con bú và chưa có bằng chứng nào được ghi nhận về ảnh hưởng của vaccine lên sức khỏe của thai nhi . tuy nhiên , hướng dẫn của bộ y tế nên hoãn tiêm cho các đối tượng phụ nữ mang thai và cho con bú . do đó , việc chị tiêm vaccine covid-19 xong mới phát hiện mình có thai cũng không nên lo lắng lắm , chị cần đi khám thai định kỳ để theo dõi sức khỏe thai nhi . cảm ơn câu hỏi của chị . trân trọng ! 

chào chị , theo khuyến cáo của bộ y tế , những người có cơ địa dị ứng , tức là những người mắc các bệnh dị ứng như hen phế quản , tiểu đường , tiểu đường , ... nên hoãn tiêm mũi vaccine covid-19 . chị nên đến gặp bác sĩ để được bác sĩ thăm khám và 

BLEU unigram: 0.18390089060677464
METEOR: 0.170093652790512


In [32]:
for ex in test_data.examples[:50]:
    src_tmp = [t.lower() for t in ex.src]

    reference_tmp = [t.lower() for t in ex.trg]

    hypothesis_tmp = predictor.predict(ex.src)
    print('Question:'," ".join(src_tmp),'\n')
    print('Reference answer:'," ".join(reference_tmp),'\n')
    print('Hypothesis answer:'," ".join(hypothesis_tmp))
    print('_______________________________________________________________________________________________________________________________________________________________________\n')

Question: 17 ngày sau khi tiêm liều một vaccine astrazeneca tôi bị đau buốt từ mông đến đùi trước và đùi sau , đặc biệt đau khi ngồi lâu hoặc đang từ nằm chuyển sang đi . tôi uống thuốc thì thấy giảm đau . bị đau có phải do phản ứng của vaccine ? một ngày sau tiêm tôi chỉ bị hơi ớn lạnh + hơi đau lưng . cám ơn bác sĩ . 

Reference answer: chào bạn , giống như với tất cả các loại vaccine khác , vaccine covid-19 có thể gây ra các phản ứng phụ , mặc dù không phải ai cũng gặp phải . trong các nghiên cứu lâm sàng , hầu hết các tác dụng phụ không mong muốn của vaccine covid-19 đều ở mức độ từ nhẹ đến trung bình và đều được giải quyết trong vòng vài ngày . trường hợp bạn đang gặp phải là triệu chứng đau buốt vùng đùi , cẳng chân xuất hiện muộn sau tiêm chủng , mặc dù ít gặp nhưng có thể là một trong số những phản ứng không mong muốn sau tiêm . bạn cần theo dõi triệu chứng này trong vòng 2-3 ngày , nếu tình trạng không cải thiện hoặc có xu hướng đau tăng lên , phù ... thì bạn nên đến bệnh viện