# Nucleus e Top K Decoding para tradução


Neste colab iremos utilizar duas técnicas de decoding (top k e nucleus) para a tarefa de tradução.


In [None]:
# Install the third-party packages this notebook imports.
# NOTE(review): `transformers` and `sacrebleu` are not version-pinned, while the code below
# uses `lm_labels=` and `pad_to_max_length=` — presumably tied to an older transformers
# release; TODO confirm the version and pin it.
! pip install transformers
! pip install --quiet pytorch-lightning==0.7.5
! pip install sacrebleu --quiet



In [None]:
# Importar todos os pacotes de uma só vez para evitar duplicados ao longo do notebook.
import gzip
import nvidia_smi
import os
import random
import torch
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

import sacrebleu
from google.colab import drive

from transformers import T5ForConditionalGeneration
from transformers import T5Tokenizer
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from typing import Dict
from typing import List
from typing import Tuple

In [None]:
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Important: seed Python's RNG and PyTorch's CPU and CUDA RNGs with one shared
# value so that results can be replicated.
seed = 123
for _seed_rng in (random.seed, torch.random.manual_seed, torch.cuda.manual_seed):
    _seed_rng(seed)

DICA para modelos reais: Um modelo otimizado deve manter o uso de GPU próximo a 100% durante o treino.
Vamos utilizar a biblioteca abaixo para monitorar isso. Note que, no modelo simples utilizado aqui, o uso não vai chegar a 100%.

In [None]:
# Initialise NVML and keep a handle to the GPU at index 0 for later queries.
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
print(f"Device name: {nvidia_smi.nvmlDeviceGetName(handle)}")


def gpu_usage():
    """Return the monitored GPU's current utilisation as a string ending in '%'."""
    utilization = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
    return f"{utilization.gpu!s}%"

Device name: b'Tesla T4'


In [None]:
# General settings for the small decoding demo.
# Sequence lengths (in tokens) used when encoding the source and when decoding.
target_max_length = 7
source_max_length = 11
# Pretrained checkpoint and batch size.
batch_size = 2
model_name = "t5-small"

In [None]:
# Load the pretrained tokenizer and conditional-generation model selected by `model_name`.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model from cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f
INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json from cache at /root/.cache/torch/transformers/26561bc9e840d8945f475d0d4c4b9df32025eadd79894b867b570cb1d09e67a9.3817cc1260a6b941b17af62b4f2a942b9825f209d8e2eed99e79e96f85f59aab
INFO:transformers.configuration_utils:Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_heads": 8,
  "nu

In [None]:
# Important: every source sentence must end with the end-of-sequence token.
source = [
    'translate English to German: I like pizza </s>',
    'translate English to German: I dont care </s>',
]

# Tokenise both sentences in one call, padded to `source_max_length`, as PyTorch tensors.
source_encoded = tokenizer.batch_encode_plus(
    source,
    add_special_tokens=True,
    max_length=source_max_length,
    pad_to_max_length=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_tensors='pt',
)
source_token_ids, source_mask = (
    source_encoded[key] for key in ('input_ids', 'attention_mask')
)
source_token_ids

tensor([[13959,  1566,    12,  2968,    10,    27,   114,  6871,     1,     0,
             0],
        [13959,  1566,    12,  2968,    10,    27,  2483,   124,     1,     0,
             0]])

# Implementação greedy decoding do model.generate()

In [None]:
# Each sequence starts with the decoder start token: one column per batch row.
decoded_ids = torch.full(
    (source_token_ids.shape[0], 1),
    model.config.decoder_start_token_id,
    dtype=torch.long,
).to(source_token_ids.device)

# The encoder runs once; its output is reused at every decoding step.
encoder_hidden_states = model.get_encoder()(source_token_ids,
                                            attention_mask=source_mask)
print('encoder_hidden_states[0].shape', encoder_hidden_states[0].shape)
print('decoded_ids.shape', decoded_ids.shape)

for step in range(target_max_length):
    logits, _, _ = model(decoder_input_ids=decoded_ids,
                         encoder_outputs=encoder_hidden_states,
                         attention_mask=source_mask)

    # Greedy step: take the highest-scoring token at the last position and append it.
    next_token_logits = logits[:, -1, :]
    next_token_id = next_token_logits.argmax(dim=1, keepdim=True)
    decoded_ids = torch.cat((decoded_ids, next_token_id), dim=-1)
    print('-' * 50)
    print('step', step)
    print('logits.shape', logits.shape)
    print('decoded_ids.shape', decoded_ids.shape)

# Show the raw tokens and the detokenised text for both sentences.
for sample_ids in (decoded_ids[0], decoded_ids[1]):
    print('decoded tokens: ', tokenizer.convert_ids_to_tokens(sample_ids))
    print('final: ', tokenizer.decode(sample_ids))

encoder_hidden_states[0].shape torch.Size([2, 11, 512])
decoded_ids.shape torch.Size([2, 1])
--------------------------------------------------
step 0
logits.shape torch.Size([2, 1, 32128])
decoded_ids.shape torch.Size([2, 2])
--------------------------------------------------
step 1
logits.shape torch.Size([2, 2, 32128])
decoded_ids.shape torch.Size([2, 3])
--------------------------------------------------
step 2
logits.shape torch.Size([2, 3, 32128])
decoded_ids.shape torch.Size([2, 4])
--------------------------------------------------
step 3
logits.shape torch.Size([2, 4, 32128])
decoded_ids.shape torch.Size([2, 5])
--------------------------------------------------
step 4
logits.shape torch.Size([2, 5, 32128])
decoded_ids.shape torch.Size([2, 6])
--------------------------------------------------
step 5
logits.shape torch.Size([2, 6, 32128])
decoded_ids.shape torch.Size([2, 7])
--------------------------------------------------
step 6
logits.shape torch.Size([2, 7, 32128])
decode

# Validando com a implementação do HuggingFace-transformers

In [None]:
# Reference result: the library's own greedy decoding (sampling disabled).
outputs = model.generate(
    input_ids=source_token_ids,
    attention_mask=source_mask,
    max_length=target_max_length,
    do_sample=False,
)
for generated_ids in (outputs[0], outputs[1]):
    print('decoded tokens: ', tokenizer.convert_ids_to_tokens(generated_ids))
    print('final: ', tokenizer.decode(generated_ids))

decoded tokens:  ['<pad>', '▁Ich', '▁mag', '▁Pizza', '</s>', '<pad>', '<pad>']
final:  Ich mag Pizza
decoded tokens:  ['<pad>', '▁Ich', '▁', 'm', 'uß', '▁mich', '▁nicht']
final:  Ich muß mich nicht


# Criando funções de decodificação

In [None]:
def nucleus_sampling(logits,p = torch.tensor(0.95)):
    """Sample one token id per row using nucleus (top-p) sampling.

    The nucleus of a row is the shortest prefix of its probability-sorted
    vocabulary whose cumulative probability reaches ``p``; the token that crosses
    the threshold is included, so the most likely token is always a candidate.

    Args:
        logits: Float tensor of unnormalised scores with the vocabulary on the
            last dimension; used as ``(batch, vocab)`` in this notebook.
        p: Cumulative-probability threshold (Python float or 0-dim tensor).

    Returns:
        Long tensor of shape ``(batch, 1)`` holding the sampled vocabulary ids.
    """
    # Probabilities over the vocabulary, sorted from most to least likely.
    probs = F.softmax(logits, dim=-1)
    probs_sorted, indices = torch.sort(probs, dim=-1, descending=True)

    # Positions whose cumulative probability already exceeds the threshold.
    probs_cum_sum = torch.cumsum(probs_sorted, dim=-1)
    exceeds_p = probs_cum_sum > p

    # Shift the mask one position to the right so the token that crosses the
    # threshold stays in the nucleus and position 0 is never removed.
    outside_nucleus = torch.zeros_like(exceeds_p)
    outside_nucleus[..., 1:] = exceeds_p[..., :-1]

    # Zero the non-candidates (out of place) and renormalise the survivors.
    nucleus_probs = probs_sorted.masked_fill(outside_nucleus, 0.0)
    nucleus_probs = nucleus_probs / nucleus_probs.sum(dim=-1, keepdim=True)

    # Sample a position in sorted order, then map it back to a vocabulary id
    # using each row's own sort permutation.
    token_index = torch.multinomial(nucleus_probs, 1)
    predicted_tokens = torch.gather(indices, -1, token_index)

    return predicted_tokens

def topk_sampling(logits, k = 10):
    """Sample one token id per row among the ``k`` highest-scoring tokens.

    Args:
        logits: Float tensor of unnormalised scores with the vocabulary on the
            last dimension; used as ``(batch, vocab)`` in this notebook.
        k: Number of candidates kept per row. Values larger than the vocabulary
            size are clamped to it.

    Returns:
        Long tensor of shape ``(batch, 1)`` holding the sampled vocabulary ids.
    """
    # torch.topk raises when k exceeds the dimension size, so clamp it first.
    k = min(k, logits.size(-1))

    # Keep the k largest logits and remember which vocabulary ids they belong to.
    top_logits, tokens = torch.topk(logits, k, dim=-1)

    # Softmax over the survivors only: the candidates' probabilities sum to one.
    probs = F.softmax(top_logits, dim=-1)

    # Sample a position among the candidates, then translate it to a vocabulary id.
    token_index = torch.multinomial(probs, 1)
    predicted_tokens = torch.gather(tokens, -1, token_index)
    return predicted_tokens

# Teste de "Top-K Sampling"

In [None]:
# Each sequence starts with the decoder start token: one column per batch row.
decoded_ids = torch.full(
    (source_token_ids.shape[0], 1),
    model.config.decoder_start_token_id,
    dtype=torch.long,
).to(source_token_ids.device)

# The encoder runs once; its output is reused at every decoding step.
encoder_hidden_states = model.get_encoder()(source_token_ids,
                                            attention_mask=source_mask)
print('encoder_hidden_states[0].shape', encoder_hidden_states[0].shape)
print('decoded_ids.shape', decoded_ids.shape)

for step in range(target_max_length):
    logits, _, _ = model(decoder_input_ids=decoded_ids,
                         encoder_outputs=encoder_hidden_states,
                         attention_mask=source_mask)

    # Sample the next token from the top-k candidates at the last position.
    next_token_logits = logits[:, -1, :]
    next_token_id = topk_sampling(next_token_logits)
    decoded_ids = torch.cat((decoded_ids, next_token_id), dim=-1)

    print('-' * 50)
    print('step', step)
    print('logits.shape', logits.shape)
    print('decoded_ids.shape', decoded_ids.shape)

# Show the raw tokens and the detokenised text for both sentences.
for sample_ids in (decoded_ids[0], decoded_ids[1]):
    print('decoded tokens: ', tokenizer.convert_ids_to_tokens(sample_ids))
    print('final: ', tokenizer.decode(sample_ids))

encoder_hidden_states[0].shape torch.Size([2, 11, 512])
decoded_ids.shape torch.Size([2, 1])
torch.Size([2, 10])
torch.Size([2, 1])
torch.Size([2, 1])
--------------------------------------------------
step 0
logits.shape torch.Size([2, 1, 32128])
decoded_ids.shape torch.Size([2, 2])
torch.Size([2, 10])
torch.Size([2, 1])
torch.Size([2, 1])
--------------------------------------------------
step 1
logits.shape torch.Size([2, 2, 32128])
decoded_ids.shape torch.Size([2, 3])
torch.Size([2, 10])
torch.Size([2, 1])
torch.Size([2, 1])
--------------------------------------------------
step 2
logits.shape torch.Size([2, 3, 32128])
decoded_ids.shape torch.Size([2, 4])




torch.Size([2, 10])
torch.Size([2, 1])
torch.Size([2, 1])
--------------------------------------------------
step 3
logits.shape torch.Size([2, 4, 32128])
decoded_ids.shape torch.Size([2, 5])
torch.Size([2, 10])
torch.Size([2, 1])
torch.Size([2, 1])
--------------------------------------------------
step 4
logits.shape torch.Size([2, 5, 32128])
decoded_ids.shape torch.Size([2, 6])
torch.Size([2, 10])
torch.Size([2, 1])
torch.Size([2, 1])
--------------------------------------------------
step 5
logits.shape torch.Size([2, 6, 32128])
decoded_ids.shape torch.Size([2, 7])
torch.Size([2, 10])
torch.Size([2, 1])
torch.Size([2, 1])
--------------------------------------------------
step 6
logits.shape torch.Size([2, 7, 32128])
decoded_ids.shape torch.Size([2, 8])
decoded tokens:  ['<pad>', '▁Ich', '▁mag', '▁Pizza', '</s>', '</s>', '.', '</s>']
final:  Ich mag Pizza.
decoded tokens:  ['<pad>', '▁Ich', '▁', 'm', 'uß', '▁mich', '▁nicht', '▁um']
final:  Ich muß mich nicht um


# Validando com a implementação do HuggingFace-transformers

In [None]:
# Reference result: the library's own sampling restricted to the 10 best tokens.
outputs = model.generate(
    input_ids=source_token_ids,
    attention_mask=source_mask,
    max_length=target_max_length,
    do_sample=True,
    top_k=10,
)
for generated_ids in (outputs[0], outputs[1]):
    print('decoded tokens: ', tokenizer.convert_ids_to_tokens(generated_ids))
    print('final: ', tokenizer.decode(generated_ids))

decoded tokens:  ['<pad>', '▁Ich', '▁mag', '▁Pizza', '</s>', '<pad>', '<pad>']
final:  Ich mag Pizza
decoded tokens:  ['<pad>', '▁Ich', '▁', 't', 'u', 'e', '▁mich']
final:  Ich tue mich


# Teste de "Nucleus Sampling" 


In [None]:
# Each sequence starts with the decoder start token: one column per batch row.
decoded_ids = torch.full(
    (source_token_ids.shape[0], 1),
    model.config.decoder_start_token_id,
    dtype=torch.long,
).to(source_token_ids.device)

# The encoder runs once; its output is reused at every decoding step.
encoder_hidden_states = model.get_encoder()(source_token_ids,
                                            attention_mask=source_mask)
print('encoder_hidden_states[0].shape', encoder_hidden_states[0].shape)
print('decoded_ids.shape', decoded_ids.shape)

for step in range(target_max_length):
    logits, _, _ = model(decoder_input_ids=decoded_ids,
                         encoder_outputs=encoder_hidden_states,
                         attention_mask=source_mask)

    # Sample the next token from the nucleus at the last position.
    next_token_logits = logits[:, -1, :]
    next_token_id = nucleus_sampling(next_token_logits)
    decoded_ids = torch.cat((decoded_ids, next_token_id), dim=-1)

    print('-' * 50)
    print('step', step)
    print('logits.shape', logits.shape)
    print('decoded_ids.shape', decoded_ids.shape)

# Only the first sentence of the batch is displayed here.
first_sample_ids = decoded_ids[0]
print('decoded tokens: ', tokenizer.convert_ids_to_tokens(first_sample_ids))
print('final: ', tokenizer.decode(first_sample_ids))

encoder_hidden_states[0].shape torch.Size([2, 11, 512])
decoded_ids.shape torch.Size([2, 1])
tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True]])
tensor([[0.4553, 0.0759, 0.0307,  ..., 0.0000, 0.0000, 0.0000],
        [0.3804, 0.2212, 0.0561,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<DivBackward0>)
--------------------------------------------------
step 0
logits.shape torch.Size([2, 1, 32128])
decoded_ids.shape torch.Size([2, 2])
tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True]])
tensor([[0.9113, 0.0408, 0.0277,  ..., 0.0000, 0.0000, 0.0000],
        [0.3041, 0.2391, 0.0777,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<DivBackward0>)
--------------------------------------------------
step 1
logits.shape torch.Size([2, 2, 32128])
decoded_ids.shape torch.Size([2, 3])
tensor([[False,  True,  True,  ...,  True,  True,  True],
        [ True,  True,  Tru

  after removing the cwd from sys.path.


tensor([[False,  True,  True,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True]])
tensor([[0.9587, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1072, 0.1020, 0.0383,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<DivBackward0>)
--------------------------------------------------
step 3
logits.shape torch.Size([2, 4, 32128])
decoded_ids.shape torch.Size([2, 5])
tensor([[False,  True,  True,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True]])
tensor([[0.9858, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.7113, 0.0426, 0.0153,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<DivBackward0>)
--------------------------------------------------
step 4
logits.shape torch.Size([2, 5, 32128])
decoded_ids.shape torch.Size([2, 6])
tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True]])
tensor([[0.5548, 0.0448, 0.0425,  ..., 0.0000, 0.0000, 0.0000

# Integrando sampling para o modelo treinado

In [None]:
# General settings for fine-tuning.
# Optimisation: learning rate, per-step batch size and gradient accumulation.
learning_rate = 5e-3
batch_size = 8
accumulate_grad_batches = 8
# Maximum sequence lengths, in tokens.
target_max_length = 128
source_max_length = 128
# Pretrained checkpoint to start from.
model_name = "t5-small"

In [None]:
class T5Finetuner(pl.LightningModule):
    """Lightning module that fine-tunes T5 and decodes translations.

    In training mode ``forward`` returns the language-modelling loss. Otherwise
    it returns generated token ids, produced either by ``model.generate`` or by
    the hand-written loop in ``custom_decode``.

    The class relies on two notebook-level names: ``model_name`` (checkpoint to
    load) and ``gpu_usage`` (value shown in the progress bar).

    Args:
        tokenizer: Tokenizer used to turn predicted ids back into text.
        train_dataloader: Dataloader returned by ``train_dataloader()``.
        val_dataloader: Dataloader returned by ``val_dataloader()``.
        test_dataloader: Dataloader returned by ``test_dataloader()``.
        learning_rate: Learning rate handed to Adam.
        target_max_length: Maximum length used when decoding.
        custom_decoding: If True, decode with ``custom_decode`` instead of
            ``model.generate``.
        mode_decoding: One of ``'greedy'``, ``'topk'`` or ``'nucleus'``; any
            other value falls back to ``'greedy'`` with a printed notice.
    """

    # Decoding strategies understood by `forward` and `custom_decode`.
    _DECODING_MODES = ('greedy', 'topk', 'nucleus')

    def __init__(self, tokenizer, train_dataloader, val_dataloader,
                 test_dataloader, learning_rate, target_max_length=32,
                 custom_decoding=False, mode_decoding='greedy'):
        super(T5Finetuner, self).__init__()

        self._train_dataloader = train_dataloader
        self._val_dataloader = val_dataloader
        self._test_dataloader = test_dataloader

        # `model_name` is read from the notebook's global settings.
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

        self.tokenizer = tokenizer
        self.learning_rate = learning_rate
        self.target_max_length = target_max_length

        self.custom_decoding = custom_decoding
        if mode_decoding in self._DECODING_MODES:
            self.mode_decoding = mode_decoding
        else:
            print("Decoding mode not recognized. Using greedy decoding")
            self.mode_decoding = 'greedy'

    def nucleus_sampling(self,logits,p = torch.tensor(0.95)):
        """Sample one token id per row using nucleus (top-p) sampling.

        The nucleus is the shortest prefix of the probability-sorted vocabulary
        whose cumulative probability reaches ``p``; the token that crosses the
        threshold is included, so the most likely token is always a candidate.

        Args:
            logits: Float tensor with the vocabulary on the last dimension;
                ``custom_decode`` passes the last-position logits of each row.
            p: Cumulative-probability threshold (float or 0-dim tensor).

        Returns:
            Long tensor with one sampled vocabulary id per row, shape (rows, 1).
        """
        # Probabilities sorted from most to least likely.
        probs = F.softmax(logits, dim=-1)
        probs_sorted, indices = torch.sort(probs, dim=-1, descending=True)

        # Positions whose cumulative probability already exceeds the threshold.
        probs_cum_sum = torch.cumsum(probs_sorted, dim=-1)
        exceeds_p = probs_cum_sum > p

        # Shift the mask right by one so the threshold-crossing token is kept
        # and position 0 is never removed.
        outside_nucleus = torch.zeros_like(exceeds_p)
        outside_nucleus[..., 1:] = exceeds_p[..., :-1]

        # Zero the non-candidates and renormalise the remaining probabilities.
        nucleus_probs = probs_sorted.masked_fill(outside_nucleus, 0.0)
        nucleus_probs = nucleus_probs / nucleus_probs.sum(dim=-1, keepdim=True)

        # Sample a sorted position, then map it back to a vocabulary id.
        token_index = torch.multinomial(nucleus_probs, 1)
        predicted_tokens = torch.gather(indices, -1, token_index)

        return predicted_tokens

    def topk_sampling(self,logits, k = 10):
        """Sample one token id per row among the ``k`` highest-scoring tokens.

        Args:
            logits: Float tensor with the vocabulary on the last dimension.
            k: Number of candidates kept per row.

        Returns:
            Long tensor with one sampled vocabulary id per row, shape (rows, 1).
        """
        # Keep the k largest logits together with their vocabulary ids.
        top_logits, tokens = torch.topk(logits, k, dim=-1)
        # Softmax over the survivors only.
        probs = F.softmax(top_logits, dim=-1)
        # Sample a candidate position and translate it to a vocabulary id.
        token_index = torch.multinomial(probs, 1)
        predicted_tokens = torch.gather(tokens, -1, token_index)
        return predicted_tokens

    def custom_decode(self,source_token_id, source_mask):
        """Decode token by token with the strategy named by ``self.mode_decoding``.

        A sequence that has produced the end-of-sequence token is padded from
        then on, and decoding stops early once every sequence has finished.

        Args:
            source_token_id: Source token ids, one row per sentence.
            source_mask: Attention mask matching ``source_token_id``.

        Returns:
            Long tensor of decoded ids, starting with the decoder start token
            and holding at most ``self.target_max_length`` further tokens.
        """
        batch_size = source_token_id.shape[0]
        device = source_token_id.device

        decoded_ids = torch.full((batch_size, 1),
                                 self.model.config.decoder_start_token_id,
                                 dtype=torch.long).to(device)

        # End-of-sequence bookkeeping; skipped if the config defines no EOS id.
        eos_token_id = getattr(self.model.config, 'eos_token_id', None)
        pad_token_id = getattr(self.model.config, 'pad_token_id', None)
        if pad_token_id is None:
            pad_token_id = eos_token_id
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

        # The encoder runs once; its output is reused at every step.
        encoder_hidden_states = self.model.get_encoder()(source_token_id,
                                                         attention_mask=source_mask)
        for step in range(self.target_max_length):
            logits, _, _ = self.model(decoder_input_ids=decoded_ids,
                                      encoder_outputs=encoder_hidden_states,
                                      attention_mask=source_mask)
            next_token_logits = logits[:, -1, :]

            if self.mode_decoding == 'topk':
                next_token_id = self.topk_sampling(next_token_logits)
            elif self.mode_decoding == 'nucleus':
                next_token_id = self.nucleus_sampling(next_token_logits)
            else:
                # 'greedy' (also the fallback): pick the highest-scoring token.
                next_token_id = next_token_logits.argmax(-1).unsqueeze(-1)

            if eos_token_id is not None:
                # Sequences that already ended only receive padding.
                next_token_id = next_token_id.masked_fill(finished.unsqueeze(-1),
                                                          pad_token_id)
                finished = finished | (next_token_id.squeeze(-1) == eos_token_id)

            decoded_ids = torch.cat([decoded_ids, next_token_id], dim=-1)

            if eos_token_id is not None and bool(finished.all()):
                break

        return decoded_ids

    def forward(self, source_token_ids, source_mask, target_token_ids=None,
                target_mask=None):
        """Return the loss in training mode, otherwise the predicted token ids.

        Args:
            source_token_ids: Source token ids.
            source_mask: Attention mask for ``source_token_ids``.
            target_token_ids: Target ids used as labels in training mode.
            target_mask: Accepted for interface symmetry; not used here.
        """
        if self.training:
            loss, _, _, _ = self.model(input_ids=source_token_ids,
                                       attention_mask=source_mask,
                                       lm_labels=target_token_ids)
            return loss

        if self.custom_decoding:
            return self.custom_decode(source_token_ids, source_mask)

        # The attention mask is forwarded so padded source positions are ignored.
        generate_kwargs = dict(input_ids=source_token_ids,
                               attention_mask=source_mask,
                               max_length=self.target_max_length)
        if self.mode_decoding == 'topk':
            generate_kwargs.update(do_sample=True, top_k=10)
        elif self.mode_decoding == 'nucleus':
            # top_k=0 leaves only the top-p filter active.
            generate_kwargs.update(do_sample=True, top_p=0.95, top_k=0)
        # Any other mode (i.e. 'greedy') uses generate's non-sampling default.
        predicted_token_ids = self.model.generate(**generate_kwargs)
        return predicted_token_ids

    def _evaluate_batch(self, batch):
        """Translate one batch; return its BLEU score and its first example."""
        source_token_ids, source_mask, target_token_ids, target_mask, source, target = batch
        target = list(target)

        # BLEU of this batch's predictions against its references.
        tokens_predicted = self(source_token_ids, source_mask, target_token_ids, target_mask)
        sentences_predicted = [self.tokenizer.decode(x) for x in tokens_predicted.tolist()]
        bleu = sacrebleu.corpus_bleu(sentences_predicted, [target]).score

        # First example of the batch, kept for inspection at epoch end.
        sentences = {}
        sentences['Target'] = target[0]
        sentences['Source'] = source[0]
        sentences['Predicted'] = sentences_predicted[0]
        return bleu, sentences

    @staticmethod
    def _print_sample(sentences):
        """Print one source/target/predicted triple surrounded by blank lines."""
        print('\n\n')
        print("Source:{}".format(sentences['Source']))
        print("Target: {}".format(sentences['Target']))
        print("Predicted: {}".format(sentences['Predicted']))
        print('\n\n')

    def training_step(self, batch, batch_nb):
        """Compute the training loss for one batch and report GPU usage."""
        source_token_ids, source_mask, target_token_ids, target_mask, _, _ = batch

        loss = self(source_token_ids, source_mask, target_token_ids, target_mask)

        tensorboard_logs = {'train_loss': loss}
        progress_bar = {'gpu_usage': gpu_usage()}
        return {'loss': loss, 'log': tensorboard_logs,
                'progress_bar': progress_bar}

    def validation_step(self, batch, batch_nb):
        """Score one validation batch with BLEU and keep its first example."""
        avg_bleu, sentences = self._evaluate_batch(batch)
        progress_bar = {'gpu_usage': gpu_usage()}
        return {'val_bleu': avg_bleu, 'progress_bar': progress_bar, 'sentences': sentences}

    def test_step(self, batch, batch_nb):
        """Score one test batch with BLEU and keep its first example."""
        avg_bleu, sentences = self._evaluate_batch(batch)
        progress_bar = {'gpu_usage': gpu_usage()}
        return {'test_bleu': avg_bleu, 'progress_bar': progress_bar, 'sentences': sentences}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation BLEU and print one example."""
        avg_bleu = sum([x['val_bleu'] for x in outputs]) / len(outputs)
        sentences_dict = [x['sentences'] for x in outputs]

        # Only the first collected example is shown.
        self._print_sample(sentences_dict[0])

        tensorboard_logs = {'avg_val_bleu': avg_bleu}
        return {'avg_val_bleu': avg_bleu, 'progress_bar': tensorboard_logs, 'log' : tensorboard_logs}

    def test_epoch_end(self, outputs):
        """Average the per-batch test BLEU and print one example."""
        avg_bleu = sum([x['test_bleu'] for x in outputs]) / len(outputs)
        sentences_dict = [x['sentences'] for x in outputs]

        tensorboard_logs = {'avg_test_bleu': avg_bleu}

        # Only the first collected example is shown.
        self._print_sample(sentences_dict[0])

        return {'avg_test_bleu': avg_bleu, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        """Return an Adam optimizer over the parameters that require gradients."""
        return torch.optim.Adam(
            [p for p in self.parameters() if p.requires_grad],
            lr=self.learning_rate, eps=1e-08)

    def train_dataloader(self):
        """Return the training dataloader given at construction."""
        return self._train_dataloader

    def val_dataloader(self):
        """Return the validation dataloader given at construction."""
        return self._val_dataloader

    def test_dataloader(self):
        """Return the test dataloader given at construction."""
        return self._test_dataloader

# Carregando dados

In [None]:
# Download the gzipped train and test TSV files; `-nc` skips files that are already present.
! wget -nc https://storage.googleapis.com/neuralresearcher_data/unicamp/ia376e_2020s1/paracrawl_enpt_train.tsv.gz
! wget -nc https://storage.googleapis.com/neuralresearcher_data/unicamp/ia376e_2020s1/paracrawl_enpt_test.tsv.gz

File ‘paracrawl_enpt_train.tsv.gz’ already there; not retrieving.

File ‘paracrawl_enpt_test.tsv.gz’ already there; not retrieving.



In [None]:
def load_text_pairs(path):
    """Read a gzip-compressed, tab-separated text file into a list of field lists.

    Args:
        path: Path to the ``.gz`` file; each line holds tab-separated fields
            (source and target sentence in this notebook's data).

    Returns:
        One list per line, with the line's whitespace-stripped text split on tabs.
    """
    # The context manager closes the file handle even if reading fails; the
    # encoding is stated explicitly instead of depending on the locale.
    with gzip.open(path, mode='rt', encoding='utf-8') as tsv_file:
        return [line.strip().split('\t') for line in tsv_file]

x_train = load_text_pairs('paracrawl_enpt_train.tsv.gz')
x_test = load_text_pairs('paracrawl_enpt_test.tsv.gz')

# Shuffle the training pairs in place before carving out the validation split.
random.shuffle(x_train)

# Keep the first 100k pairs for training and the next 5k pairs for validation.
x_train, x_val = x_train[:100000], x_train[100000:105000]

# Report the size of each split and preview its first three pairs.
# Note: the loop variables `source` and `target` remain defined at notebook level afterwards.
for set_name, x in (('treino', x_train), ('validação', x_val), ('test', x_test)):
    print(f'\n{len(x)} amostras de {set_name}')
    print(f'3 primeiras amostras {set_name}:')
    for i, (source, target) in enumerate(x[:3]):
        print(f'{i}: source: {source}\n   target: {target}')


100000 amostras de treino
3 primeiras amostras treino:
0: source: More Croatian words and phrases
   target: Mais palavras e frases em croata
1: source: Jerseys and pullovers, containing at least 50Â % by weight of wool and weighing 600Â g or more per article 6110 11 10 (PCE)
   target: Camisolas e pulôveres, com pelo menos 50 %, em peso, de lã e pesando 600g ou mais por unidade 6110 11 10 (PCE)
2: source: Atex Colombia SAS makes available its lead product, 100% natural liquid latex, excellent quality and price. ... Welding manizales caldas Colombia a DuckDuckGo
   target: Atex Colômbia SAS torna principal produto está disponível, látex líquido 100% natural, excelente qualidade e preço. ...

5000 amostras de validação
3 primeiras amostras validação:
0: source: «You have hidden these things from the wise and the learned you have revealed them to the childlike»
   target: «Escondeste estas coisas aos sábios e entendidos e as revelaste aos pequenos»
1: source: Repair of computers, applic

In [None]:
# Instantiate the tokenizer for the checkpoint named by the current `model_name`.
tokenizer = T5Tokenizer.from_pretrained(model_name)


class MyDataset(Dataset):
    """Dataset that tokenizes (source, target) text pairs when they are accessed.

    Args:
        text_pairs: Sequence of (source, target) string pairs.
        tokenizer: Object exposing `eos_token` and an `encode_plus` method that
            returns a dict with `input_ids` and `attention_mask`.
        source_max_length: `max_length` passed to the tokenizer for the source.
        target_max_length: `max_length` passed to the tokenizer for the target.
    """

    def __init__(self, text_pairs: List[Tuple[str, str]], tokenizer,
                 source_max_length: int = 32, target_max_length: int = 32):
        self.tokenizer = tokenizer
        self.text_pairs = text_pairs
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        self.tokenizer_eos = self.tokenizer.eos_token

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.text_pairs)

    def __getitem__(self, idx):
        """Tokenize the pair stored at `idx`.

        Returns:
            A tuple `(source_token_ids, source_mask, target_token_ids,
            target_mask, original_source, original_target)`. The first four
            items are tensors built from the tokenizer output; the last two
            are the untouched input strings.
        """
        source, target = self.text_pairs[idx]

        # Keep the raw strings so callers can print or score them later.
        original_source = source
        original_target = target

        # Prepend the task prefix and append the tokenizer's EOS marker to the source.
        # NOTE(review): no EOS marker is appended to the target - confirm this is intended.
        source = 'translate English to Portuguese: ' + source + ' ' + self.tokenizer_eos

        # Use the tokenizer handed to the constructor. The previous code read a
        # notebook-level global named `tokenizer`, which silently ignored the
        # constructor argument and failed when that global did not exist.
        source_dict = self.tokenizer.encode_plus(
            source, max_length=self.source_max_length, pad_to_max_length=True)
        target_dict = self.tokenizer.encode_plus(
            target, max_length=self.target_max_length, pad_to_max_length=True)

        source_token_ids = torch.tensor(source_dict['input_ids'])
        source_mask = torch.tensor(source_dict['attention_mask'])
        target_token_ids = torch.tensor(target_dict['input_ids'])
        target_mask = torch.tensor(target_dict['attention_mask'])

        return (source_token_ids, source_mask, target_token_ids, target_mask,
                original_source, original_target)

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model from cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f


In [None]:
# Tokenizer and length limits are identical for the three splits, so they are
# collected once and unpacked into every MyDataset call.
shared_dataset_args = dict(tokenizer=tokenizer,
                           source_max_length=source_max_length,
                           target_max_length=target_max_length)

dataset_train = MyDataset(text_pairs=x_train, **shared_dataset_args)
dataset_val = MyDataset(text_pairs=x_val, **shared_dataset_args)
dataset_test = MyDataset(text_pairs=x_test, **shared_dataset_args)

# Only the training loader shuffles its samples; validation and test keep the
# dataset order.
train_dataloader = DataLoader(dataset_train,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=4)
val_dataloader = DataLoader(dataset_val,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=4)
test_dataloader = DataLoader(dataset_test,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=4)

# Overfit em 1 Batch

In [None]:
# Sanity-check run: fit for 10 epochs without saving checkpoints.
# NOTE(review): `overfit_pct=0.005` presumably limits the run to a small
# fraction of the data - confirm against the pytorch-lightning 0.7.5 docs.
trainer = pl.Trainer(gpus = 1, 
                    max_epochs=10,
                    check_val_every_n_epoch=1,
                    checkpoint_callback=False,  # Disable checkpoint saving
                    overfit_pct=0.005, 
                     weights_summary=None)

# Debug dataset. It wraps the full list of training pairs (not a single
# batch); any restriction to a subset has to come from the Trainer settings.
dataset_debug = MyDataset(text_pairs=x_train,
                          tokenizer=tokenizer,
                          source_max_length=source_max_length,
                          target_max_length=target_max_length)

# Unshuffled loader, so the samples are served in dataset order.
debug_dataloader = DataLoader(dataset_debug, batch_size=batch_size,
                              shuffle=False, num_workers=4)

# The same loader is used for training and validation; no test loader is set.
model = T5Finetuner(tokenizer=tokenizer,
                    train_dataloader=debug_dataloader,
                    val_dataloader=debug_dataloader,
                    test_dataloader=None,
                    learning_rate=learning_rate,
                    custom_decoding=True, 
                    mode_decoding= 'nucleus')

trainer.fit(model)
del model  # Drop the reference to avoid running out of GPU memory

In [None]:
max_epochs = 4

# Checkpoint that the trainer resumes from when it already exists.
checkpoint_path = '/content/drive/My Drive/aula9_checkpoints/epoch=3.ckpt'
checkpoint_dir = os.path.dirname(os.path.abspath(checkpoint_path))
# Create the directory up front: on a first run it does not exist yet and
# os.listdir() below would raise FileNotFoundError before anything is saved.
os.makedirs(checkpoint_dir, exist_ok=True)
print(f'Files in {checkpoint_dir}: {os.listdir(checkpoint_dir)}')
print(f'Saving checkpoints to {checkpoint_dir}')
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_dir,
                                      save_top_k=-1)  # Keeps all checkpoints.

# Resume only when the expected checkpoint file is present; otherwise the
# trainer starts without one.
resume_from_checkpoint = None
if os.path.exists(checkpoint_path):
    print(f'Restoring checkpoint: {checkpoint_path}')
    resume_from_checkpoint = checkpoint_path

trainer = pl.Trainer(gpus=1,
                     max_epochs=max_epochs,
                     check_val_every_n_epoch=1,
                     profiler=True,
                     accumulate_grad_batches=accumulate_grad_batches,
                     checkpoint_callback=checkpoint_callback,
                     progress_bar_refresh_rate=10,
                     resume_from_checkpoint=resume_from_checkpoint,
                     weights_summary=None)

# The decoding settings given here are only initial values; the evaluation
# loop later in the notebook overwrites both attributes before each test run.
model = T5Finetuner(tokenizer=tokenizer,
                    train_dataloader=train_dataloader,
                    val_dataloader=val_dataloader,
                    test_dataloader=test_dataloader,
                    learning_rate=learning_rate,
                    custom_decoding=False,
                    mode_decoding='greedy')

INFO:lightning:GPU available: True, used: True
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]


Files in /content/drive/My Drive/aula9_checkpoints: ['epoch=0.ckpt', 'epoch=0_v0.ckpt', 'epoch=1.ckpt', 'epoch=0_v1.ckpt', 'epoch=1_v0.ckpt', 'epoch=0_v2.ckpt', 'epoch=1_v1.ckpt', 'epoch=0_v3.ckpt', 'epoch=1_v2.ckpt', 'epoch=2.ckpt', 'epoch=3.ckpt']
Saving checkpoints to /content/drive/My Drive/aula9_checkpoints
Restoring checkpoint: /content/drive/My Drive/aula9_checkpoints/epoch=3.ckpt


INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json from cache at /root/.cache/torch/transformers/26561bc9e840d8945f475d0d4c4b9df32025eadd79894b867b570cb1d09e67a9.3817cc1260a6b941b17af62b4f2a942b9825f209d8e2eed99e79e96f85f59aab
INFO:transformers.configuration_utils:Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
    

In [None]:
# Decoding strategies compared on the test set.
modes = [
    'greedy',
    'topk',
    'nucleus',
]

# Flag values tried for every strategy; the evaluation loop labels a True
# flag as "Custom Decoding" and a False flag as "Native Decoding".
custom_decoding = [
    True,
    False,
]

In [None]:
# Run the test set once for every (decoding strategy, implementation flag) pair.
for mode in modes:
    for custom_flag in custom_decoding:
        # Pick the banner that tells the custom run apart from the native one.
        banner = "Custom Decoding: {}" if custom_flag else "Native Decoding: {}"
        print(banner.format(mode))

        # Reconfigure the existing model in place, then evaluate it.
        model.custom_decoding = custom_flag
        model.mode_decoding = mode
        trainer.test(model)


Custom Decoding: greedy


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…




Source:In this way, the civil life of a nation matures, making it possible for all citizens to enjoy the fruits of genuine tolerance and mutual respect.
Target: Deste modo, a vida civil de uma nação amadurece, fazendo com que todos os cidadãos gozem dos frutos da tolerância genuína e do respeito mútuo.
Predicted: : Assim, a vida civil de uma naç ⁇ o madura, tornando poss



--------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_bleu': 14.20403907865615}
--------------------------------------------------------------------------------

Native Decoding: greedy


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…




Source:In this way, the civil life of a nation matures, making it possible for all citizens to enjoy the fruits of genuine tolerance and mutual respect.
Target: Deste modo, a vida civil de uma nação amadurece, fazendo com que todos os cidadãos gozem dos frutos da tolerância genuína e do respeito mútuo.
Predicted: : Assim, a vida civil de uma naç ⁇ o madura, tornando pos



--------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_bleu': 13.654318474953346}
--------------------------------------------------------------------------------

Custom Decoding: topk


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…






Source:In this way, the civil life of a nation matures, making it possible for all citizens to enjoy the fruits of genuine tolerance and mutual respect.
Target: Deste modo, a vida civil de uma nação amadurece, fazendo com que todos os cidadãos gozem dos frutos da tolerância genuína e do respeito mútuo.
Predicted: : Assim a vida civil de um mundo madura e tornando a todos 



--------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_bleu': 11.080831520151133}
--------------------------------------------------------------------------------

Native Decoding: topk


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…




Source:In this way, the civil life of a nation matures, making it possible for all citizens to enjoy the fruits of genuine tolerance and mutual respect.
Target: Deste modo, a vida civil de uma nação amadurece, fazendo com que todos os cidadãos gozem dos frutos da tolerância genuína e do respeito mútuo.
Predicted: : Assim se madria a vida civil de uma naç ⁇ o, tornando



--------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_bleu': 10.626852864767901}
--------------------------------------------------------------------------------

Custom Decoding: nucleus


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…






Source:In this way, the civil life of a nation matures, making it possible for all citizens to enjoy the fruits of genuine tolerance and mutual respect.
Target: Deste modo, a vida civil de uma nação amadurece, fazendo com que todos os cidadãos gozem dos frutos da tolerância genuína e do respeito mútuo.
Predicted: : Assim a vida civil de uma naç ⁇ o se matura, tornando poss



--------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_bleu': 11.762992210742995}
--------------------------------------------------------------------------------

Native Decoding: nucleus


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…




Source:In this way, the civil life of a nation matures, making it possible for all citizens to enjoy the fruits of genuine tolerance and mutual respect.
Target: Deste modo, a vida civil de uma nação amadurece, fazendo com que todos os cidadãos gozem dos frutos da tolerância genuína e do respeito mútuo.
Predicted: : Tal modo, a vida civil de uma naç ⁇ o for dou por tornar també



--------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_bleu': 10.648199005008646}
--------------------------------------------------------------------------------

