# Inicialização das variáveis

In [None]:
# Path of a checkpoint to resume from; None starts a fresh fine-tuning run.
# NOTE(review): when this is set, the preset below is skipped, so
# `transformer_model_name` and `model_type` must be defined by hand.
RESUME_FROM_CHECKPOINT=None

inserir_beginoftext_token = True # Insert a '<|target_bos|>' token between the prompt and the answer for GPT-like models

# Summarization tasks may need a larger value
MAX_TOKEN_GENERATION_LENGTH=60

output_dir="./fine-tuned-model"
## The architecture type must be specified
# model_type='decoder'
# model_type='encoder-decoder'

epochs = 2
dropout_rate=0.1
BATCH_SIZE = 64
EVAL_BATCH_SIZE=64

# Number of examples that should contribute to every optimizer step
TARGET_EFFECTIVE_BATCH_SIZE = 128

if not RESUME_FROM_CHECKPOINT:
    # Model presets: keep exactly one of the lines below uncommented
    transformer_model_name='tgsc/ult5-pt-small'; model_type='encoder-decoder'; dropout_rate=0.0; BATCH_SIZE = 64; EVAL_BATCH_SIZE=64; #prefix_input='<|NLU|>' # <|NLG|>

    # transformer_model_name='pierreguillou/gpt2-small-portuguese'; model_type='decoder'; BATCH_SIZE = 16; EVAL_BATCH_SIZE=16

    # transformer_model_name='unicamp-dl/ptt5-small-portuguese-vocab'; model_type='encoder-decoder'; BATCH_SIZE = 64; EVAL_BATCH_SIZE=64;
    # transformer_model_name='unicamp-dl/ptt5-base-portuguese-vocab'; model_type='encoder-decoder'; BATCH_SIZE = 10; EVAL_BATCH_SIZE=10
    # transformer_model_name='unicamp-dl/ptt5-large-portuguese-vocab'; model_type='encoder-decoder'; BATCH_SIZE = 5; EVAL_BATCH_SIZE=5    

# Accumulate gradients until roughly TARGET_EFFECTIVE_BATCH_SIZE examples have been seen.
# The floor division yields 0 for batch sizes above the target, so clamp to at least 1.
gradient_accumulation_steps = max(1, TARGET_EFFECTIVE_BATCH_SIZE // BATCH_SIZE)

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

import multiprocessing

num_proc = multiprocessing.cpu_count()
print('cpu_count:',num_proc)

In [None]:
!pip install transformers datasets accelerate
!pip install sentencepiece
!pip install evaluate
!pip install rouge_score

# Carrega o tokenizer

In [None]:
import transformers as transformers
from transformers import AutoTokenizer, AutoConfig, T5Tokenizer

# Load the tokenizer of the selected checkpoint
tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)

# For decoder models, register a padding token and (optionally) a token that separates
# the input from the answer, so the model can tell where the task output starts
if model_type == 'decoder':
    tokenizer.add_special_tokens(dict(pad_token='<|pad|>'))
    # tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    if inserir_beginoftext_token:
        target_bos_token = '<|target_bos|>'
        tokenizer.add_special_tokens(dict(additional_special_tokens=[target_bos_token]))

# # Descomentar para adicionar tokens no vocabulário do tokenizer para virar múltiplo de 8 (recomendação NVIDIA para uso dos Tensor Cores com fp16)
# i = 0
# while len(tokenizer) % 8 !=0 :
#     tokenizer.add_special_tokens({ "additional_special_tokens": [f"<|vocab_pad_{i}"] })
#     i+=1

# Carrega o modelo

In [None]:
import transformers
from transformers import AutoModel, AutoModelForSeq2SeqLM, AutoModelForCausalLM, BertLMHeadModel
import torch

# Instantiate the pretrained model class that matches the configured architecture
if model_type=='encoder-decoder':
    model = AutoModelForSeq2SeqLM.from_pretrained(transformer_model_name,dropout_rate=dropout_rate)
elif model_type=='decoder':
    model = AutoModelForCausalLM.from_pretrained(transformer_model_name)
else:
    raise ValueError('tipo de arquitetura deve ser "decoder" ou "encoder-decoder"')

# Resize the embedding matrix to the tokenizer's (possibly extended) vocabulary
model.resize_token_embeddings(len(tokenizer))
# NOTE(review): this sets a plain attribute on the model object; confirm that the
# generation code later in the notebook actually reads `model.max_length`.
model.max_length=MAX_TOKEN_GENERATION_LENGTH

print(model.config)
model_size = sum(t.numel() for t in model.parameters())
# Read the context length from `n_positions`, falling back to
# `max_position_embeddings` when the config has no such attribute
try:
    context_length=model.config.n_positions
except AttributeError:
    context_length=model.config.max_position_embeddings
print(f"Model size: {model_size/1000**2:.1f}M parameters")

# Carrega os datasets e cria as métricas

Para adicionar o dataset ao benchmark, basta descomentar o bloco (atalho CTRL + ";" )

Datasets

```
Perguntas e respostas: SQUAD
Correção semântica/gramatical: COLA
Implicações lógicas: ASSIN 2, RTE, SCITAIL, MNLI, SNLI
Paráfrase: MRPC e QQP
Score de similaridade: ASSIN 2, STSB
Classificação de sentimentos: SST2
Sumarização: WIKI LINGUA e XLSUM
```

Grupos recomendados para fine-tune em conjunto (devido ao grande tamanho de alguns datasets)
```
[ASSIN 2, COLA, MRPC, RTE, STSB], SCITAIL opcionalmente
[SQUAD V1.1]
[WIKI LINGUA, XLSUM]
[SNLI, MNLI]
```

In [None]:
# Dataset containers, keyed by benchmark name
ds, ds_processado, ds_tokenizado = {}, {}, {}

# Per-benchmark registries: the identifier is used to tell which benchmark an
# example belongs to while the metrics are computed
identificador_de_dataset, metric_functions, map_functions = {}, {}, {}

# Seed of the identifier counter; the original note says the counter identifies
# the dataset inside the model's data_collator
if model_type=='decoder':
    contador = 10_000 + len(tokenizer)
elif model_type=="encoder-decoder":
    contador = 10_000

def criar_identificador():
    """Advance the module-level `contador` by one and return its new value."""
    global contador
    contador = contador + 1
    return contador

## ASSIN 2

https://sites.google.com/view/assin2

https://huggingface.co/datasets/assin2

### Métrica calculada pelo código do script de avaliação
https://github.com/erickrf/assin

In [None]:
# Código ajustado de https://github.com/erickrf/assin

import numpy as np
from sklearn.metrics import f1_score
from scipy.stats import pearsonr

class Pair(object):
    '''
    Lightweight record holding the two annotations of a text pair:
    its entailment class and its similarity score.
    '''
    def __init__(self, entailment, similarity):
        '''
        :param entailment: entailment class of the pair (stored as given)
        :param similarity: similarity score of the pair (stored as given)
        '''
        self.entailment, self.similarity = entailment, similarity

def eval_rte(pairs_gold, pairs_sys):
    '''
    Evaluate the RTE output of the system against the gold annotation.

    :param pairs_gold: sequence of objects exposing a gold `entailment` attribute
    :param pairs_sys: sequence of objects exposing a predicted `entailment` attribute
    :return: tuple `(accuracy, macro_f1)`; `(None, None)` when there is no
        system output to evaluate, so callers can always unpack two values
    '''
    # Nothing to score: empty system output or no entailment value
    if len(pairs_sys) == 0 or pairs_sys[0].entailment is None:
        print()
        print('No RTE output to evaluate')
        return None, None

    gold_values = np.array([p.entailment for p in pairs_gold])
    sys_values = np.array([p.entailment for p in pairs_sys])
    # Macro-F1 is averaged over the classes that occur in the gold annotation
    label_set = set(gold_values)
    macro_f1 = f1_score(gold_values, sys_values, average='macro',
                        labels=list(label_set))
    accuracy = (gold_values == sys_values).sum() / len(gold_values)

    return accuracy, macro_f1

def eval_similarity(pairs_gold, pairs_sys):
    '''
    Evaluate the semantic similarity output of the system against the gold scores.

    :param pairs_gold: sequence of objects exposing a gold `similarity` attribute
    :param pairs_sys: sequence of objects exposing a predicted `similarity` attribute
    :return: tuple `(pearson, mse)`; `(None, None)` when there is no
        system output to evaluate, so callers can always unpack two values
    '''
    # Nothing to score: empty system output or no similarity value
    if len(pairs_sys) == 0 or pairs_sys[0].similarity is None:
        print()
        print('No similarity output to evaluate')
        return None, None

    gold_values = np.array([p.similarity for p in pairs_gold])
    sys_values = np.array([p.similarity for p in pairs_sys])
    pearson = pearsonr(gold_values, sys_values)[0]
    # Signed differences; squaring below makes the sign irrelevant
    diff = gold_values - sys_values
    mse = (diff ** 2).mean()

    return pearson, mse

### Similarity score

In [None]:
import re
from evaluate import load

def assin2_score_metric(predictions,labels):
    """Compute Pearson correlation and MSE for the ASSIN 2 similarity task.

    Args:
        predictions: list of generated strings; the first number found in each
            string is used as the predicted score.
        labels: list of gold scores (anything `float()` accepts).

    Returns:
        dict with keys 'assin2_score_pearson' and 'assin2_score_mse'.

    Note:
        Both lists are overwritten in place with the numeric values used for
        scoring, as in the original implementation.
    """
    pairs_labels = []
    pairs_preds = []

    for i in range(0,len(predictions)):
        # Look for a number in the generated string
        pred_numbers = re.findall(r"[-+]?(?:\d*\.*\d+)",predictions[i])

        try:
            pred_value = float(pred_numbers[0])
            gold_value = float(labels[i])
        except (IndexError, TypeError, ValueError):
            # No parsable number (or unparsable label): force the pair to the two
            # ends of the scale so the failure is penalised as much as possible
            pred_value, gold_value = float(0), float(5)

        # Clamp the prediction to the valid score range [0, 5]
        if pred_value>5:
            pred_value=5
        elif pred_value<0:
            pred_value=0

        predictions[i] = pred_value
        labels[i] = gold_value

        pairs_labels.append(Pair( 0, gold_value))
        pairs_preds.append(Pair( 0, pred_value))

    pearson, mse = eval_similarity(pairs_labels, pairs_preds)

    return {'assin2_score_pearson': pearson, 'assin2_score_mse': mse}

def assin2_score_map_fc(examples):
    """Turn a batch of ASSIN 2 rows into text-to-text similarity examples.

    Args:
        examples: batch (mapping of column name -> list) with the columns
            'premise', 'hypothesis' and 'relatedness_score'.

    Returns:
        dict with 'text' (prompts) and 'labels' (target strings ending with the
        tokenizer's EOS token). Rows whose score is the -1 sentinel are dropped.
    """
    new_examples = { 'text':[], 'labels':[]}

    first_key=list(examples.keys())[0]
    for i in range(0,len(examples[first_key])):
        prompt=f'assin2_similaridade premissa: {examples["premise"][i]}'
        prompt+=f' hipótese: {examples["hypothesis"][i]}'

        score = examples['relatedness_score'][i]
        # The score column is cast to string when the dataset is loaded, so the
        # -1 sentinel has to be compared numerically rather than with `== -1`.
        try:
            is_unlabelled = float(score) == -1
        except (TypeError, ValueError):
            is_unlabelled = False
        if is_unlabelled:
            continue

        label = ' ' + str(score)
        # label = 'pontuação de similaridade: ' + str(examples['label'][i])

        if model_type=='decoder':
            if inserir_beginoftext_token:
                prompt += target_bos_token
            else:
                label = ' pontuação de similaridade: ' + label
        label += tokenizer.eos_token

        new_examples['text'].append(prompt)
        new_examples['labels'].append(label)

    return new_examples

import datasets

# Load ASSIN 2 and use its test split as the validation split
ds['assin2_score'] = datasets.load_dataset("assin2", cache_dir="./cache")
ds['assin2_score']['validation'] = ds['assin2_score'].pop('test')

# Register the benchmark: identifier, metric function and mapping function
identificador_de_dataset['assin2_score'] = criar_identificador()
metric_functions['assin2_score'], map_functions['assin2_score'] = assin2_score_metric, assin2_score_map_fc

# Re-type the relatedness score column as string
new_features = ds['assin2_score']['train'].features.copy()
new_features['relatedness_score'] = datasets.Value(dtype='string', id=None)
ds['assin2_score'] = ds['assin2_score'].cast(new_features)

ds['assin2_score']

### Entailment

In [None]:
from evaluate import load

def assin2_entail_metric(predictions,labels):
    """Compute accuracy and macro-F1 for the ASSIN 2 entailment task.

    Both lists are rewritten in place: the two known class strings become 0/1,
    and any other prediction is replaced by the class opposite to its label so
    that it counts as an error.
    """
    class_ids = {'Não é implicação lógica': 0, 'É implicação lógica': 1}

    gold_pairs, system_pairs = [], []
    for idx in range(len(labels)):
        if labels[idx] in class_ids:
            labels[idx] = class_ids[labels[idx]]

        if predictions[idx] in class_ids:
            predictions[idx] = class_ids[predictions[idx]]
        else:
            # Unrecognised output: assign the opposite of the label
            predictions[idx] = 1 if labels[idx] == 0 else 0

        gold_pairs.append(Pair(labels[idx], 0))
        system_pairs.append(Pair(predictions[idx], 0))

    accuracy, macro_f1 = eval_rte(gold_pairs, system_pairs)

    return {'assin2_entail_acc': accuracy, 'assin2_entail_f1' : macro_f1}


def assin2_entail_map_fc(examples):
    """Turn a batch of ASSIN 2 rows into text-to-text entailment examples.

    Args:
        examples: batch (mapping of column name -> list) with the columns
            'premise', 'hypothesis' and 'entailment_judgment'.

    Returns:
        dict with 'text' (prompts) and 'labels' (target strings ending with the
        tokenizer's EOS token). Rows whose judgment is neither 0 nor 1 are dropped.
    """
    new_examples = { 'text':[], 'labels':[]}

    first_key=list(examples.keys())[0]
    for i in range(0,len(examples[first_key])):
        prompt=f'assin2_entail premissa: {examples["premise"][i]}'
        prompt+=f' hipótese: {examples["hypothesis"][i]}'
        if examples['entailment_judgment'][i] == 0:
            label = 'Não é implicação lógica'
        elif examples['entailment_judgment'][i] == 1:
            label = 'É implicação lógica'
        else:
            # Without this branch an unexpected judgment would silently reuse the
            # previous row's label (or fail on the first row); skip the row instead,
            # as the other mapping functions do for unlabelled examples.
            continue

        if model_type=='decoder':
            if inserir_beginoftext_token:
                prompt += target_bos_token
            else:
                label = ' julgamento: ' + label
        label += tokenizer.eos_token

        new_examples['text'].append(prompt)
        new_examples['labels'].append(label)

    return new_examples

import datasets

# Load ASSIN 2 and use its test split as the validation split
ds['assin2_entail'] = datasets.load_dataset("assin2", cache_dir="./cache")
ds['assin2_entail']['validation'] = ds['assin2_entail'].pop('test')

# Register the benchmark: identifier, metric function and mapping function
identificador_de_dataset['assin2_entail'] = criar_identificador()
metric_functions['assin2_entail'], map_functions['assin2_entail'] = assin2_entail_metric, assin2_entail_map_fc
ds['assin2_entail']

## SQUAD v1.1

Traduzido pelo grupo Deep Learning Brasil (http://www.deeplearningbrasil.com.br/)

Train: 87510 exemplos, Validation: 10570 exemplos

In [None]:
# from evaluate import load

# import re
# import string
# def normalize_answer(s):
#     # https://github.com/huggingface/evaluate/blob/main/metrics/squad/compute_score.py
#     """Lower text and remove punctuation, articles and extra whitespace."""

#     def remove_articles(text):
#         return re.sub(r"\b(o|a|os|as|um|uns|uma|umas)\b", " ", text)

#     def white_space_fix(text):
#         return " ".join(text.split())

#     def remove_punc(text):
#         exclude = set(string.punctuation)
#         return "".join(ch for ch in text if ch not in exclude)

#     def lower(text):
#         return text.lower()

#     return white_space_fix(remove_articles(remove_punc(lower(s))))

# def squad_v11_metric(predictions,labels):

#     for i in range(0,len(predictions)):
#       predictions[i] = normalize_answer(predictions[i])

#       predictions[i] = { 'prediction_text' : predictions[i] , 'id': labels[i]}
#       labels[i] = { 'answers': { 'text' : ds_squad_val_dict[labels[i]]['text'], 'answer_start': ds_squad_val_dict[labels[i]]['answer_start']} , 'id': labels[i]}

#       for j in range(0,len(labels[i]['answers']['text'])):
#           labels[i]['answers']['text'][j] = normalize_answer(labels[i]['answers']['text'][j])

#     squad_metric = load("squad")
#     result = squad_metric.compute(predictions=predictions, references=labels)

#     return {'squad_v1.1_acc': result['exact_match'], 'squad_v1.1_f1': result['f1']}

# def squad_v11_map_fc(examples):    
#     new_examples = { 'text':[], 'labels':[]}
   
#     first_key=list(examples.keys())[0]
    
#     for i in range(0,len(examples[first_key])):
#         input=f'squad Contexto: {examples["context"][i]}'
#         input+=f' Pergunta: {examples["question"][i]}'
#         # input+= ' A transcrição exata do trecho do contexto que responde a pergunta é:'

#         label = examples['answers'][i]['text'][0]

#         if model_type=='decoder':
#             if inserir_beginoftext_token:
#                 input += target_bos_token
#             else:
#                 label = ' Resposta: ' + label

#         label += tokenizer.eos_token

#         new_examples['text'].append(input)
#         new_examples['labels'].append(label)        
    
#     return new_examples

# import datasets

# def squad_v11_map_validation_fc(examples):    
#     new_examples = { 'text':[], 'labels':[]}
   
#     first_key=list(examples.keys())[0]
    
#     for i in range(0,len(examples[first_key])):
#         input=f'squad Contexto: {examples["context"][i]}'
#         input+=f' Pergunta: {examples["question"][i]}'
#         # input+= ' A transcrição exata do trecho do contexto que responde a pergunta é:'

#         label = examples['id'][i]

#         new_examples['text'].append(input)
#         new_examples['labels'].append(label)        
    
#     return new_examples

# import datasets

# ds['squad_v1.1'] = datasets.load_dataset("tgsc/squad-pt-v1.1",cache_dir="./cache")
# ds_squad_val = ds['squad_v1.1']['validation']
# ds_squad_val_dict = {} # dicionário para a avaliação do squad (os exemplos contêm lista de respostas)
# for i in range(len(ds_squad_val)):
#     ds_squad_val_dict[ds_squad_val[i]['id']]=ds_squad_val[i]['answers']

# identificador_de_dataset['squad_v1.1']=criar_identificador()
# metric_functions['squad_v1.1'] = squad_v11_metric
# map_functions['squad_v1.1'] = {'train': squad_v11_map_fc, 'validation': squad_v11_map_validation_fc }

# ds['squad_v1.1']

## PLUE - GLUE em português

Versão em português

https://huggingface.co/datasets/dlb/plue

https://github.com/jubs12/PLUE

### COLA

Train: 8551 exemplos, Validation: 1043 exemplos


In [None]:
from evaluate import load

def cola_metric(predictions,labels):
    """Compute exact-match accuracy and Matthews correlation for CoLA.

    Exact match is computed on the raw strings first. Both lists are then
    rewritten in place: the two known class strings become 0/1, and any other
    prediction is replaced by the class opposite to its label.
    """
    result_acc = load("exact_match").compute(predictions=predictions,references=labels)

    glue_metric = load('glue', 'cola')

    class_ids = {'Não é gramaticalmente aceitável': 0, 'Gramaticalmente correto': 1}
    for idx in range(len(labels)):
        if labels[idx] in class_ids:
            labels[idx] = class_ids[labels[idx]]

        if predictions[idx] in class_ids:
            predictions[idx] = class_ids[predictions[idx]]
        else:
            # Unrecognised output: assign the opposite of the label
            predictions[idx] = 1 if labels[idx] == 0 else 0

    results = glue_metric.compute(predictions=predictions, references=labels)

    return {'cola_acc': result_acc['exact_match'], 'cola_matthews_corr': results['matthews_correlation']}

def cola_map_fc(examples):
    """Turn a batch of CoLA rows into text-to-text examples.

    Rows labelled -1 are dropped; labels 0 and 1 become the two class strings.
    Returns a dict with 'text' (prompts) and 'labels' (targets ending with EOS).
    """
    texts, targets = [], []

    batch_size = len(examples[list(examples.keys())[0]])
    for row in range(batch_size):
        prompt = f'cola sentença: {examples["sentence"][row]}'

        raw_label = examples['label'][row]
        if raw_label == -1:
            continue
        if raw_label == 0:
            label = 'Não é gramaticalmente aceitável'
        elif raw_label == 1:
            label = 'Gramaticalmente correto'

        if model_type == 'decoder' and inserir_beginoftext_token:
            prompt += target_bos_token
        elif model_type == 'decoder':
            label = ' classe: ' + label

        label += tokenizer.eos_token

        texts.append(prompt)
        targets.append(label)

    return {'text': texts, 'labels': targets}

import datasets

# Load the CoLA configuration of PLUE
ds['cola'] = datasets.load_dataset("dlb/plue", "cola", cache_dir="./cache")

# Register the benchmark: identifier, metric function and mapping function
identificador_de_dataset['cola'] = criar_identificador()
metric_functions['cola'], map_functions['cola'] = cola_metric, cola_map_fc
ds['cola']

### MNLI

Matched
Train: 392702 exemplos, Validation: 9815 exemplos

Mismatched
Validation: 9832 exemplos

In [None]:
# from evaluate import load

# def mnli_matched_metric(predictions,labels):
#     exact_match_metric = load("exact_match")
#     result = exact_match_metric.compute(predictions=predictions,references=labels)

#     return {'mnli_matched_acc': result['exact_match']}

# def mnli_mismatched_metric(predictions,labels):
#     exact_match_metric = load("exact_match")
#     result = exact_match_metric.compute(predictions=predictions,references=labels)

#     return {'mnli_mismatched_acc': result['exact_match']}

# def mnli_map_fc(examples):
#     new_examples = { 'text':[], 'labels':[]}
   
#     first_key=list(examples.keys())[0]
#     for i in range(0,len(examples[first_key])):
#         input=f'smnli Premissa: {examples["premise"][i]}'
#         input+=f' Hipótese: {examples["hypothesis"][i]}'
#         if examples['label'][i] == 0:
#             label = 'implicação lógica'
#         elif examples['label'][i] == 1:
#             label = 'neutro'
#         elif examples['label'][i] == 2:
#             label = 'contradição'
#         elif examples['label'][i] == -1:
#             # Este caso é o set de teste
#             continue

#         if model_type=='decoder':
#             if inserir_beginoftext_token:
#                 input += target_bos_token
#             else:
#                 label = ' classe: ' + label

#         label += tokenizer.eos_token

#         new_examples['text'].append(input)
#         new_examples['labels'].append(label)

#     return new_examples

# import datasets
# ds_mlni = datasets.load_dataset("dlb/plue","mnli",cache_dir="./cache")

# ds['mnli_matched'] = datasets.DatasetDict({ 'train' : ds_mlni['train'], 'validation' : ds_mlni['validation_matched'],'test': ds_mlni['test_matched']})
# ds['mnli_mismatched'] = datasets.DatasetDict({ 'validation' : ds_mlni['validation_mismatched'],'test': ds_mlni['test_mismatched']})
# del ds_mlni

# identificador_de_dataset['mnli_matched']=criar_identificador()
# metric_functions['mnli_matched'] = mnli_matched_metric
# map_functions['mnli_matched'] = mnli_map_fc

# identificador_de_dataset['mnli_mismatched']=criar_identificador()
# metric_functions['mnli_mismatched'] = mnli_mismatched_metric
# map_functions['mnli_mismatched'] = mnli_map_fc
# print('mnli_matched')
# print(ds['mnli_matched'])
# print('mnli_mismatched')
# print(ds['mnli_mismatched'])

### MRPC - Microsoft Research Paraphrase Corpus

Microsoft Research Paraphrase Corpus (MRPC) is a corpus consisting of sentence pairs collected from newswire articles. Each pair is labelled by human annotators as being a paraphrase or not.

https://paperswithcode.com/dataset/mrpc

Train: 3668 exemplos, Validation: 408 exemplos

In [None]:
from evaluate import load

def mrpc_metric(predictions,labels):
    """Return the exact-match score between predictions and labels as 'mrpc_acc'."""
    scores = load("exact_match").compute(predictions=predictions,references=labels)
    return {'mrpc_acc': scores['exact_match']}

def mrpc_map_fc(examples):
    """Turn a batch of MRPC rows into text-to-text examples.

    Rows labelled -1 (the test set) are dropped; labels 0 and 1 become the two
    class strings. Returns a dict with 'text' and 'labels' (targets ending with EOS).
    """
    texts, targets = [], []

    batch_size = len(examples[list(examples.keys())[0]])
    for row in range(batch_size):
        prompt = f'mrpc As sentenças seguintes são similares ou diferentes? sentença 1: {examples["sentence1"][row]}'
        prompt += f' sentença 2: {examples["sentence2"][row]}'

        raw_label = examples['label'][row]
        if raw_label == -1:
            # Test-set rows carry no label
            continue
        if raw_label == 0:
            label = 'diferentes'
        elif raw_label == 1:
            label = 'similares'

        # Unlike the other mappers, no textual prefix is added to the label when
        # the separator token is disabled
        if model_type == 'decoder' and inserir_beginoftext_token:
            prompt += target_bos_token
        label += tokenizer.eos_token

        texts.append(prompt)
        targets.append(label)

    return {'text': texts, 'labels': targets}

import datasets

# Load the MRPC configuration of PLUE
ds['mrpc'] = datasets.load_dataset("dlb/plue", "mrpc", cache_dir="./cache")

# Register the benchmark: identifier, metric function and mapping function
identificador_de_dataset['mrpc'] = criar_identificador()
metric_functions['mrpc'], map_functions['mrpc'] = mrpc_metric, mrpc_map_fc
ds['mrpc']

### QNLI_v2 - Question-answering NLI

The QNLI (Question-answering NLI) dataset is a Natural Language Inference dataset automatically derived from the Stanford Question Answering Dataset v1.1 (SQuAD). SQuAD v1.1 consists of question-paragraph pairs, where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding question (written by an annotator). The dataset was converted into sentence pair classification by forming a pair between each question and each sentence in the corresponding context, and filtering out pairs with low lexical overlap between the question and the context sentence. The task is to determine whether the context sentence contains the answer to the question. This modified version of the original task removes the requirement that the model select the exact answer, but also removes the simplifying assumptions that the answer is always present in the input and that lexical overlap is a reliable cue. The QNLI dataset is part of the GLUE benchmark.

https://paperswithcode.com/dataset/qnli

Train: 104743 exemplos, Validation: 5463 exemplos

In [None]:
# from evaluate import load

# def qnli_v2_metric(predictions,labels):
#     exact_match_metric = load("exact_match")
#     result = exact_match_metric.compute(predictions=predictions,references=labels)

#     return {'qnli_v2_acc': result['exact_match']}

# def qnli_v2_map_fc(examples):
#     new_examples = { 'text':[], 'labels':[]}
   
#     first_key=list(examples.keys())[0]
#     for i in range(0,len(examples[first_key])):
#         input=f'qnli Pergunta: {examples["question"][i]}'
#         input+=f' Resposta: {examples["sentence"][i]}'
#         if examples['label'][i] == 0:
#             label = 'Implicação'
#         elif examples['label'][i] == 1:
#             label = 'Não relacionadas'
#         elif examples['label'][i] == -1:
#             # Este caso é o set de teste
#             continue

#         if model_type=='decoder':
#             if inserir_beginoftext_token:
#                 input += target_bos_token
#             else:
#                 label = ' classe: ' + label
#         label += tokenizer.eos_token

#         new_examples['text'].append(input)
#         new_examples['labels'].append(label)

#     return new_examples

# import datasets
# ds['qnli_v2'] = datasets.load_dataset("dlb/plue","qnli_v2",cache_dir="./cache")

# identificador_de_dataset['qnli_v2']=criar_identificador()
# metric_functions['qnli_v2'] = qnli_v2_metric
# map_functions['qnli_v2'] = qnli_v2_map_fc
# ds['qnli_v2']

### QQP_v2 - Quora Question Pairs
Quora Question Pairs (QQP) dataset consists of over 400,000 question pairs, and each question pair is annotated with a binary value indicating whether the two questions are paraphrases of each other.

Train: 363846 exemplos, Validation: 40430 exemplos

https://paperswithcode.com/dataset/quora-question-pairs

In [None]:
# from evaluate import load

# def qqp_metric(predictions,labels):
#     exact_match_metric = load("exact_match")
#     result = exact_match_metric.compute(predictions=predictions,references=labels)

#     return {'qqp_acc': result['exact_match']}

# def qqp_map_fc(examples):
#     new_examples = { 'text':[], 'labels':[]}
   
#     first_key=list(examples.keys())[0]
#     for i in range(0,len(examples[first_key])):
#         input=f'qqp Pergunta 1: {examples["question1"][i]}'
#         input+=f' Pergunta 2: {examples["question2"][i]}'
#         if examples['label'][i] == 1:
#             label = 'Não são perguntas duplicadas'
#         elif examples['label'][i] == 0:
#             label = 'São perguntas equivalentes'
#         elif examples['label'][i] == -1:
#             # Este caso é o set de teste
#             continue

#         if model_type=='decoder':
#             if inserir_beginoftext_token:
#                 input += target_bos_token
#             else:
#                 label = ' classe: ' + label
#         label += tokenizer.eos_token

#         new_examples['text'].append(input)
#         new_examples['labels'].append(label)

#     return new_examples

# import datasets
# ds['qqp'] = datasets.load_dataset("dlb/plue","qqp_v2",cache_dir="./cache")

# identificador_de_dataset['qqp']=criar_identificador()
# metric_functions['qqp'] = qqp_metric
# map_functions['qqp'] = qqp_map_fc
# ds['qqp']

### RTE

Train: 2490 exemplos, Validation: 277 exemplos

In [None]:
from evaluate import load

def rte_metric(predictions,labels):
    """Return the exact-match score between predictions and labels as 'rte_acc'."""
    scores = load("exact_match").compute(predictions=predictions,references=labels)
    return {'rte_acc': scores['exact_match']}

def rte_map_fc(examples):
    """Turn a batch of RTE rows into text-to-text examples.

    Rows labelled -1 (the test set) are dropped; labels 0 and 1 become the two
    class strings. Returns a dict with 'text' and 'labels' (targets ending with EOS).
    """
    texts, targets = [], []

    batch_size = len(examples[list(examples.keys())[0]])
    for row in range(batch_size):
        prompt = f'rte sentença 1: {examples["sentence1"][row]}'
        prompt += f' sentença 2: {examples["sentence2"][row]}'

        raw_label = examples['label'][row]
        if raw_label == -1:
            # Test-set rows carry no label
            continue
        if raw_label == 0:
            label = 'É implicação lógica'
        elif raw_label == 1:
            label = 'Não é implicação lógica'

        if model_type == 'decoder' and inserir_beginoftext_token:
            prompt += target_bos_token
        elif model_type == 'decoder':
            label = ' conclusão: ' + label
        label += tokenizer.eos_token

        texts.append(prompt)
        targets.append(label)

    return {'text': texts, 'labels': targets}

import datasets

# Load the RTE configuration of PLUE
ds['rte'] = datasets.load_dataset("dlb/plue", "rte", cache_dir="./cache")

# Register the benchmark: identifier, metric function and mapping function
identificador_de_dataset['rte'] = criar_identificador()
metric_functions['rte'], map_functions['rte'] = rte_metric, rte_map_fc
ds['rte']

### SCITAIL

The SciTail dataset is an entailment dataset created from multiple-choice science exams and web sentences. Each question and the correct answer choice are converted into an assertive statement to form the hypothesis.

https://allenai.org/data/scitail

Train: 23596 exemplos, Validation: 1304 exemplos

In [None]:
# from evaluate import load

# def scitail_metric(predictions,labels):
#     exact_match_metric = load("exact_match")
#     result = exact_match_metric.compute(predictions=predictions,references=labels)

#     return {'scitail_acc': result['exact_match']}

# def scitail_map_fc(examples):
#     new_examples = { 'text':[], 'labels':[]}
   
#     first_key=list(examples.keys())[0]
#     for i in range(0,len(examples[first_key])):
#         input=f'scitail premissa: {examples["premise"][i]}'
#         input+=f' hipótese: {examples["hypothesis"][i]}'
#         if examples['label'][i] == 1:
#             label = 'neutro'
#         elif examples['label'][i] == 0:
#             label = 'implicação lógica'
#         else:
#             continue        

#         if model_type=='decoder':
#             if inserir_beginoftext_token:
#                 input += target_bos_token
#             else:
#                 label = ' conclusão: ' + label
#         label += tokenizer.eos_token  

#         new_examples['text'].append(input)
#         new_examples['labels'].append(label)

#     return new_examples

# import datasets
# ds['scitail'] = datasets.load_dataset("dlb/plue","scitail",cache_dir="./cache")

# identificador_de_dataset['scitail']=criar_identificador()
# metric_functions['scitail'] = scitail_metric
# map_functions['scitail'] = scitail_map_fc
# ds['scitail']

### SNLI

Train: 510711 exemplos, Validation: 9831 exemplos

In [None]:
# from evaluate import load

# def snli_metric(predictions,labels):
#     exact_match_metric = load("exact_match")
#     result = exact_match_metric.compute(predictions=predictions,references=labels)

#     return {'snli_acc': result['exact_match']}

# def snli_map_fc(examples):
#     new_examples = { 'text':[], 'labels':[]}
   
#     first_key=list(examples.keys())[0]
#     for i in range(0,len(examples[first_key])):
#         input=f'smnli Premissa: {examples["premise"][i]}'
#         input+=f' Hipótese: {examples["hypothesis"][i]}'
#         if examples['label'][i] == 0:
#             label = 'implicação lógica'
#         elif examples['label'][i] == 1:
#             label = 'neutro'
#         elif examples['label'][i] == 2:
#             label = 'contradição'              

#         if model_type=='decoder':
#             if inserir_beginoftext_token:
#                 input += target_bos_token
#             else:
#                 label = ' conclusão: ' + label
#         label += tokenizer.eos_token  

#         new_examples['text'].append(input)
#         new_examples['labels'].append(label)

#     return new_examples

# import datasets
# ds['snli'] = datasets.load_dataset("dlb/plue","snli",cache_dir="./cache")

# identificador_de_dataset['snli']=criar_identificador()
# metric_functions['snli'] = snli_metric
# map_functions['snli'] = snli_map_fc
# ds['snli']

### SST2

Train: 67349 exemplos, Validation: 872 exemplos

In [None]:
# from evaluate import load

# def sst2_metric(predictions,labels):
#     exact_match_metric = load("exact_match")
#     result = exact_match_metric.compute(predictions=predictions,references=labels)

#     return {'sst2_acc': result['exact_match']}

# def sst2_map_fc(examples):
#     new_examples = { 'text':[], 'labels':[]}
   
#     first_key=list(examples.keys())[0]
#     for i in range(0,len(examples[first_key])):
#         input=f'sst2 sentença: {examples["sentence"][i]}'

#         if examples['label'][i] == 1:
#             label = 'positivo'
#         elif examples['label'][i] == 0:
#             label = 'negativo'
#         else:
#             continue

#         if model_type=='decoder':
#             if inserir_beginoftext_token:
#                 input += target_bos_token
#             else:
#                 label = ' sentimento da frase: ' + label
#         label += tokenizer.eos_token

#         new_examples['text'].append(input)
#         new_examples['labels'].append(label)

#     return new_examples

# import datasets
# ds['sst2'] = datasets.load_dataset("dlb/plue","sst2",cache_dir="./cache")

# identificador_de_dataset['sst2']=criar_identificador()
# metric_functions['sst2'] = sst2_metric
# map_functions['sst2'] = sst2_map_fc
# ds['sst2']

### STSB

Train: 5749 exemplos, Validation: 1379 exemplos

In [None]:
import re
from evaluate import load

def stsb_metric(predictions,labels):
    """Compute Pearson, Spearman and MSE for STS-B from decoded text.

    The first number found in each prediction string is taken as the predicted
    score and clipped to the [0, 5] interval. When no number can be extracted
    (or either value cannot be converted to float), the pair is scored as
    prediction 0 against reference 5 so that the failure is penalized.

    Args:
        predictions: list of decoded model outputs (strings).
        labels: list of decoded reference scores (strings).

    Returns:
        dict with the keys 'stsb_pearson', 'stsb_spearmanr' and 'stsb_mse'.
    """
    glue_metric = load('glue', 'stsb')

    # Build fresh lists instead of overwriting the caller's lists in place, so
    # the function can be called again on the same inputs.
    pred_scores = []
    ref_scores = []
    for prediction, label in zip(predictions, labels):
        # Look for numbers in the generated string
        pred_numbers = re.findall(r"[-+]?(?:\d*\.*\d+)", prediction)

        try:
            pred_score = float(pred_numbers[0])
            ref_score = float(label)
        except (IndexError, TypeError, ValueError):
            # No number found, or a value that cannot be parsed: assign the
            # minimum prediction and maximum reference to penalize the error
            pred_score = float(0)
            ref_score = float(5)

        # Clip the prediction to the valid score range
        pred_scores.append(min(max(pred_score, 0), 5))
        ref_scores.append(ref_score)

    result = glue_metric.compute(predictions=pred_scores, references=ref_scores)

    mse_metric = load("mse")
    mse_results = mse_metric.compute(predictions=pred_scores, references=ref_scores)

    return {'stsb_pearson': result['pearson'], 'stsb_spearmanr': result['spearmanr'], 'stsb_mse': mse_results['mse']}

def stsb_map_fc(examples):
    """Convert a batch of STS-B rows into text-to-text examples.

    Args:
        examples: batch with the columns 'sentence1', 'sentence2' and 'label'.

    Returns:
        dict with the lists 'text' (prompt) and 'labels' (target string ending
        with the tokenizer's EOS token). Rows whose label equals -1 are skipped.
    """
    new_examples = { 'text':[], 'labels':[]}

    rows = zip(examples["sentence1"], examples["sentence2"], examples['label'])
    for sentence1, sentence2, raw_label in rows:
        # The label may arrive as a number or as a string (e.g. '-1.0'), so the
        # "no label" marker is compared numerically rather than with `== -1`.
        try:
            is_unlabeled = float(raw_label) == -1
        except (TypeError, ValueError):
            is_unlabeled = False
        if is_unlabeled:
            continue

        input=f'stsb sentença 1: {sentence1}'
        input+=f' sentença 2: {sentence2}'

        label = ' ' + str(raw_label)

        if model_type=='decoder':
            if inserir_beginoftext_token:
                # A dedicated token separates the prompt from the answer
                input += target_bos_token
            else:
                # Otherwise a textual cue introduces the answer
                label = ' pontuação de similaridade: ' + label
        label += tokenizer.eos_token

        new_examples['text'].append(input)
        new_examples['labels'].append(label)

    return new_examples

import datasets
# Load the "stsb" configuration of the "dlb/plue" dataset, caching it under ./cache
ds['stsb'] = datasets.load_dataset("dlb/plue","stsb",cache_dir="./cache")

# Register the dataset identifier, the metric and the map function for this task
identificador_de_dataset['stsb']=criar_identificador()
metric_functions['stsb'] = stsb_metric
map_functions['stsb'] = stsb_map_fc

# Cast the 'label' column to string, reusing the feature schema of the train split
new_features = ds['stsb']['train'].features.copy()
new_features['label'] = datasets.Value(dtype='string', id=None)
ds['stsb'] = ds['stsb'].cast(new_features)

ds['stsb']

### WNLI

Train: 635 exemplos, Validation: 71 exemplos

In [None]:
from evaluate import load

def wnli_metric(predictions,labels):
    """Score WNLI with the `exact_match` metric.

    Args:
        predictions: list of decoded model outputs.
        labels: list of decoded reference strings.

    Returns:
        dict with the single key 'wnli_acc'.
    """
    scores = load("exact_match").compute(references=labels, predictions=predictions)
    return {'wnli_acc': scores['exact_match']}

def wnli_map_fc(examples):
    """Convert a batch of WNLI rows into text-to-text examples.

    Args:
        examples: batch with the columns 'sentence1', 'sentence2' and 'label'.

    Returns:
        dict with the lists 'text' (prompt) and 'labels' (target string ending
        with the tokenizer's EOS token). Rows whose label is not 0 or 1 are
        skipped; this covers the -1 label used by the test split.
    """
    label_names = {0: 'Não é implicação lógica', 1: 'Implicação lógica'}
    new_examples = { 'text':[], 'labels':[]}

    rows = zip(examples["sentence1"], examples["sentence2"], examples['label'])
    for sentence1, sentence2, raw_label in rows:
        label = label_names.get(raw_label)
        if label is None:
            # -1 marks the test split; any other unexpected value is skipped as
            # well instead of silently reusing the previous row's label.
            continue

        input=f'wnli sentença 1: {sentence1}'
        input+=f' sentença 2: {sentence2}'

        if model_type=='decoder':
            if inserir_beginoftext_token:
                # A dedicated token separates the prompt from the answer
                input += target_bos_token
            else:
                # Otherwise a textual cue introduces the answer
                label = ' classe: ' + label
        label += tokenizer.eos_token

        new_examples['text'].append(input)
        new_examples['labels'].append(label)

    return new_examples

import datasets
# Load the "wnli" configuration of the "dlb/plue" dataset, caching it under ./cache
ds['wnli'] = datasets.load_dataset("dlb/plue","wnli",cache_dir="./cache")

# Register the dataset identifier, the metric and the map function for this task
identificador_de_dataset['wnli']=criar_identificador()
metric_functions['wnli'] = wnli_metric
map_functions['wnli'] = wnli_map_fc
ds['wnli']

## Sumarização

#### WIKI LINGUA

sumarização de textos

https://huggingface.co/datasets/wiki_lingua

Train: 25328 exemplos, Validation: 2815 exemplos

In [None]:
# from evaluate import load

# def wiki_lingua_metric(predictions,labels):
#     rouge_metric = load('rouge')
#     result = rouge_metric.compute(predictions=predictions,references=labels)

#     new_result = {}
#     for key in result:
#         new_result['wiki_lingua_' + key] = result[key]
#     result = new_result

#     return result

# def wiki_lingua_map_fc(examples):
#     new_examples = { 'text':[], 'labels':[]}
   
#     first_key=list(examples.keys())[0]
#     for i in range(0,len(examples[first_key])):

#         for j in range(0,len(examples['article'][i]['document'])):
#             titulo = examples['article'][i]['section_name'][j]
#             documento = examples['article'][i]['document'][j]
#             resumo = examples['article'][i]['summary'][j]

#             input=f'Resuma o texto: {titulo}. {documento}'

#             label = resumo

#             if model_type=='decoder':
#                 if inserir_beginoftext_token:
#                     input += target_bos_token
#                 else:
#                     label = ' Resumo: ' + label
#             label += tokenizer.eos_token

#             new_examples['text'].append(input)
#             new_examples['labels'].append(label)

#     return new_examples

# import datasets
# ds['wiki_lingua'] = datasets.load_dataset('wiki_lingua','portuguese')

# identificador_de_dataset['wiki_lingua']=criar_identificador()
# metric_functions['wiki_lingua'] = wiki_lingua_metric
# map_functions['wiki_lingua'] = wiki_lingua_map_fc

# ds['wiki_lingua'] = ds['wiki_lingua']['train'].train_test_split(test_size=0.1,seed=42)
# ds['wiki_lingua'] = datasets.DatasetDict({ 'train': ds['wiki_lingua']['train'], 'validation':ds['wiki_lingua']['test']})
# ds['wiki_lingua']

### XLSUM

https://huggingface.co/datasets/csebuetnlp/xlsum

Train: 57402 exemplos, Validation: 7175 exemplos

In [None]:
# from evaluate import load

# def xlsum_metric(predictions,labels):
#     rouge_metric = load('rouge')
#     result = rouge_metric.compute(predictions=predictions,references=labels)

#     new_result = {}
#     for key in result:
#         new_result['xlsum_' + key] = result[key]
#     result = new_result

#     return result

# def xlsum_map_fc(examples):
#     new_examples = { 'text':[], 'labels':[]}
   
#     first_key=list(examples.keys())[0]
#     for i in range(0,len(examples[first_key])):

#         input=f'Resuma o texto: {examples["title"][i]}. {examples["text"][i]}'

#         label = examples['summary'][i]

#         if model_type=='decoder':
#             if inserir_beginoftext_token:
#                 input += target_bos_token
#             else:
#                 label = ' Resumo: ' + label
#         label += tokenizer.eos_token

#         new_examples['text'].append(input)
#         new_examples['labels'].append(label)

#     return new_examples

# import datasets
# ds['xlsum'] = datasets.load_dataset('csebuetnlp/xlsum','portuguese')

# identificador_de_dataset['xlsum']=criar_identificador()
# metric_functions['xlsum'] = xlsum_metric
# map_functions['xlsum'] = xlsum_map_fc
# ds['xlsum']

# Pré-processar o dataset

## Aplica a função map individual de cada dataset

In [None]:
import datasets

# Collect the processed datasets in a separate dict. Aliasing `ds` would
# overwrite the raw data, and this cell could not be re-run because the map
# functions expect the original columns.
ds_processado = {}

for key in ds.keys():

    if type(map_functions[key])==dict:
        # The squad dataset is peculiar in having several possible answers.
        # Here `map_functions[key]` holds one map function per split; splits
        # without a map function are carried over unchanged.
        processed = datasets.DatasetDict(dict(ds[key]))
        for split in map_functions[key]:
            processed[split] = ds[key][split].map(
                map_functions[key][split],
                batched=True,
                batch_size=1_000,
                remove_columns=ds[key][split].column_names,
                num_proc=1
            )
    else:
        # A single map function is applied to every split; the original
        # columns (taken from the first split) are dropped.
        processed = ds[key].map(
            map_functions[key],
            batched=True,
            batch_size=1_000,
            remove_columns=ds[key][list(ds[key].keys())[0]].column_names,
            num_proc=1
        )

    # Drop the splits left without examples (e.g., the 'test' split of wnli)
    ds_processado[key] = datasets.DatasetDict({
        split: processed[split]
        for split in processed
        if len(processed[split]) > 0
    })

ds_processado

## Insere um prefixo prefix_input ao início dos inputs_ids, caso tenha sido definido

In [None]:
if 'prefix_input' in locals() and prefix_input is not None and len(prefix_input) > 0:
    def tokenize_dataset(examples):
        """Prepend `prefix_input` to every entry of the batch's 'text' column.

        Used for UL2-style models, which expect prefixes such as <|NLU|> or
        <|NLG|> at the start of the input.
        """
        examples['text'] = [prefix_input + text for text in examples['text']]
        return examples

    for key in ds_processado.keys():
        print(key)
        ds_processado[key] = ds_processado[key].map(
            tokenize_dataset, batched=True, batch_size=1_000, num_proc=2
        )

In [None]:
# Show the first two examples of the first split of every dataset
for key in ds_processado:
    first_split = list(ds_processado[key].keys())[0]
    print('dataset',key)
    for example_index in (0, 1):
        print(ds_processado[key][first_split][example_index])

## Tokeniza o dataset

In [None]:
def tokenize_dataset(examples):
    """Tokenize the 'text' and 'labels' columns of a batch.

    Both columns are truncated to `model.config.n_positions` tokens. Every
    label sequence is guaranteed to end with the tokenizer's EOS token id; an
    empty label becomes a sequence containing only the EOS token id.

    Args:
        examples: batch with the string columns 'text' and 'labels'.

    Returns:
        The same batch with 'input_ids' added and 'labels' replaced by token ids.
    """
    max_length = model.config.n_positions
    eos_token_id = tokenizer.eos_token_id

    examples['input_ids'] = tokenizer(
        examples['text'],
        return_attention_mask=False,
        truncation=True,
        max_length=max_length,
    )['input_ids']

    label_ids = tokenizer(
        examples['labels'],
        return_attention_mask=False,
        truncation=True,
        max_length=max_length,
    )['input_ids']

    # Append the eos_token_id when it was not inserted before. The explicit
    # emptiness check replaces a bare `except:` that handled empty labels.
    examples['labels'] = [
        ids if ids and ids[-1] == eos_token_id else list(ids) + [eos_token_id]
        for ids in label_ids
    ]

    return examples

# Tokenize every dataset and drop the raw 'text' column afterwards
for key in ds_processado.keys():
    print(key)
    ds_tokenizado[key] = ds_processado[key].map(
        tokenize_dataset, batched=True, batch_size=1_000, num_proc=2
    ).remove_columns("text")

## Insere um token inicial identificando qual é o dataset nos dados de validação para o cômputo das métricas

In [None]:
def inserir_identificador(examples):
    """Prefix every label sequence of the batch with the dataset identifier.

    The identifier is read from `identificador_de_dataset[dataset_name]`, where
    `dataset_name` is a global set by the caller. For decoder models the length
    of the corresponding 'input_ids' is inserted right after the identifier, so
    that the input part can be removed later.
    """
    if model_type == 'encoder-decoder':
        examples['labels'] = [
            [identificador_de_dataset[dataset_name]] + label_ids
            for label_ids in examples['labels']
        ]
    elif model_type == 'decoder':
        # In the decoder both the dataset identifier and the input length are
        # inserted at the beginning
        examples['labels'] = [
            [identificador_de_dataset[dataset_name]] + [len(input_ids)] + label_ids
            for input_ids, label_ids in zip(examples['input_ids'], examples['labels'])
        ]

    return examples

In [None]:
for key in ds_tokenizado.keys():
    # `inserir_identificador` reads this global to pick the identifier
    dataset_name = key

    # 'test' is processed first and only when present; 'validation' is required
    splits_to_tag = ['validation']
    if 'test' in ds_tokenizado[key]:
        splits_to_tag.insert(0, 'test')

    for split in splits_to_tag:
        ds_tokenizado[key][split] = ds_tokenizado[key][split].map(
            inserir_identificador, batched=True, batch_size=1_000, num_proc=1
        )

## Concatena os diversos datasets

In [None]:
# Group the tokenized splits of every dataset by split name
concat={ 'train':[],'validation':[], 'test':[]}

for key in ds_tokenizado:
    for split in ds_tokenizado[key].keys():
        # setdefault tolerates split names other than the three listed above
        concat.setdefault(split, []).append(ds_tokenizado[key][split])

# Concatenate the datasets and randomize the order
ds_final = datasets.DatasetDict({
    'train': datasets.concatenate_datasets(concat['train']),
    'validation': datasets.concatenate_datasets(concat['validation']),
})

# `shuffle` returns a new DatasetDict instead of shuffling in place, so the
# result has to be assigned; the seed keeps the order reproducible.
ds_final = ds_final.shuffle(seed=42)
ds_final

# Cria a métrica para avaliar os datasets

## Métrica do encoder-decoder

In [None]:
import numpy as np

# Identifica qual o dataset
def retorna_chave_do_dicionario(dictionary, search_value):
    """Return the first key of `dictionary` whose value equals `search_value`.

    Returns None when no value matches.
    """
    return next(
        (candidate for candidate, stored in dictionary.items() if stored == search_value),
        None,
    )

# Cálculo das métricas por dataset
def compute_metrics_encoder_decoder(eval_pred):
    """Compute the per-dataset metrics for encoder-decoder models.

    The first token of every label row is looked up in
    `identificador_de_dataset` to find which dataset the example belongs to.
    It is then removed, and labels and predictions are decoded and handed to
    the matching function in `metric_functions`.

    Args:
        eval_pred: pair (predictions, labels) of token-id arrays.

    Returns:
        dict merging the results of every dataset that has at least one example.

    Raises:
        KeyError: when the first label token matches no registered dataset.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Replace -100 as it cannot be decoded. This is done for the predictions
    # too, since they may have been padded with -100.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)

    # Identify the dataset of every example and remove the identifier
    dataset_names = []
    label_ids = []
    for row in labels:
        name = retorna_chave_do_dicionario(identificador_de_dataset, row[0])
        if name is None:
            raise KeyError(f"Unknown dataset identifier in labels: {row[0]}")
        dataset_names.append(name)
        label_ids.append(row[1:])

    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    decoded_preds = tokenizer.batch_decode(list(predictions), skip_special_tokens=True)

    # Group the decoded texts by dataset
    decoded_por_dataset = {
        key: {'decoded_labels': [], 'decoded_preds': []}
        for key in identificador_de_dataset
    }
    for name, decoded_label, decoded_pred in zip(dataset_names, decoded_labels, decoded_preds):
        decoded_por_dataset[name]['decoded_labels'].append(decoded_label)
        decoded_por_dataset[name]['decoded_preds'].append(decoded_pred)

    result = {}
    for key, decoded in decoded_por_dataset.items():
        # A metric cannot be computed for a dataset without examples
        if len(decoded['decoded_labels']) == 0:
            continue
        result.update(metric_functions[key](decoded['decoded_preds'], decoded['decoded_labels']))

    return result

## Métrica do decoder

In [None]:
import numpy as np

def retorna_chave_do_dicionario(dictionary, search_value):
    """Look up the first key in `dictionary` that maps to `search_value`.

    Returns None when the value is not present.
    """
    matching_keys = (name for name in dictionary if dictionary[name] == search_value)
    for name in matching_keys:
        return name
    return None

def compute_metrics_decoder(eval_pred):
    """Compute the per-dataset metrics for decoder-only models.

    Every row is first stripped of -100, pad and EOS token ids. In the labels,
    the first remaining token is looked up in `identificador_de_dataset` and
    the second one is used as the number of leading prediction tokens to drop
    (the part of the generation that repeats the input). The remaining tokens
    are decoded and handed to the matching function in `metric_functions`.

    Args:
        eval_pred: pair (predictions, labels) of token-id sequences.

    Returns:
        dict merging the results of every dataset that has at least one example.

    Raises:
        KeyError: when the first label token matches no registered dataset.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Token ids dropped from both labels and predictions before decoding
    ignored_ids = {-100, tokenizer.pad_token_id, tokenizer.eos_token_id}

    dataset_names = []
    label_ids = []
    pred_ids = []
    for label_row, pred_row in zip(labels, predictions):
        label_row = [token for token in label_row if token not in ignored_ids]
        pred_row = [token for token in pred_row if token not in ignored_ids]

        # Identify the dataset and remove the identifier
        name = retorna_chave_do_dicionario(identificador_de_dataset, label_row[0])
        if name is None:
            raise KeyError(f"Unknown dataset identifier in labels: {label_row[0]}")
        dataset_names.append(name)

        # Remove the beginning of the prediction that corresponds to the input
        length_input = label_row[1]
        label_ids.append(label_row[2:])
        pred_ids.append(pred_row[length_input:])

    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # Group the decoded texts by dataset
    decoded_por_dataset = {
        key: {'decoded_labels': [], 'decoded_preds': []}
        for key in identificador_de_dataset
    }
    for name, decoded_label, decoded_pred in zip(dataset_names, decoded_labels, decoded_preds):
        decoded_por_dataset[name]['decoded_labels'].append(decoded_label)
        decoded_por_dataset[name]['decoded_preds'].append(decoded_pred)

    result = {}
    for key, decoded in decoded_por_dataset.items():
        # A metric cannot be computed for a dataset without examples
        if len(decoded['decoded_labels']) == 0:
            continue
        result.update(metric_functions[key](decoded['decoded_preds'], decoded['decoded_labels']))

    return result

# DataCollator para Decoders (Causal Language Modeling)

Enquanto os modelos encoder-decoder possuem o input e os outputs separados, nos modelos decoder, só há um vetor de texto.

No fine-tuning dos modelos decoder para classificação, o modelo é treinado para prever apenas o texto do label. Para isso, os tokens do input recebem o label -100, indicando que o *loss* não deve ser calculado para esses tokens.

In [None]:
import random
import warnings
from collections.abc import Mapping
from dataclasses import dataclass
from random import randint
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

import numpy as np

from transformers.models.bert import BertTokenizer, BertTokenizerFast
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy

import torch
import transformers.data.data_collator
from transformers.data.data_collator import _torch_collate_batch


class DataCollatorMixin:
    """Dispatch a collator call to the framework-specific implementation."""

    def __call__(self, features, return_tensors=None):
        """Call `tf_call`, `torch_call` or `numpy_call` according to the framework.

        Args:
            features: batch of features forwarded to the chosen method.
            return_tensors: "tf", "pt" or "np"; defaults to `self.return_tensors`.

        Raises:
            ValueError: when the framework name is not one of the three above.
        """
        framework = return_tensors if return_tensors is not None else self.return_tensors

        handlers = (("tf", "tf_call"), ("pt", "torch_call"), ("np", "numpy_call"))
        for framework_name, method_name in handlers:
            if framework == framework_name:
                # The method is only looked up for the selected framework
                return getattr(self, method_name)(features)

        raise ValueError(f"Framework '{framework}' not recognized!")

@dataclass
class DataCollatorWithPaddingModified:
    """
    Data collator for decoder-only models that dynamically pads inputs and labels.

    A batch is treated as validation data when the first label token of its
    first feature is greater than `len(self.tokenizer)`. Validation batches
    keep 'input_ids' and 'labels' separate. Training batches concatenate input
    and label tokens into 'input_ids' and set the labels of the input part to
    -100. Padding positions in the labels are also set to -100.

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        return_tensors (`str`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Build a padded batch with 'input_ids', 'attention_mask' and 'labels'."""
        # Use the first token to identify whether this is the validation
        # dataset. The collator's own tokenizer is used, not a global one.
        is_validation_dataset = (features[0]['labels'][0] > len(self.tokenizer))
        if is_validation_dataset:
            inputs = [feat['input_ids'] for feat in features]
            labels = [feat['labels'] for feat in features]
        else: # training dataset
            inputs = [feat['input_ids'] + feat['labels'] for feat in features]
            # No loss is computed on the input part of the sequence
            labels = [[-100] * len(feat['input_ids']) + feat['labels'] for feat in features]

        # Trick to pad inputs and labels to the same length in a single call
        stacked = {'input_ids' : inputs + labels}

        # Silence the tokenizer while padding; the previous verbosity is
        # restored even when `pad` raises.
        previous_level = transformers.logging.get_verbosity()
        transformers.logging.set_verbosity_error()
        try:
            batch = self.tokenizer.pad(
                stacked,
                padding=self.padding,
                max_length=self.max_length,
                pad_to_multiple_of=self.pad_to_multiple_of,
                return_tensors=self.return_tensors,
            )
        finally:
            transformers.logging.set_verbosity(previous_level)

        # The first half of the padded rows are the inputs, the second half the labels
        half_idx = len(labels)

        batch['labels'] = batch['input_ids'][half_idx:]
        batch['input_ids'] = batch['input_ids'][:half_idx]
        batch['attention_mask'] = batch['attention_mask'][:half_idx]

        # NOTE(review): every label equal to pad_token_id becomes -100; if the
        # tokenizer reuses the EOS id as pad id, EOS labels are masked too - confirm.
        batch['labels'][batch['labels'] == self.tokenizer.pad_token_id] = -100

        return batch

# DataCollator para Encoder-Decoder

A única modificação feita em relação ao código original é silenciar o tokenizador durante o pad:
https://github.com/huggingface/transformers/blob/v4.28.1/src/transformers/data/data_collator.py

In [None]:
import random
import warnings
from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

from transformers.models.bert import BertTokenizer, BertTokenizerFast
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy

@dataclass
class DataCollatorForSeq2SeqModified:
    """
    Data collator that will dynamically pad the inputs received, as well as the labels.
    The tokenizer's log messages are silenced while the batch is being built.
    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        model ([`PreTrainedModel`]):
            The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
            prepare the *decoder_input_ids*
            This is useful when using *label_smoothing* to avoid calculating loss twice.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        label_pad_token_id (`int`, *optional*, defaults to -100):
            The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
        return_tensors (`str`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    tokenizer: PreTrainedTokenizerBase
    model: Optional[Any] = None
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    return_tensors: str = "pt"

    def __call__(self, features, return_tensors=None):
        """Pad labels and inputs and return the batch produced by `tokenizer.pad`.

        The 'labels' entry of every feature is padded in place before the
        tokenizer is called.
        """
        if return_tensors is None:
            return_tensors = self.return_tensors
        labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None

        # Silence the tokenizer; the previous verbosity is restored even when
        # an exception is raised while building the batch.
        previous_level = transformers.logging.get_verbosity()
        transformers.logging.set_verbosity_error()
        try:
            # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
            # same length to return tensors.
            if labels is not None:
                max_label_length = max(len(l) for l in labels)
                if self.pad_to_multiple_of is not None:
                    # Round up to the next multiple of `pad_to_multiple_of`
                    max_label_length = (
                        (max_label_length + self.pad_to_multiple_of - 1)
                        // self.pad_to_multiple_of
                        * self.pad_to_multiple_of
                    )

                padding_side = self.tokenizer.padding_side
                for feature in features:
                    remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
                    if isinstance(feature["labels"], list):
                        feature["labels"] = (
                            feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
                        )
                    elif padding_side == "right":
                        feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
                    else:
                        feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)

            features = self.tokenizer.pad(
                features,
                padding=self.padding,
                max_length=self.max_length,
                pad_to_multiple_of=self.pad_to_multiple_of,
                return_tensors=return_tensors,
            )

            # prepare decoder_input_ids
            if (
                labels is not None
                and self.model is not None
                and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
            ):
                decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=features["labels"])
                features["decoder_input_ids"] = decoder_input_ids
        finally:
            transformers.logging.set_verbosity(previous_level)

        return features

# Treina o modelo

## Ajusta a classe Trainer do Hugging Face

Na classe Trainer, foi alterada a configuração de geração de textos de validação e foram silenciados os avisos emitidos durante a geração.

In [None]:
# https://github.com/huggingface/transformers/blob/v4.26.1/src/transformers/trainer_seq2seq.py
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from torch import nn
from torch.utils.data import Dataset

from transformers.deepspeed import is_deepspeed_zero3_enabled
from transformers.trainer import Trainer
from transformers.trainer_utils import PredictionOutput
from transformers.utils import logging
import transformers


logger = logging.get_logger(__name__)

class Seq2SeqTrainerModified(Trainer):
    """`Trainer` subclass that evaluates and predicts by calling `model.generate()`.

    Adapted from `Seq2SeqTrainer` in transformers v4.26.1. The notebook-specific
    changes all live in `prediction_step`:

    * generation is always limited to `MAX_TOKEN_GENERATION_LENGTH` new tokens
      (any `max_length` setting is discarded) and `eos_token_id` is taken from
      the tokenizer;
    * transformers logging is lowered to ERROR while `generate()` runs and is
      restored afterwards, even when generation fails.
    """

    def _resolve_generation_kwargs(self, gen_kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of `gen_kwargs` completed with the defaults from `self.args`.

        `max_length` falls back to `args.generation_max_length` only when neither
        `max_length` nor `max_new_tokens` was supplied; `num_beams` falls back to
        `args.generation_num_beams` when it is missing or `None`.
        """
        resolved = dict(gen_kwargs)
        if resolved.get("max_length") is None and resolved.get("max_new_tokens") is None:
            resolved["max_length"] = self.args.generation_max_length
        if resolved.get("num_beams") is None:
            resolved["num_beams"] = self.args.generation_num_beams
        return resolved

    def evaluate(
        self,
        eval_dataset: Optional[Dataset] = None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
        **gen_kwargs
    ) -> Dict[str, float]:
        """
        Run evaluation and return the metrics.

        The generation kwargs are stored on the trainer so that `prediction_step` can pick them up, then the
        work is delegated to `Trainer.evaluate`.

        Args:
            eval_dataset (`Dataset`, *optional*):
                Dataset forwarded to `Trainer.evaluate`; `None` is passed through unchanged.
            ignore_keys (`List[str]`, *optional*):
                Keys of the model output forwarded to `Trainer.evaluate` as `ignore_keys`.
            metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
                Prefix forwarded to `Trainer.evaluate` for the metric names.
            gen_kwargs:
                `generate` specific kwargs (e.g. `max_length`, `max_new_tokens`, `num_beams`). Note that
                `prediction_step` always overrides the length limit with `MAX_TOKEN_GENERATION_LENGTH`.

        Returns:
            Whatever `Trainer.evaluate` returns (annotated as a dictionary of metrics).
        """
        self._gen_kwargs = self._resolve_generation_kwargs(gen_kwargs)
        return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)

    def predict(
        self,
        test_dataset: Dataset,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "test",
        **gen_kwargs
    ) -> PredictionOutput:
        """
        Run prediction and return the predictions and potential metrics.

        The generation kwargs are stored on the trainer so that `prediction_step` can pick them up, then the
        work is delegated to `Trainer.predict`.

        Args:
            test_dataset (`Dataset`):
                Dataset forwarded to `Trainer.predict`.
            ignore_keys (`List[str]`, *optional*):
                Keys of the model output forwarded to `Trainer.predict` as `ignore_keys`.
            metric_key_prefix (`str`, *optional*, defaults to `"test"`):
                Prefix forwarded to `Trainer.predict` for the metric names.
            gen_kwargs:
                `generate` specific kwargs (e.g. `max_length`, `max_new_tokens`, `num_beams`). Note that
                `prediction_step` always overrides the length limit with `MAX_TOKEN_GENERATION_LENGTH`.

        Returns:
            Whatever `Trainer.predict` returns (annotated as `PredictionOutput`).
        """
        self._gen_kwargs = self._resolve_generation_kwargs(gen_kwargs)
        return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)

    def prediction_step(
        self,
        model: nn.Module,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform one evaluation step on `model` using `inputs`, generating text with `self.model.generate`.

        Args:
            model (`nn.Module`):
                The model used for the forward pass that computes the loss.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The batch. It is unpacked into `model(**inputs)`; a `labels` entry, when present, enables the
                loss computation and is returned (padded) as the third element.
            prediction_loss_only (`bool`):
                When true (or when `args.predict_with_generate` is false) the step is delegated to
                `Trainer.prediction_step` and nothing is generated.
            ignore_keys (`List[str]`, *optional*):
                Only used when the step is delegated to `Trainer.prediction_step`.

        Return:
            Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: the loss, the generated token
            ids and the labels (loss and labels are `None` when the batch has no `labels`).
        """

        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(
                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
            )

        has_labels = "labels" in inputs
        inputs = self._prepare_inputs(inputs)

        # `_gen_kwargs` is only set by evaluate()/predict(); fall back to an empty dict so a direct call
        # to prediction_step does not fail with AttributeError.
        # XXX: adapt synced_gpus for fairscale as well
        gen_kwargs = dict(getattr(self, "_gen_kwargs", {}))
        if gen_kwargs.get("num_beams") is None:
            gen_kwargs["num_beams"] = self.model.config.num_beams
        if gen_kwargs.get("synced_gpus") is None:
            gen_kwargs["synced_gpus"] = bool(is_deepspeed_zero3_enabled())

        for mask_name in ("attention_mask", "global_attention_mask"):
            if mask_name in inputs:
                gen_kwargs[mask_name] = inputs[mask_name]

        # prepare generation inputs
        # some encoder-decoder models can have varying encoder's and thus
        # varying model input names
        if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name:
            generation_inputs = inputs[self.model.encoder.main_input_name]
        else:
            generation_inputs = inputs[self.model.main_input_name]

        # Notebook-specific change: bound generation by the number of NEW tokens and drop `max_length`.
        # pop() instead of `del`, because the key is absent when the caller supplied `max_new_tokens`.
        gen_kwargs["max_new_tokens"] = MAX_TOKEN_GENERATION_LENGTH
        gen_kwargs.pop("max_length", None)
        gen_kwargs["eos_token_id"] = self.tokenizer.eos_token_id

        # Silence generation warnings, making sure the previous verbosity comes back even on failure.
        previous_level = transformers.logging.get_verbosity()
        transformers.logging.set_verbosity_error()
        try:
            generated_tokens = self.model.generate(
                generation_inputs,
                **gen_kwargs
            )
        finally:
            transformers.logging.set_verbosity(previous_level)

        # `max_length` was removed above, so only `max_new_tokens` can define the padded width.
        max_new_tokens = gen_kwargs["max_new_tokens"]
        pad_to = max_new_tokens + 1 if max_new_tokens is not None else None

        # in case the batch is shorter than the padded width, the output should be padded
        if pad_to is not None and generated_tokens.shape[-1] < pad_to:
            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, pad_to)

        with torch.no_grad():
            if has_labels:
                with self.compute_loss_context_manager():
                    outputs = model(**inputs)
                if self.label_smoother is not None:
                    loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
                else:
                    loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
            else:
                loss = None

        if self.args.prediction_loss_only:
            return (loss, None, None)

        if has_labels:
            labels = inputs["labels"]
            if pad_to is not None and labels.shape[-1] < pad_to:
                labels = self._pad_tensors_to_max_len(labels, pad_to)
        else:
            labels = None

        return (loss, generated_tokens, labels)

    def _pad_tensors_to_max_len(self, tensor, max_length):
        """Right-pad `tensor` along its last dimension up to `max_length`.

        The fill value is the tokenizer's `pad_token_id` (or its `eos_token_id` when no pad token is
        defined); without a usable tokenizer it is `model.config.pad_token_id`.

        Raises:
            ValueError: if there is no usable tokenizer and the model config has no `pad_token_id`.
        """
        if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"):
            # If PAD token is not defined at least EOS token has to be defined
            pad_token_id = (
                self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
            )
        else:
            if self.model.config.pad_token_id is not None:
                pad_token_id = self.model.config.pad_token_id
            else:
                raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors")

        padded_tensor = pad_token_id * torch.ones(
            (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device
        )
        padded_tensor[:, : tensor.shape[-1]] = tensor
        return padded_tensor

## Treina o modelo

In [None]:
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import os


# Pick the data collator, metric function and learning rate that match the architecture.
if model_type == 'decoder':
    data_collator = DataCollatorWithPaddingModified(tokenizer,max_length=model.config.n_positions,pad_to_multiple_of=8,return_tensors='pt')
    compute_metrics = compute_metrics_decoder
    learning_rate = 1e-4
elif model_type == 'encoder-decoder':
    data_collator = DataCollatorForSeq2SeqModified(tokenizer,model=model,max_length=model.config.n_positions,pad_to_multiple_of=8,return_tensors='pt')
    compute_metrics = compute_metrics_encoder_decoder
    learning_rate = 1e-4
else:
    # Fail here with a clear message instead of a NameError on `data_collator` further down.
    raise ValueError(f"model_type must be 'decoder' or 'encoder-decoder', got {model_type!r}")

args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # save_strategy="epoch",
    # save_total_limit=10,
    # load_best_model_at_end=True,

    # Evaluate and log once per epoch.
    evaluation_strategy="epoch",
    eval_steps=1,
    # evaluation_strategy="steps",
    # eval_steps=int(len(ds_final['train'])//5),
    logging_strategy="epoch",
    logging_steps=1,
    predict_with_generate=True,

    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=gradient_accumulation_steps,

    num_train_epochs=epochs,

    lr_scheduler_type="constant",
    learning_rate=learning_rate,
    weight_decay=0.1,

    # Mixed precision needs a CUDA device; fall back to full precision when none is available.
    fp16=torch.cuda.is_available(),
    fp16_full_eval=False,
    dataloader_num_workers=1,
)


trainer = Seq2SeqTrainerModified(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=ds_final["train"],
    eval_dataset=ds_final["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()

# Gera texto com o modelo ajustado (fine-tuned)

In [None]:
import torch
from transformers import pipeline
import pandas as pd

# Collect the first two examples of every available split ('train' is optional,
# 'validation' is always read) from each processed dataset.
texts = []
for key in ds_processado.keys():
    dataset_splits = ds_processado[key]
    split_names = ['train', 'validation'] if 'train' in dataset_splits else ['validation']
    for split_name in split_names:
        for example_index in range(2):
            texts.append(dataset_splits[split_name][example_index]['text'])

# Generate on CPU, with the model explicitly in eval mode so dropout cannot
# make the samples non-deterministic.
model.to('cpu')
model.eval()
pred = []
for text in texts:
    input_ids = tokenizer.encode(text, return_tensors='pt')
    output_ids = model.generate(input_ids, max_new_tokens=20, eos_token_id=tokenizer.eos_token_id)
    pred.append(tokenizer.batch_decode(output_ids))

# Show each prompt next to what the model produced for it.
for text, generated in zip(texts, pred):
    print('input:', text)
    print('generated:', generated)
    print('')

# Desconectar do COLAB

In [None]:
from google.colab import runtime

# Disconnect this notebook from its Colab runtime once everything above has run.
# NOTE(review): presumably this also releases the VM, so any in-memory state
# (model, datasets) is lost; save what you need before running this cell.
runtime.unassign()