In [2]:
!pip install transformers==3.1 --user # Obrigatório mudar a versão para a 3.1
# !pip install pandas --user
import gc
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
def empty_cache():
    """Run the Python garbage collector, then call ``torch.cuda.empty_cache()``."""
    # Order matters: collect first so tensors that just became unreachable
    # are gone before the CUDA cache is emptied.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

In [4]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer



In [5]:
# !pip install wandb
import wandb

In [6]:
project_name = 'pegasus-sum400-summarized'

In [7]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines where CUDA is not available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
class CustomDataset(Dataset):
    """Dataset that tokenizes (``inteiro_teor``, ``ementa``) pairs on access.

    Args:
        dataframe: table with an ``ementa`` column (used as the target text)
            and an ``inteiro_teor`` column (used as the source text).
        tokenizer: object exposing ``batch_encode_plus`` that returns a mapping
            with ``input_ids`` and ``attention_mask`` tensors.
        source_len: ``max_length`` passed to the tokenizer for the source text.
        summ_len: ``max_length`` passed to the tokenizer for the target text.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.ementa = self.data.ementa
        self.inteiro_teor = self.data.inteiro_teor

    def __len__(self):
        """Return the number of rows in the underlying dataframe column."""
        return len(self.ementa)

    def _encode(self, text, max_length):
        """Collapse whitespace runs in ``text`` and tokenize it to ``max_length``."""
        normalized = ' '.join(str(text).split())
        return self.tokenizer.batch_encode_plus(
            [normalized],
            max_length=max_length,
            pad_to_max_length=True,
            return_tensors='pt',
            truncation=True,
        )

    def __getitem__(self, index):
        """Return the tokenized source/target tensors for the row at position ``index``.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (the latter two hold the same values), all
            ``torch.long``.
        """
        # Positional lookup: a DataLoader sampler hands out positions
        # 0..len-1, which only coincide with index labels for a default
        # RangeIndex (e.g. not after filtering or shuffling the dataframe).
        source = self._encode(self.inteiro_teor.iloc[index], self.source_len)
        target = self._encode(self.ementa.iloc[index], self.summ_len)

        # squeeze(0) drops only the batch dimension added by encoding a
        # single-element list, even when max_length is 1.
        source_ids = source['input_ids'].squeeze(0).to(dtype=torch.long)
        source_mask = source['attention_mask'].squeeze(0).to(dtype=torch.long)
        target_ids = target['input_ids'].squeeze(0).to(dtype=torch.long)

        return {
            'source_ids': source_ids,
            'source_mask': source_mask,
            'target_ids': target_ids,
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [9]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one training epoch over ``loader``.

    Each batch is expected to carry ``source_ids``, ``source_mask`` and
    ``target_ids`` tensors. The loss is logged to wandb every 10 steps and
    printed every 500 steps.

    Args:
        epoch: epoch number, used only in the printed progress message.
        tokenizer: provides ``pad_token_id`` for masking padded label positions.
        model: seq2seq model whose first output is the loss when labels are given.
        device: device the batch tensors are moved to.
        loader: iterable of batches.
        optimizer: optimizer stepped once per batch.
    """
    model.train()
    for step, batch in enumerate(loader):
        target_ids = batch['target_ids'].to(device, dtype=torch.long)
        # Shift by one: the decoder is fed tokens [0, n-1) and is trained to
        # predict tokens [1, n).
        decoder_input_ids = target_ids[:, :-1].contiguous()
        labels = target_ids[:, 1:].clone().detach()
        # Padding positions get -100 so they are excluded from the loss
        # (-100 is the default ignore_index of PyTorch's CrossEntropyLoss).
        labels[labels == tokenizer.pad_token_id] = -100
        input_ids = batch['source_ids'].to(device, dtype=torch.long)
        attention_mask = batch['source_mask'].to(device, dtype=torch.long)

        # `labels` is the supported keyword; `lm_labels` is its deprecated alias.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            labels=labels,
        )
        loss = outputs[0]

        empty_cache()

        if step % 10 == 0:
            wandb.log({"Loss do treinamento": loss.item()})

        if step % 500 == 0:
            print(f'Época: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        empty_cache()

In [10]:
import os
# os.environ['WANDB_API_KEY'] = 'key'

In [11]:
def init_wandb(project_name):
    """Start a wandb run for ``project_name`` and store the hyperparameters on its config.

    Returns:
        The ``wandb.config`` object with the hyperparameters set as attributes.
    """
    wandb.init(project=project_name)

    hyperparameters = {
        'TRAIN_BATCH_SIZE': 1,
        'VALID_BATCH_SIZE': 1,
        'TRAIN_EPOCHS': 4,
        'VAL_EPOCHS': 1,
        'LEARNING_RATE': 1e-4,
        'MAX_LEN': 500,
        'SUMMARY_LEN': 400,
    }

    run_config = wandb.config
    for name, value in hyperparameters.items():
        setattr(run_config, name, value)

    return run_config

In [12]:
def init_training_pipeline(config):
    """Fine-tune the pretrained Pegasus checkpoint on the training CSV.

    Loads the tokenizer, reads the train and validation CSVs (keeping the
    ``ementa`` and ``inteiro_teor`` columns), wraps them in ``CustomDataset``
    and ``DataLoader``, moves the model to the module-level ``device`` and
    calls ``train`` once per epoch for ``config.TRAIN_EPOCHS`` epochs.

    Args:
        config: object exposing ``MAX_LEN``, ``SUMMARY_LEN``,
            ``TRAIN_BATCH_SIZE``, ``VALID_BATCH_SIZE``, ``LEARNING_RATE`` and
            ``TRAIN_EPOCHS``.

    Returns:
        Tuple ``(tokenizer, model, val_loader)``.
    """
    def read_pairs(csv_path):
        # Only these two columns are consumed by CustomDataset.
        frame = pd.read_csv(csv_path, encoding='utf-8', error_bad_lines=False, engine="python")
        return frame[['ementa','inteiro_teor']]

    print('Iniciando pipeline...\n\n')

    checkpoint = 'google/pegasus-cnn_dailymail'
    tokenizer = PegasusTokenizer.from_pretrained(checkpoint)

    empty_cache()

    train_frame = read_pairs('../../data/train/translated-train-400-2500.csv')
    print('Exemplo de textos:')
    print(train_frame.head(), '\n\n')

    val_frame = read_pairs('../../data/train/translated-validate-400-2500.csv')

    print(f'Dataset de treino: {train_frame.shape}')
    print(f'Dataset de teste: {val_frame.shape}')

    training_set = CustomDataset(train_frame, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
    val_set = CustomDataset(val_frame, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

    # Training batches are shuffled; validation keeps the file order.
    training_loader = DataLoader(
        training_set,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=0,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=config.VALID_BATCH_SIZE,
        shuffle=False,
        num_workers=0,
    )

    print('Instanciando modelo...')
    model = PegasusForConditionalGeneration.from_pretrained(checkpoint).to(device)

    optimizer = torch.optim.Adam(params=model.parameters(), lr=config.LEARNING_RATE)

    empty_cache()

    wandb.watch(model, log="all")

    print('Inicializando Fine-Tuning utilizando o dataset de acórdãos...')

    for epoch in range(config.TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)
        empty_cache()

    print('Treinamento concluído!\n\n')

    return tokenizer, model, val_loader

In [13]:
config = init_wandb(project_name)

ERROR:wandb.jupyter:Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33mthiagocmoreira[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.5 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.10.1
[34m[1mwandb[0m: Run data is saved locally in wandb/run-20201012_134024-1wu4fqma
[34m[1mwandb[0m: Syncing run [33mlunar-music-20[0m





In [None]:
%%time
tokenizer, model, val_loader = init_training_pipeline(config)

In [15]:
# Create (or truncate) predictions.csv; the context manager closes the handle
# immediately instead of leaving an open file object behind.
with open("predictions.csv", "w+"):
    pass

<_io.TextIOWrapper name='predictions.csv' mode='w+' encoding='UTF-8'>

In [16]:
def validate(epoch, tokenizer, model, device, loader):
    """Generate a summary for every batch in ``loader`` and decode everything to text.

    Args:
        epoch: accepted for interface symmetry with ``train``; not used here.
        tokenizer: provides ``decode`` for turning token ids back into text.
        model: model exposing ``eval`` and ``generate``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``source_ids``, ``target_ids`` and
            ``source_mask`` tensors.

    Returns:
        Tuple ``(source_texts, predictions, actuals)`` of equally ordered lists
        of decoded strings.
    """
    def decode_all(sequences):
        # Same decoding options for sources, generated summaries and references.
        return [
            tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for seq in sequences
        ]

    model.eval()
    source_texts, predictions, actuals = [], [], []

    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            input_ids = batch['source_ids'].to(device, dtype=torch.long)
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            attention_mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=250,
                num_beams=2,
                repetition_penalty=1.5,
                length_penalty=1,
                early_stopping=True
            )

            batch_sources = decode_all(input_ids)
            batch_predictions = decode_all(generated_ids)
            batch_references = decode_all(reference_ids)

            # Progress message every 100 batches, skipping the very first one.
            if batch_idx > 0 and batch_idx % 100 == 0:
                print(f'Resumos: {batch_idx} gerados')

            source_texts += batch_sources
            predictions += batch_predictions
            actuals += batch_references

    return source_texts, predictions, actuals

In [17]:
def validade_and_save_predictions(val_epochs, tokenizer, model, val_loader):
    """Run ``validate`` ``val_epochs`` times and write each result to predictions.csv.

    Every pass overwrites the file, so only the last pass's rows remain. The
    module-level ``device`` is used for inference.
    """
    print('Gerando sumários utilizando o modelo no dataset de validação...')
    for epoch in range(val_epochs):
        sources, generated, references = validate(epoch, tokenizer, model, device, val_loader)
        columns = {
            'inteiro_teor_sumarizado': sources,
            'ementa_original': references,
            'resumo_gerado': generated,
        }
        pd.DataFrame(columns).to_csv('predictions.csv')
        print('CSV para análise gerado!')
In [None]:
%%time
# Validation loop and saving the resulting file with predictions and actuals in a dataframe.
validade_and_save_predictions(config.VAL_EPOCHS, tokenizer, model, val_loader)

In [9]:
# Print rows 10-29 of the saved predictions for a manual quality check.
generated_summaries = pd.read_csv('predictions.csv', encoding='utf-8', error_bad_lines=False, engine="python")
# validade_and_save_predictions writes the source text under
# 'inteiro_teor_sumarizado'; fall back to 'inteiro_teor' for files that carry
# that column name instead, so the lookup does not raise KeyError.
source_column = (
    'inteiro_teor_sumarizado'
    if 'inteiro_teor_sumarizado' in generated_summaries.columns
    else 'inteiro_teor'
)
for index, row in generated_summaries[10:30].iterrows():
    print(f'Exemplo {index}', '\n')
    print('Inteiro teor:', row[source_column], '\n\n')
    print('Ementa original:', row['ementa_original'], '\n\n')
    print('Sumário gerado:', row['resumo_gerado'], '\n\n')

Exemplo 10 

Inteiro teor: 8th class) gmmea / hagb / acnv This document of interlocutory appeal was reviewed, reported and discussed in resource of magazine n  tst-airr-127740-66.2008.5.02.000, in which joyce de lime is aggravating and silva and silva associate lawyers and professional cooperative of the professionals are aggravated credit and collection - cccoop. 95, which denied following up on its review appeal. remittance of the records to the public labor ministry is waived, according to art. thus renews its denunciation of violation of art. it should be noted that, under the terms of art. 896,  6, of the clt, in the case of a subject subject to the extremely brief procedure, the appropriateness of the review appeal will only be admitted due to the contradiction of the tst jurisprudence summary or direct affront to the constitution of the republic. as a result, the alleged violation of an infraconstitutional precept does not authorize the processing of the appeal. 


Ementa origin

In [3]:
predictions = pd.read_csv('predictions.csv', encoding='utf-8')

In [4]:
# !pip install rouge-score tqdm --user
from tqdm import tqdm_notebook
# import time
from rouge_score import rouge_scorer, scoring
from typing import List, Dict

In [5]:
def chunks(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each."""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

In [6]:
ROUGE_KEYS = ["rouge1", "rouge2", "rougeL"]
def calculate_rouge(output_lns: List[str], reference_lns: List[str]) -> Dict:
    """Compute bootstrap-aggregated ROUGE F-measures for generated vs. reference texts.

    Args:
        output_lns: generated summaries.
        reference_lns: reference summaries, aligned one-to-one with ``output_lns``.

    Returns:
        Dict mapping each key in ``ROUGE_KEYS`` to the mid F-measure of the
        aggregated scores.
    """
    # Local import: replaces the deprecated ``tqdm.tqdm_notebook`` alias without
    # touching the file-level imports.
    from tqdm.auto import tqdm

    def as_text(value):
        # An empty cell in a CSV is read back by pandas as NaN (a float),
        # which the scorer cannot tokenize; score it as an empty string.
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else str(value)

    scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=True)
    aggregator = scoring.BootstrapAggregator()

    # Materialize the pairs so the progress bar knows its total.
    pairs = list(zip(reference_lns, output_lns))
    for reference_ln, output_ln in tqdm(pairs):
        scores = scorer.score(as_text(reference_ln), as_text(output_ln))
        aggregator.add_scores(scores)

    result = aggregator.aggregate()
    return {k: v.mid.fmeasure for k, v in result.items()}

In [7]:
# calculate_rouge's signature is (output_lns, reference_lns); pass by keyword so
# the generated summaries and the reference ementas land in the right slots.
metrics = calculate_rouge(
    output_lns=predictions['resumo_gerado'],
    reference_lns=predictions['ementa_original'],
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [8]:
metrics

{'rouge1': 0.513370143959208,
 'rouge2': 0.35922481567558473,
 'rougeL': 0.45459596124816376}

In [None]:
# model_archive_name = 'pegasus-sum400-model-8'
# print('Salvando modelo treinado...')
# model.save_pretrained(model_archive_name)
# print(f'Modelo salvo na pasta {model_archive_name}!\n')

In [10]:
import datetime
print('Treino: ', str(datetime.timedelta(seconds=28936)))

Treino:  8:02:16


In [11]:
print('Validação: ', str(datetime.timedelta(seconds=6690)))

Validação:  1:51:30
