In [1]:
# !pip install transformers --user
# !pip install pandas --user
import gc
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
import logging
# Configure the root logger so that only records at ERROR level or above are emitted.
logging.basicConfig(level=logging.ERROR)

In [3]:
def empty_cache():
    """Run a Python garbage-collection pass, then call ``torch.cuda.empty_cache()``.

    Collection happens first so that objects that have become unreachable are
    dropped before PyTorch is asked to empty its CUDA cache.
    """
    gc.collect()
    torch.cuda.empty_cache()

In [4]:
from transformers import BartForConditionalGeneration, BartTokenizer



In [5]:
# !pip install wandb
import wandb

In [6]:
# Name of the Weights & Biases project; passed to init_wandb() further down.
project_name = 'distilbart-sum400-summarized'

In [7]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of crashing
# at the first `.to(device)` call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
class CustomDataset(Dataset):
    """Dataset pairing a full decision text (``inteiro_teor``) with its summary (``ementa``).

    Each item is tokenized on access and returned as fixed-length tensors.

    Args:
        dataframe: pandas DataFrame exposing ``ementa`` and ``inteiro_teor`` columns.
        tokenizer: object exposing ``batch_encode_plus`` (e.g. a ``BartTokenizer``).
        source_len: maximum/padded token length for the source text.
        summ_len: maximum/padded token length for the target summary.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.ementa = self.data.ementa
        self.inteiro_teor = self.data.inteiro_teor

    def __len__(self):
        """Number of (text, summary) pairs."""
        return len(self.ementa)

    def _encode(self, text, max_length):
        """Collapse whitespace in ``text`` and tokenize it to exactly ``max_length`` tokens."""
        normalized = ' '.join(str(text).split())
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        return self.tokenizer.batch_encode_plus(
            [normalized],
            max_length=max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True,
        )

    def __getitem__(self, index):
        """Return the tokenized source/target tensors for the row at position ``index``.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (same values as ``target_ids``), all ``torch.long``.
        """
        # Positional access: samplers yield positions 0..len-1, which only match
        # the index labels when the DataFrame carries a default RangeIndex.
        source = self._encode(self.inteiro_teor.iloc[index], self.source_len)
        target = self._encode(self.ementa.iloc[index], self.summ_len)

        # squeeze(0) drops only the batch dimension added by batch_encode_plus;
        # a bare squeeze() would also collapse a length-1 sequence to a scalar.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [9]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one training epoch of the seq2seq model over ``loader``.

    Args:
        epoch: epoch number, used only in the progress message.
        tokenizer: provides ``pad_token_id`` used to mask padding in the labels.
        model: seq2seq model called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; its first output is the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.
        optimizer: optimizer stepped once per batch.
    """
    model.train()
    for step, data in enumerate(loader):
        y = data['target_ids'].to(device, dtype=torch.long)
        # Shifted targets: the decoder is fed tokens [0..n-2] and is scored
        # against tokens [1..n-1].
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        # Padding positions are set to -100 (the label value transformers'
        # loss is documented to ignore - confirm for the installed version).
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)

        # `labels` is the supported keyword; `lm_labels` was a deprecated alias
        # that later transformers releases no longer accept.
        outputs = model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]

        empty_cache()

        if step % 10 == 0:
            wandb.log({"Loss do treinamento": loss.item()})

        if step % 500 == 0:
            print(f'Época: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        empty_cache()

In [10]:
import os
# os.environ['WANDB_API_KEY'] = 'key'

In [11]:
def init_wandb(project_name):
    """Start a wandb run for ``project_name`` and store the hyperparameters on its config.

    Args:
        project_name: value forwarded to ``wandb.init(project=...)``.

    Returns:
        ``wandb.config`` with the batch sizes, epoch counts, learning rate and
        sequence lengths set as attributes.
    """
    wandb.init(project=project_name)

    hyperparameters = {
        'TRAIN_BATCH_SIZE': 1,
        'VALID_BATCH_SIZE': 1,
        'TRAIN_EPOCHS': 16,
        'VAL_EPOCHS': 1,
        'LEARNING_RATE': 1e-4,
        'MAX_LEN': 1024,
        'SUMMARY_LEN': 400,
    }

    run_config = wandb.config
    for key, value in hyperparameters.items():
        setattr(run_config, key, value)

    return run_config

In [12]:
def init_training_pipeline(config):
    """Load data, build the model and fine-tune it for ``config.TRAIN_EPOCHS`` epochs.

    Args:
        config: object exposing ``MAX_LEN``, ``SUMMARY_LEN``, ``TRAIN_BATCH_SIZE``,
            ``VALID_BATCH_SIZE``, ``LEARNING_RATE`` and ``TRAIN_EPOCHS``.

    Returns:
        Tuple ``(tokenizer, model, val_loader)``: the tokenizer, the fine-tuned
        model (already on ``device``) and a DataLoader over the validation set.
    """
    print('Iniciando pipeline...\n\n')

    model_name = 'sshleifer/bart-tiny-random'
    tokenizer = BartTokenizer.from_pretrained(model_name)

    empty_cache()

    train_dataset = pd.read_csv('../../data/train/translated-train-400-2500-summarized-new.csv', encoding='utf-8', error_bad_lines=False, engine="python")
    train_dataset = train_dataset[['ementa','inteiro_teor']]
    print('Exemplo de textos:')
    print(train_dataset.head(), '\n\n')

    val_dataset = pd.read_csv('../../data/train/translated-validate-400-2500-summarized-new.csv', encoding='utf-8', error_bad_lines=False, engine="python")
    val_dataset = val_dataset[['ementa','inteiro_teor']]

    print(f'Dataset de treino: {train_dataset.shape}')
    print(f'Dataset de teste: {val_dataset.shape}')

    # The training set must be built here: without this line `training_set`
    # is undefined on a fresh kernel and the DataLoader below raises NameError.
    training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
    val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

    train_params = {'batch_size': config.TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
    val_params = {'batch_size': config.VALID_BATCH_SIZE, 'shuffle': False, 'num_workers': 0}

    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    print('Instanciando modelo...')
    model = BartForConditionalGeneration.from_pretrained(model_name)
    model = model.to(device)

    optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

    empty_cache()

    wandb.watch(model, log="all")

    print('Inicializando Fine-Tuning utilizando o dataset de acórdãos...')

    for epoch in range(config.TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

        empty_cache()

    print('Treinamento concluído!\n\n')

    return tokenizer, model, val_loader

In [13]:
# Start the wandb run and keep the returned config object holding the hyperparameters.
config = init_wandb(project_name)

ERROR:wandb.jupyter:Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33mthiagocmoreira[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.10.1
[34m[1mwandb[0m: Run data is saved locally in wandb/run-20201006_213546-2pqv722y
[34m[1mwandb[0m: Syncing run [33mdaily-planet-8[0m





In [14]:
%%time
# Run the full pipeline (data loading + fine-tuning); keeps the tokenizer,
# the trained model and the validation DataLoader for the cells below.
tokenizer, model, val_loader = init_training_pipeline(config)

Iniciando pipeline...


Exemplo de textos:
                                              ementa  \
0  interlocutory appeal. Work accident. partial r...   
1  interlocutory appeal. innominate precautionary...   
2  declaration embargoes. addictions. inexistence...   
3  interlocutory appeal. timing. jurisprudential ...   
4  interlocutory appeal. illicit outsourcing. Pre...   

                                        inteiro_teor  
0  judgment 6th class acv / rp This document of i...  
1  the c o rd (1st class class) gmwoc / db seen, ...  
2  judgment 5th class emp / igr visa, reported an...  
3  the c o rd (6th class) gmacc / sc / jr / pv vi...  
4  judgment 6th class acv / acc visa, report and ...   


Dataset de treino: (10791, 2)
Dataset de teste: (2698, 2)
Instanciando modelo...
Inicializando Fine-Tuning utilizando o dataset de acórdãos...
Época: 0, Loss:  10.833200454711914
Época: 0, Loss:  8.484423637390137
Época: 0, Loss:  6.410473346710205
Época: 0, Loss:  6.069296836853027
Épo

In [30]:
# Create (or truncate) predictions.csv. The `with` block closes the handle
# immediately instead of leaving an open file object behind in the kernel.
with open("predictions.csv", "w+"):
    pass

<_io.TextIOWrapper name='predictions.csv' mode='w+' encoding='UTF-8'>

In [26]:
def validate(epoch, tokenizer, model, device, loader, max_length=350, num_beams=2):
    """Generate a summary for every batch in ``loader`` and decode everything to text.

    Args:
        epoch: not used by this function; kept so existing callers keep working.
        tokenizer: provides ``decode`` for turning token ids back into strings.
        model: model exposing ``eval()`` and ``generate(...)``.
        device: device the source ids and attention mask are moved to.
        loader: iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.
        max_length: ``max_length`` forwarded to ``model.generate`` (default 350).
        num_beams: ``num_beams`` forwarded to ``model.generate`` (default 2).

    Returns:
        Tuple ``(source_texts, predictions, actuals)`` of equally long lists of
        decoded source texts, generated summaries and reference summaries.
    """
    def decode_batch(batch_ids):
        return [
            tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for seq in batch_ids
        ]

    model.eval()
    source_texts = []
    predictions = []
    actuals = []
    with torch.no_grad():
        # Batches are counted from 1 so the progress message reflects work
        # already completed.
        for batch_number, data in enumerate(loader, start=1):
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)
            # Targets are only decoded, never fed to the model, so they do not
            # need to be moved to `device`.
            y = data['target_ids'].to(dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=max_length,
                num_beams=num_beams,
                repetition_penalty=1.5,
                length_penalty=1,
                early_stopping=True
            )

            source_texts.extend(decode_batch(ids))
            predictions.extend(decode_batch(generated_ids))
            actuals.extend(decode_batch(y))

            # Report the number of summaries produced so far (not the batch
            # index), once every 100 batches.
            if batch_number % 100 == 0:
                print(f'Resumos: {len(predictions)} gerados')

    return source_texts, predictions, actuals

In [27]:
def validade_and_save_predictions(val_epochs, tokenizer, model, val_loader):
    """Run ``validate`` ``val_epochs`` times, writing the results to predictions.csv each time.

    Every pass overwrites ``predictions.csv`` with three columns: the decoded
    source text, the original summary and the generated summary. Uses the
    module-level ``device``.
    """
    print('Gerando sumários utilizando o modelo no dataset de validação...')
    column_names = ('inteiro_teor_sumarizado', 'ementa_original', 'resumo_gerado')
    for epoch in range(val_epochs):
        sources, generated, references = validate(epoch, tokenizer, model, device, val_loader)
        # Column order in the file: source text, reference summary, generated summary.
        table = dict(zip(column_names, (sources, references, generated)))
        pd.DataFrame(table).to_csv('predictions.csv')
        print('CSV para análise gerado!')

In [31]:
%%time
# Generate summaries for the validation set and write them to predictions.csv.
validade_and_save_predictions(config.VAL_EPOCHS, tokenizer, model, val_loader)

Gerando sumários utilizando o modelo no dataset de validação...
Resumos: 100 gerados
Resumos: 200 gerados
Resumos: 300 gerados
Resumos: 400 gerados
Resumos: 500 gerados
Resumos: 600 gerados
Resumos: 700 gerados
Resumos: 800 gerados
Resumos: 900 gerados
Resumos: 1000 gerados
Resumos: 1100 gerados
Resumos: 1200 gerados
Resumos: 1300 gerados
Resumos: 1400 gerados
Resumos: 1500 gerados
Resumos: 1600 gerados
Resumos: 1700 gerados
Resumos: 1800 gerados
Resumos: 1900 gerados
Resumos: 2000 gerados
Resumos: 2100 gerados
Resumos: 2200 gerados
Resumos: 2300 gerados
Resumos: 2400 gerados
Resumos: 2500 gerados
Resumos: 2600 gerados
CSV para análise gerado!
CPU times: user 14min 52s, sys: 1min 56s, total: 16min 49s
Wall time: 16min 36s


In [32]:
# Reload the saved predictions and print rows 10-29 for manual inspection:
# source text, reference summary and generated summary for each example.
generated_summaries = pd.read_csv('predictions.csv', encoding='utf-8', error_bad_lines=False, engine="python")
labelled_columns = (
    ('Inteiro teor:', 'inteiro_teor_sumarizado'),
    ('Ementa original:', 'ementa_original'),
    ('Sumário gerado:', 'resumo_gerado'),
)
for index, row in generated_summaries[10:30].iterrows():
    print(f'Exemplo {index}', '\n')
    for label, column in labelled_columns:
        print(label, row[column], '\n\n')

Exemplo 10 

Inteiro teor:  8th class) gmmea / hagb / acnv This document of interlocutory appeal was reviewed, reported and discussed in resource of magazine n ° tst-airr-127740-66.2008.5.02.000, in which joyce de lime is aggravating and silva and silva associate lawyers and professional cooperative of the professionals are aggravated credit and collection - cccoop. 95, which denied following up on its review appeal. remittance of the records to the public labor ministry is waived, according to art. thus renews its denunciation of violation of art. it should be noted that, under the terms of art. 896, § 6, of the clt, in the case of a subject subject to the extremely brief procedure, the appropriateness of the review appeal will only be admitted due to the contradiction of the tst jurisprudence summary or direct affront to the constitution of the republic. as a result, the alleged violation of an infraconstitutional precept does not authorize the processing of the appeal. 


Ementa ori

In [33]:
# Load the saved predictions again, this time as `predictions`, for ROUGE scoring below.
predictions = pd.read_csv('predictions.csv', encoding='utf-8')

In [34]:
from rouge_score import rouge_scorer
import numpy as np

In [35]:
# Scorer configured for the three metrics reported below: ROUGE-1, ROUGE-2 and ROUGE-L.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])

In [36]:
# Score every generated summary against its reference ementa.
# pandas reads empty CSV fields back as NaN (a float), which the scorer cannot
# tokenize, so both columns are normalised to plain strings first.
reference_texts = predictions['ementa_original'].fillna('').astype(str)
generated_texts = predictions['resumo_gerado'].fillna('').astype(str)
rouge_scores = [
    scorer.score(reference, generated)
    for reference, generated in zip(reference_texts, generated_texts)
]

In [37]:
# One list per metric, holding element 2 of each score tuple
# (presumably the F-measure, going by the variable names - confirm against rouge_score).
rouge_1_f1, rouge_2_f1, rouge_L_f1 = (
    [scores[metric][2] for scores in rouge_scores]
    for metric in ('rouge1', 'rouge2', 'rougeL')
)

In [38]:
# Print the mean of each list, in the order ROUGE-1, ROUGE-2, ROUGE-L.
for f1_values in (rouge_1_f1, rouge_2_f1, rouge_L_f1):
    print(np.mean(f1_values))

0.28162419985829795
0.13326003384570811
0.21854546473544614


In [40]:
# Persist the fine-tuned model to a local folder via save_pretrained().
# NOTE(review): only the model is saved in this cell; the tokenizer is not written out.
model_archive_name = 'distilbart-sum400-model'
print('Salvando modelo treinado...')
model.save_pretrained(model_archive_name)
print(f'Modelo salvo na pasta {model_archive_name}!\n')

Salvando modelo treinado...
Modelo salvo na pasta distilbart-sum400-model!

