<a href="https://colab.research.google.com/github/vlordier/colabs/blob/main/summarization_active_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Based on https://towardsdatascience.com/fine-tuning-a-t5-transformer-for-any-summarization-task-82334c64c81

## Dataset & Augmentation

In [None]:
# Mount Google Drive at /content/drive (Colab only). The default
# `synonym_model_path` of TextSummaryDataset points into /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the JSON dataset file; passed to TextSummaryDataset and stored
# in the training arguments as `data_path`.
json_path = "/content/summarization_ds_17000.json"

In [None]:
# %pip installs into the environment of the running kernel (unlike !pip).
%pip install -q rouge_score



In [None]:
# %pip installs into the environment of the running kernel (unlike !pip).
%pip install -q datasets



In [None]:
# %pip installs into the environment of the running kernel (unlike !pip).
# NOTE(review): versions are unpinned; the training code below relies on
# version-specific APIs (e.g. `Trainer(gpus=...)`, `validation_epoch_end`,
# `datasets.load_metric`) -- consider pinning once known-good versions are confirmed.
%pip install -q numpy requests nlpaug
%pip install -q torch transformers sentencepiece pytorch-lightning



In [None]:
from datasets import load_metric

In [None]:
import numpy as np
import json
import argparse

from torch.utils.data import Dataset, DataLoader
import nlpaug.augmenter.word as naw

# Seed NumPy's global RNG so NumPy-based random draws in this notebook are repeatable.
np.random.seed(42)

In [None]:
# Download NLTK's 'averaged_perceptron_tagger' resource.
# NOTE(review): presumably required by the nlpaug augmenters used below -- confirm.
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
class TextSummaryDataset(Dataset):
    """Text/summary pairs loaded from a JSON file, with augmented source texts.

    Each record is expected to be a dict with the keys ``original_text``,
    ``summary_1``, ``summary_2`` and ``summary_3``. At construction time every
    record of the selected split gets an extra ``original_text_augm`` entry
    produced by a synonym augmenter followed by a spelling augmenter.

    Args:
        json_path: Path to a JSON file containing a list of records.
        ds_params: Object with attribute access. ``val_part`` is always read;
            ``input_length`` / ``output_length`` are read when items are
            tokenized; an optional ``max_records`` caps how many records are
            loaded (defaults to 2000, ``None`` loads everything).
        tokenizer: Tokenizer exposing ``batch_encode_plus``. May be ``None``
            when only :meth:`get_raw_texts` is used.
        stage: ``'train'`` selects the training split, anything else the
            validation split.
        synonym_model_path: Path of the PPDB model handed to
            ``naw.SynonymAug``.
    """

    def __init__(self, json_path, ds_params, tokenizer=None, stage='val', 
                 synonym_model_path="/content/drive/MyDrive/upwork/ppdb-2.0-m-all"):
        self.tokenizer = tokenizer
        self.ds_params = ds_params
        self.summary_keys = ["summary_1", "summary_2", "summary_3"]

        # Cap on the number of loaded records; 2000 is the historical default.
        max_records = getattr(ds_params, "max_records", 2000)
        with open(json_path) as f:
            data = json.load(f)[:max_records]
        self.data = self.choose_data(data, stage, self.ds_params.val_part)

        self.augm_spelling = naw.SpellingAug()
        self.augm_synonym = naw.SynonymAug(aug_src='ppdb', 
                                           model_path=synonym_model_path)
        # NOTE(review): augmentation is applied to whichever split was selected,
        # including validation -- confirm that this is intended.
        self._augment_texts()

    @staticmethod
    def _apply_augmenter(augmenter, text):
        """Run ``augmenter.augment(text)`` and always return a single string.

        Depending on the nlpaug version, ``augment`` returns either a string
        or a list of strings; a list is reduced to its first element. An empty
        result falls back to the unmodified input text.
        """
        result = augmenter.augment(text)
        if isinstance(result, (list, tuple)):
            return result[0] if result else text
        return result

    def _augment_texts(self):
        """Add an ``original_text_augm`` entry (synonym, then spelling) to every record."""
        for text_item in self.data:
            text_aug = text_item["original_text"]
            text_aug = self._apply_augmenter(self.augm_synonym, text_aug)
            text_aug = self._apply_augmenter(self.augm_spelling, text_aug)
            text_item["original_text_augm"] = text_aug

    def __len__(self):
        return len(self.data)

    def choose_data(self, data, stage, val_part=0.2):
        """Return the train or validation part of ``data`` after a fixed shuffle.

        A local ``RandomState(42)`` is used so the split is the same on every
        call while NumPy's global RNG state is left untouched.

        Args:
            data: Sequence of records.
            stage: ``'train'`` returns the first ``1 - val_part`` share of the
                shuffled records; any other value returns the remainder.
            val_part: Fraction of records reserved for validation.

        Returns:
            A NumPy object array holding the selected records.
        """
        rng = np.random.RandomState(42)
        data = np.array(data, dtype=object)
        inds = np.arange(len(data))
        rng.shuffle(inds)
        split = int(len(data) * (1 - val_part))
        if stage == 'train':
            return data[inds[:split]]
        return data[inds[split:]]

    def clean_text(self, text):
        """Strip prompt markers, newlines, double backticks and double quotes from ``text``."""
        for fragment in ('Example of text:', 'Example of Summary:', '\n', '``', '"'):
            text = text.replace(fragment, '')
        return text

    def convert_to_features(self, text, summary):
        """Tokenize one text and one summary, padded/truncated to the configured lengths.

        Returns:
            ``(source, targets)``: the two encodings returned by
            ``tokenizer.batch_encode_plus`` with ``return_tensors="pt"``.
        """
        source = self.tokenizer.batch_encode_plus(
            [text], max_length=self.ds_params.input_length, 
            padding='max_length', truncation=True, return_tensors="pt")

        targets = self.tokenizer.batch_encode_plus(
            [summary], max_length=self.ds_params.output_length, 
            padding='max_length', truncation=True, return_tensors="pt")

        return source, targets

    def __getitem__(self, idx):
        """Return the token ids and attention masks for record ``idx``."""
        _, text, summary = self.get_raw_texts(idx)

        source, targets = self.convert_to_features(text, summary)

        # Each encoding holds exactly one sequence; drop only that leading axis.
        source_ids = source["input_ids"].squeeze(0)
        target_ids = targets["input_ids"].squeeze(0)

        src_mask = source["attention_mask"].squeeze(0)
        target_mask = targets["attention_mask"].squeeze(0)

        return {"source_ids": source_ids, 
                "source_mask": src_mask, 
                "target_ids": target_ids, 
                "target_mask": target_mask}

    def get_raw_texts(self, idx):
        """Return ``(original_text, cleaned_augmented_text, cleaned_summary)`` for ``idx``.

        The summary is drawn at random (NumPy global RNG) from ``summary_keys``,
        so repeated calls for the same index may return different summaries.
        """
        record = self.data[idx]
        text = record["original_text_augm"]
        text_src = record["original_text"]
        summary_key_rand = np.random.choice(self.summary_keys)
        summary = record[summary_key_rand]

        text = self.clean_text(text)
        summary = self.clean_text(summary)

        return text_src, text, summary



In [None]:
# Preview dataset: no tokenizer is passed and `stage` keeps its default 'val',
# so this holds the validation split and is only usable via get_raw_texts().
dataset = TextSummaryDataset(json_path, argparse.Namespace(**{"val_part": 0.2}))

In [None]:
# Show only the first few records; printing the whole split floods the output.
N_PREVIEW = 5
for i in range(min(N_PREVIEW, len(dataset))):
    text_src, text_aug, summary = dataset.get_raw_texts(i)
    print("\nOriginal text:\n", text_src)
    print("Original text augmented:\n", text_aug)
    print("Summary:\n", summary)


Original text:
 the soccer world cup will be the first sporting event to break the one billion pounds -lrb- #.## billion dollars -rrb- barrier in british betting turnover , said a leading bookmaker on friday .
Original text augmented:
 the football worldwide couple will become thet earliest sporting ivent to breaches da one billon paunds - lrb - #. # # billion riyals - rrb - obstacle in birtish wagers turnover, explanations o guaranteeing bookmaker On friday.
Summary:
 world cup bets set to top one billion pounds in britain

Original text:
 most asian markets gained friday , following wall street 's lead after oil prices slid further and a report showed the u.s. economy grew faster in the last quarter than previously thought .
Original text augmented:
 most asian oeration gained friday, to wal STRRET ' s lead afet oil pricing slid feather è la reviews disclosed the guarantees. so. economie grew faster in the past trimester than formely imagined.
Summary:
 asia markets track wall stree

## Fine Tuning

### Model

In [None]:
import time
import random
import torch
import logging
import pytorch_lightning as pl
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
# from nlp import load_metric

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and, if present, CUDA) RNGs with `seed`."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

In [None]:
class T5FineTuner(pl.LightningModule):
    """LightningModule that fine-tunes a seq2seq model on TextSummaryDataset.

    Args:
        hparams: Hyper-parameters as a dict or an ``argparse.Namespace``. Keys
            read by this class: ``model_name_or_path``,
            ``tokenizer_name_or_path``, ``ds_params``, ``data_path``,
            ``freeze_embeds``, ``freeze_encoder``, ``weight_decay``,
            ``learning_rate``, ``adam_epsilon``, ``warmup_steps``,
            ``train_batch_size``, ``eval_batch_size``, ``n_gpu``,
            ``gradient_accumulation_steps`` and ``num_train_epochs``.
            ``ds_params`` may be a dict (converted to a Namespace) or already
            an object with attribute access.
    """

    def __init__(self, hparams):
        super().__init__()
        if isinstance(hparams, argparse.Namespace):
            hparams = vars(hparams)
        self.hparams.update(hparams)
        # The dataset reads ds_params through attribute access.
        if isinstance(self.hparams.ds_params, dict):
            self.hparams.ds_params = argparse.Namespace(**self.hparams.ds_params)
        self.model = T5ForConditionalGeneration.from_pretrained(self.hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(self.hparams.tokenizer_name_or_path)
        self.rouge_metric = load_metric('rouge') 
        # NOTE(review): calls a private method of the metric object; presumably a
        # workaround for using the metric inside the Lightning loop -- confirm.
        self.rouge_metric._init_writer()

        if self.hparams.freeze_embeds:
            self.freeze_embeds()
        if self.hparams.freeze_encoder:
            self.freeze_params(self.model.get_encoder())
            self._assert_all_frozen(self.model.get_encoder())

    def freeze_params(self, model):
        """Disable gradients for every parameter of ``model``."""
        for par in model.parameters():
            par.requires_grad = False

    def _assert_all_frozen(self, model):
        """Raise AssertionError if any parameter of ``model`` still requires grad."""
        trainable = [name for name, par in model.named_parameters() if par.requires_grad]
        assert not trainable, "{} parameter(s) still trainable, e.g. {}".format(
            len(trainable), trainable[:3])

    def freeze_embeds(self):
        """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
        try:
            # BART-style layout: the embeddings live under `self.model.model`.
            self.freeze_params(self.model.model.shared)
            for d in [self.model.model.encoder, self.model.model.decoder]:
                self.freeze_params(d.embed_positions)
                self.freeze_params(d.embed_tokens)
        except AttributeError:
            # T5-style layout: no nested `.model` attribute.
            self.freeze_params(self.model.shared)
            for d in [self.model.encoder, self.model.decoder]:
                self.freeze_params(d.embed_tokens)

    def lmap(self, f, x):
        """list(map(f, x))"""
        return list(map(f, x))

    def is_logger(self):
        """Return True on the process whose trainer global rank is <= 0."""
        return self.trainer.global_rank <= 0

    def parse_score(self, result):
        """Map each ROUGE entry to its mid F-measure as a percentage rounded to 4 decimals."""
        return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}

    def forward(self, input_ids, attention_mask=None, 
                decoder_input_ids=None, decoder_attention_mask=None, 
                lm_labels=None):
        """Run the wrapped model; ``lm_labels`` is forwarded as ``labels``."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _step(self, batch):
        """Compute the loss for one batch, with pad positions of the labels set to -100."""
        # Clone first: masking in place would overwrite batch["target_ids"],
        # which is also decoded back to text during validation.
        lm_labels = batch["target_ids"].clone()
        lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        return outputs[0]

    def ids_to_clean_text(self, generated_ids):
        """Decode token ids to stripped strings, skipping special tokens."""
        gen_text = self.tokenizer.batch_decode(
            generated_ids, 
            skip_special_tokens=True, 
            clean_up_tokenization_spaces=True
        )
        return self.lmap(str.strip, gen_text)

    def _generative_step(self, batch):
        """Generate summaries for ``batch`` and return loss, timing, texts and ROUGE.

        Returns:
            Dict with ``val_loss``, ``gen_time`` (seconds per example),
            ``gen_len``, ``preds``, ``target``, ``rouge1`` and ``rougeL``.
            The ROUGE values are computed on this batch only.
        """
        t0 = time.time()

        # The mask of the reference summary is deliberately not passed here:
        # generation must not depend on the ground-truth target.
        generated_ids = self.model.generate(
            batch["source_ids"],
            attention_mask=batch["source_mask"],
            use_cache=True,
            max_length=150, 
            num_beams=2,
            repetition_penalty=2.5, 
            length_penalty=1.0, 
            early_stopping=True
        )
        preds = self.ids_to_clean_text(generated_ids)
        target = self.ids_to_clean_text(batch["target_ids"])

        gen_time = (time.time() - t0) / batch["source_ids"].shape[0]

        loss = self._step(batch)
        base_metrics = {'val_loss': loss}
        summ_len = np.mean(self.lmap(len, generated_ids))
        base_metrics.update(gen_time=gen_time, 
                            gen_len=summ_len, 
                            preds=preds, 
                            target=target)
        # add_batch() is immediately followed by compute(), so the scores
        # describe this batch only.
        self.rouge_metric.add_batch(predictions=preds, references=target)
        rouge_results = self.rouge_metric.compute() 
        rouge_dict = self.parse_score(rouge_results)
        base_metrics.update(rouge1=rouge_dict['rouge1'], rougeL=rouge_dict['rougeL'])

        return base_metrics

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._step(batch)
        # Log through self.log instead of returning a "log" entry, which would
        # hand a tensor with a grad_fn back to the trainer.
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Return the generation metrics of one validation batch."""
        return self._generative_step(batch)

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation results and log them.

        ``val_loss`` is logged so that callbacks reading the trainer's metrics
        (checkpoint monitoring, LoggingCallback) can see it.
        """
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        self.log("val_loss", avg_loss, prog_bar=True)
        self.log("rouge1", float(np.mean([x["rouge1"] for x in outputs])))
        self.log("rougeL", float(np.mean([x["rougeL"] for x in outputs])))

        ## Clear out the lists for next epoch
        # NOTE(review): nothing in this class fills these lists; kept only so
        # the attributes keep existing for any outside reader.
        self.target_gen = []
        self.prediction_gen = []
        return {"avg_val_loss": avg_loss}

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        # "layer_norm.weight" covers T5's parameter naming.
        no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() 
                           if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() 
                           if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, 
                          lr=self.hparams.learning_rate, 
                          eps=self.hparams.adam_epsilon)
        self.opt = optimizer

        # t_total is set as a side effect of train_dataloader(); compute it
        # here when that method has not been called yet.
        if getattr(self, "t_total", None) is None:
            self.train_dataloader()

        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.t_total
        )
        # Stepped manually in optimizer_step(), hence not returned to Lightning.
        self.lr_scheduler = scheduler

        return [optimizer]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, 
                       optimizer_closure=None,  
                       on_tpu=False, using_native_amp=False, using_lbfgs=False):
        """Step the optimizer, reset gradients, then advance the LR schedule."""
        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        """Return the running loss and current learning rate for the progress bar.

        NOTE(review): reads ``self.trainer.avg_loss``; confirm that the
        installed Lightning version still provides it and calls this hook.
        """
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), 
                     "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        """Build the training DataLoader and derive ``self.t_total`` from its size."""
        train_dataset = TextSummaryDataset(self.hparams.data_path, self.hparams.ds_params, self.tokenizer, 'train')
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
        # Number of optimizer steps over the whole run, used by the LR schedule.
        self.t_total = int(
            (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
            // self.hparams.gradient_accumulation_steps
            * float(self.hparams.num_train_epochs)
        )

        return dataloader

    def val_dataloader(self):
        """Build the validation DataLoader."""
        validation_dataset = TextSummaryDataset(self.hparams.data_path, self.hparams.ds_params, self.tokenizer, 'val')
        return DataLoader(validation_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)
    


In [None]:
logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
    """Write the trainer's callback metrics to the module logger after validation."""

    def on_validation_end(self, trainer, pl_module):
        """Log a header, then one ``key = value`` line per metric on the logging rank."""
        logger.info("***** Validation results *****")
        if not pl_module.is_logger():
            return

        results = trainer.callback_metrics
        skipped = ("log", "progress_bar")
        for name in sorted(results):
            if name in skipped:
                continue
            logger.info("{} = {}\n".format(name, str(results[name])))

### Define args

In [None]:
# ! rm -rf /content/lightning_logs/

In [None]:
# Hyper-parameters for fine-tuning. The dict itself is handed to T5FineTuner,
# and a Namespace built from it feeds the checkpoint callback and Trainer kwargs.
args_dict = dict(
    output_dir="", # path to save the checkpoints
    model_name_or_path='t5-small',
    tokenizer_name_or_path='t5-small',
    # Read by TextSummaryDataset: validation fraction and tokenizer max lengths.
    ds_params={
        "val_part": 0.2,
        "input_length": 512,
        "output_length": 150
        },
    freeze_encoder=False,
    freeze_embeds=False,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=4,
    eval_batch_size=4,
    num_train_epochs=2,
    gradient_accumulation_steps=8,
    n_gpu=1,
    resume_from_checkpoint=None, 
    val_check_interval = 1.0, 
    early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)

In [None]:
# Fill in the run-specific values (output directory, dataset path) and build an
# attribute-style view (`args`) of the same settings.
args_dict.update({'output_dir': 't5_model_trained', 'num_train_epochs':2,
                 'train_batch_size': 4, 'eval_batch_size': 4, 'data_path': json_path})
args = argparse.Namespace(**args_dict)
print(args)

Namespace(adam_epsilon=1e-08, data_path='/content/summarization_ds_17000.json', ds_params={'val_part': 0.2, 'input_length': 512, 'output_length': 150}, early_stop_callback=False, eval_batch_size=4, fp_16=False, freeze_embeds=False, freeze_encoder=False, gradient_accumulation_steps=8, learning_rate=0.0003, max_grad_norm=1.0, model_name_or_path='t5-small', n_gpu=1, num_train_epochs=2, opt_level='O1', output_dir='t5_model_trained', resume_from_checkpoint=None, seed=42, tokenizer_name_or_path='t5-small', train_batch_size=4, val_check_interval=1.0, warmup_steps=0, weight_decay=0.0)


In [None]:
## Define Checkpoint function
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, monitor="val_loss", mode="min", save_top_k=3
)

## If resuming from checkpoint, add an arg resume_from_checkpoint
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    # early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,
    # amp_level=args.opt_level,
    resume_from_checkpoint=args.resume_from_checkpoint,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    val_check_interval=args.val_check_interval,
    # logger=wandb_logger,
    callbacks=[LoggingCallback()],
)

### Train

In [None]:
# Number of records in the preview dataset built earlier (its validation split).
len(dataset)

400

In [None]:
# The plain dict is passed (not `args`): T5FineTuner merges it into self.hparams.
model = T5FineTuner(args_dict)

In [None]:
# Build the Lightning trainer from the keyword arguments prepared above.
trainer = pl.Trainer(**train_params)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
# trainer.train_dataloader.dataset

In [None]:
# The dataloaders are built explicitly here; model.train_dataloader() also sets
# model.t_total, which configure_optimizers() reads for the LR schedule.
trainer.fit(model, model.train_dataloader(), model.val_dataloader())

  cpuset_checked))
  "When using `Trainer(accumulate_grad_batches != 1)` and overriding"
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
# Inspect the metrics logged during the last run.
trainer.logged_metrics

## Active Learning