<a href="https://colab.research.google.com/github/urmilapol/urmilapolprojects/blob/master/T5_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine Tuning Transformer for Summary Generation

* T5 model loaded via Hugging Face Transformers, using PyTorch
* Runs in Colab on a free GPU (batch size <= 4 for T5-base / 128 for T5-small)
---


adapted from https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb#scrollTo=j9TNdHlQ0CLz

In [None]:
!pip install transformers -q

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

Check GPU

In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory usage).
!nvidia-smi

Wed Oct 28 10:30:07 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8    12W /  70W |     10MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Location of the dataset CSV and the names of its summary / full-article columns
path = './data/news_summary.csv'
summary_col, fulltext_col = 'text', 'ctext'

# Hyperparameters shared by the cells below
config = dict(
    TRAIN_BATCH_SIZE=4,   # batch size of the training DataLoader
    VALID_BATCH_SIZE=4,   # batch size of the validation DataLoader
    TRAIN_EPOCHS=3,       # number of passes over the training split
    LEARNING_RATE=1e-4,   # learning rate handed to the optimizer
    SEED=42,              # seed for torch / numpy and for the train-validation split
    MAX_LEN=512,          # token length the source text is encoded to
    SUMMARY_LEN=150,      # token length the summary is encoded to
)

In [None]:
# Dataset wrapper that turns rows of the summary/fulltext dataframe into
# fixed-length token tensors, ready to be batched by a DataLoader.

class CustomDataset(Dataset):
    """Tokenized (fulltext, summary) pairs read from a pandas DataFrame.

    Args:
        dataframe: frame with a ``fulltext`` and a ``summary`` column.
        tokenizer: object exposing ``batch_encode_plus`` that returns a mapping
            with ``input_ids`` and ``attention_mask`` tensors.
        source_len: token length the full text is padded/truncated to.
        summ_len: token length the summary is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Collapse whitespace in ``text`` and encode it to ``max_length`` tokens."""
        text = ' '.join(str(text).split())
        # Explicit padding/truncation replaces the deprecated
        # `pad_to_max_length=True` and silences the implicit-truncation warning.
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return the encoded sample at *position* ``index``.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (a second reference to the target ids), all
            ``torch.long`` tensors.
        """
        # Positional lookup, so the frame does not need a 0..n-1 index.
        source = self._encode(self.data['fulltext'].iloc[index], self.source_len)
        target = self._encode(self.data['summary'].iloc[index], self.summ_len)

        # Drop only the leading batch dimension of size 1 added by the tokenizer.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
# Training routine for a single epoch; the training-loop cell calls it once per epoch.

def train(epoch, tokenizer, model, device, loader, optimizer, log_every=50):
    """Run one pass over ``loader`` and update ``model`` after every batch.

    Args:
        epoch: epoch number, used only in the progress message.
        tokenizer: provides ``pad_token_id`` for masking padded label positions.
        model: seq2seq model called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; its first output is the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.
        optimizer: optimizer stepping the model parameters.
        log_every: print the loss every ``log_every`` batches; a falsy value
            disables logging.
    """
    model.train()
    for step, data in enumerate(loader):
        y = data['target_ids'].to(device, dtype=torch.long)
        # The decoder is fed tokens [0 .. n-2] and is scored on tokens [1 .. n-1].
        y_ids = y[:, :-1].contiguous()
        labels = y[:, 1:].clone().detach()
        # -100 is the label value the loss skips, so padding does not contribute.
        labels[labels == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)

        # `labels` is the current keyword; the older `lm_labels` spelling was
        # removed from transformers and raises a TypeError on recent releases.
        outputs = model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=labels)
        loss = outputs[0]

        if log_every and step % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def validate(tokenizer, model, device, loader, max_length=150, num_beams=2,
             repetition_penalty=2.5, length_penalty=1.0, early_stopping=True,
             log_every=100):
    """Generate a summary for every sample in ``loader`` and decode it to text.

    Args:
        tokenizer: provides ``decode`` for turning token ids back into text.
        model: model exposing ``eval`` and ``generate``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.
        max_length, num_beams, repetition_penalty, length_penalty,
        early_stopping: forwarded unchanged to ``model.generate``; the defaults
            are the values this notebook has always used.
        log_every: print a progress line every ``log_every`` batches; a falsy
            value disables logging.

    Returns:
        ``(predictions, actuals)``: two lists of strings of equal length, the
        generated summaries and the decoded reference summaries.
    """
    model.eval()
    predictions = []
    actuals = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for step, data in enumerate(loader):
            y = data['target_ids'].to(device, dtype=torch.long)
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=max_length,
                num_beams=num_beams,
                repetition_penalty=repetition_penalty,
                length_penalty=length_penalty,
                early_stopping=early_stopping,
            )
            preds = [
                tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for g in generated_ids
            ]
            target = [
                tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for t in y
            ]
            if log_every and step % log_every == 0:
                print(f'Completed {step}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
# Tokenizer used to encode the article and summary text
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Reproducibility: seed numpy and pytorch from the shared config value and
# request deterministic cuDNN behaviour
np.random.seed(config['SEED'])
torch.manual_seed(config['SEED'])
torch.backends.cudnn.deterministic = True

In [None]:
# Load the raw CSV and reduce it to the two columns used for fine-tuning.
df = pd.read_csv(path, encoding='latin-1')

# Keep only the summary / article columns under fixed names. Rows where either
# value is missing are dropped: the Dataset casts every cell with str(), so a
# NaN would otherwise be tokenized as the literal text "nan".
df = (
    df[[summary_col, fulltext_col]]
    .rename(columns={summary_col: 'summary', fulltext_col: 'fulltext'})
    .dropna(subset=['summary', 'fulltext'])
    .reset_index(drop=True)
)

# Prefix every article with the task name, the input format T5 uses for summarization.
df['fulltext'] = 'summarize: ' + df['fulltext']
# df = df.sample(frac=0.1)
print(df.head())

                                             summary                                           fulltext
0  The Administration of Union Territory Daman an...  summarize: The Daman and Diu administration on...
1  Malaika Arora slammed an Instagram user who tr...  summarize: From her special numbers to TV?appe...
2  The Indira Gandhi Institute of Medical Science...  summarize: The Indira Gandhi Institute of Medi...
3  Lashkar-e-Taiba's Kashmir commander Abu Dujana...  summarize: Lashkar-e-Taiba's Kashmir commander...
4  Hotels in Maharashtra will train their staff t...  summarize: Hotels in Mumbai and other Indian c...


In [None]:
# Split the frame into a training and a validation part, then wrap each part
# in a Dataset and a DataLoader.

# Fraction of rows sampled for training; the remaining rows are used for validation.
train_size = 0.8
train_dataset = df.sample(frac=train_size, random_state=config['SEED'])
# Everything not drawn for training becomes the validation split.
# Both frames are re-indexed from 0 afterwards.
val_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

# Tokenizing datasets: source text is encoded to MAX_LEN, summaries to SUMMARY_LEN
training_set = CustomDataset(train_dataset, tokenizer, config['MAX_LEN'], config['SUMMARY_LEN'])
val_set = CustomDataset(val_dataset, tokenizer, config['MAX_LEN'], config['SUMMARY_LEN'])

# DataLoader settings: training batches are shuffled, validation keeps its order
train_params = dict(batch_size=config['TRAIN_BATCH_SIZE'], shuffle=True, num_workers=0)
val_params = dict(batch_size=config['VALID_BATCH_SIZE'], shuffle=False, num_workers=0)

# Loaders consumed by the training and validation stages further down
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

FULL Dataset: (4514, 2)
TRAIN Dataset: (3611, 2)
TEST Dataset: (903, 2)


In [None]:
# Load the pretrained t5-base checkpoint and move it to the selected device.
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

# Adam over all model parameters, using the learning rate from the config.
optimizer = torch.optim.Adam(model.parameters(), lr=config['LEARNING_RATE'])

# Fine-tuning: one call to train() per epoch
print('Initiating Fine-Tuning for the model on our dataset')
for epoch in range(config['TRAIN_EPOCHS']):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

# Persist the fine-tuned weights
torch.save(model.state_dict(), './data/t5_summary_state_dict.pt')

# Generate summaries for the validation split and store them next to the
# reference summaries in predictions.csv
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
predictions, actuals = validate(tokenizer, model, device, val_loader)
final_df = pd.DataFrame({'Generated Text': predictions, 'Actual Text': actuals})
final_df.to_csv('./data/predictions.csv')
print('Output Files generated for review')
print(final_df.head())

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Initiating Fine-Tuning for the model on our dataset




Epoch: 0, Loss:  5.625776767730713
Epoch: 0, Loss:  2.256822109222412
Epoch: 0, Loss:  2.451709508895874
Epoch: 0, Loss:  1.7885276079177856
Epoch: 0, Loss:  2.0137434005737305
Epoch: 0, Loss:  1.7009224891662598
Epoch: 0, Loss:  1.5155736207962036
Epoch: 0, Loss:  1.5952929258346558
Epoch: 0, Loss:  1.7566555738449097
Epoch: 0, Loss:  1.631577730178833
Epoch: 0, Loss:  1.605650544166565
Epoch: 0, Loss:  1.6535598039627075
Epoch: 0, Loss:  1.7815755605697632
Epoch: 0, Loss:  1.117836833000183
Epoch: 0, Loss:  1.4934539794921875
Epoch: 0, Loss:  1.6505780220031738
Epoch: 0, Loss:  1.305673599243164
Epoch: 0, Loss:  1.7320071458816528
Epoch: 0, Loss:  2.0669302940368652
Epoch: 1, Loss:  1.7653343677520752
Epoch: 1, Loss:  2.2454097270965576
Epoch: 1, Loss:  1.557674765586853
Epoch: 1, Loss:  1.857395887374878
Epoch: 1, Loss:  1.2440931797027588
Epoch: 1, Loss:  1.5093859434127808
Epoch: 1, Loss:  2.156465530395508
Epoch: 1, Loss:  1.8368101119995117
Epoch: 1, Loss:  1.2443758249282837
Ep