<a href="https://colab.research.google.com/github/worachot-n/Text_summarization_T5/blob/main/1_5_5_Text_Summarization_Custom_Tech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at the mount point below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Set Working Directory

In [None]:
import os

# Work from the project folder on the mounted Drive so that relative paths
# used later (e.g. "./outputs/") resolve inside it.
PROJECT_DIR = '/content/drive/MyDrive/Kaggle'
os.chdir(PROJECT_DIR)

# Install Libraries

In [None]:
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]
!pip install datasets rouge-score nltk
!pip install tensorboard
!pip install bert-score

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 41.8 MB/s eta 0:00:01[K     |▌                               | 20 kB 18.9 MB/s eta 0:00:01[K     |▉                               | 30 kB 15.8 MB/s eta 0:00:01[K     |█                               | 40 kB 14.3 MB/s eta 0:00:01[K     |█▍                              | 51 kB 6.5 MB/s eta 0:00:01[K     |█▋                              | 61 kB 7.7 MB/s eta 0:00:01[K     |██                              | 71 kB 8.6 MB/s eta 0:00:01[K     |██▏                             | 81 kB 8.3 MB/s eta 0:00:01[K     |██▍                             | 92 kB 9.3 MB/s eta 0:00:01[K     |██▊                             | 102 kB 7.7 MB/s eta 0:00:01[K     |███                             | 112 kB 7.7 MB/s eta 0:00:01[K     |███▎                            | 122 kB 7.7 MB/s eta 0:00:01[K     |███▌      

# Import Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os
from datasets import load_dataset, load_metric

from rich.table import Column, Table
from rich import box
from rich.console import Console

# Importing the T5 modules 
from transformers import T5Tokenizer, T5ForConditionalGeneration

%matplotlib inline

# Import Dataset With Categories

In [None]:
# Load the fine-tuning and evaluation tables from Drive (in that order).
dataset_finetune, dataset_evaluate = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Kaggle/Dataset_Finetune_Mixed.csv',
        '/content/drive/MyDrive/Kaggle/Dataset_Evaluate_Mixed.csv',
    )
)

In [None]:
# T5 is a text-to-text model: the task prefix tells the model what to do with
# the *input*, so it belongs on the article text, not on the reference summary.
# Prefixing the summaries would train the model to emit "summarize: " itself.
TASK_PREFIX = "summarize: "


def _with_task_prefix(text_column, prefix=TASK_PREFIX):
  """Return ``text_column`` as strings, each starting with ``prefix`` exactly once.

  Rows that already carry the prefix are left alone, so re-running this cell
  does not stack "summarize: summarize: ..." onto the articles.
  """
  text = text_column.astype(str)
  return text.where(text.str.startswith(prefix), prefix + text)


dataset_finetune["articles"] = _with_task_prefix(dataset_finetune["articles"])
dataset_evaluate["articles"] = _with_task_prefix(dataset_evaluate["articles"])

In [None]:
dataset_finetune.head()

Unnamed: 0.1,Unnamed: 0,articles,summaries,categories
0,0,Wal-Mart to pay $14m in gun suit The world's l...,"summarize: ""Although Wal-Mart has suspended gu...",business
1,1,S Korean consumers spending again South Korea ...,summarize: The country's economy has suffered ...,business
2,2,Orange colour clash set for court A row over t...,summarize: Orange claims the new low-cost mobi...,business
3,3,Survey confirms property slowdown Government f...,summarize: House prices were 11.8% higher on t...,business
4,4,UK homes hit Â£3.3 trillion total The value of...,summarize: More than a third of the UK's priva...,business


In [None]:
dataset_evaluate.head()

Unnamed: 0.1,Unnamed: 0,articles,summaries,categories
0,0,French boss to leave EADS The French co-head o...,summarize: The company should now be able put ...,business
1,1,WorldCom bosses' $54m payout Ten former direct...,"summarize: ""None of the 10 former directors wa...",business
2,2,Ebbers 'aware' of WorldCom fraud Former WorldC...,summarize: Mr Sullivan is at the centre of the...,business
3,3,Brazil jobless rate hits new low Brazil's unem...,summarize: Brazil's unemployment rate fell to ...,business
4,4,Ban on forced retirement under 65 Employers wi...,summarize: Trade and Industry Secretary Patric...,business


# Read Categories From Dataset

In [None]:
# Sorted list of the distinct category labels present in the evaluation set.
category_values = dataset_evaluate['categories'].to_numpy()
categories = np.unique(category_values).tolist()
print(categories)

['Mixed', 'business', 'entertainment', 'politics', 'sport', 'tech']


In [None]:
# Split the evaluation frame into one sub-frame per category label.
dataset_dict = {
    label: dataset_evaluate[dataset_evaluate['categories'] == label]
    for label in categories
}

# Console Logger

In [None]:
# Rich console; record=True keeps a transcript that can be written to disk later.
console = Console(record=True)

# Table that collects one row per logged training step.
_logger_columns = [
    Column(heading, justify="center") for heading in ("Epoch", "Steps", "Loss")
]
training_logger = Table(
    *_logger_columns,
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

# Use GPU

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
from torch import cuda

if cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
class YourDataSetClass(Dataset):
  """
  Torch ``Dataset`` that tokenizes (source, target) text pairs from a dataframe
  so they can be batched by a ``DataLoader`` for fine-tuning.

  Args:
    dataframe: pandas DataFrame holding the text columns.
    tokenizer: object exposing ``batch_encode_plus`` (e.g. a T5 tokenizer).
    source_len: number of tokens every encoded source text is padded/truncated to.
    target_len: number of tokens every encoded target text is padded/truncated to.
    source_text: name of the column containing the input text.
    target_text: name of the column containing the target text.
  """

  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    """Number of (source, target) pairs."""
    return len(self.target_text)

  def _encode(self, text, max_length):
    """Tokenize one string, padded/truncated to exactly ``max_length`` tokens."""
    # `padding="max_length"` alone is sufficient; the deprecated
    # `pad_to_max_length=True` flag is intentionally not passed.
    return self.tokenizer.batch_encode_plus(
        [text],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors='pt',
    )

  def __getitem__(self, index):
    """
    Return the encoded pair at *position* ``index``.

    Returns:
      dict with ``source_ids``, ``source_mask``, ``target_ids`` and
      ``target_ids_y`` (the same tensor as ``target_ids``), all ``torch.long``.
    """
    # Positional access (.iloc) so the dataset also works when the dataframe
    # index is not 0..n-1 (e.g. a filtered frame that was never reset).
    source_text = str(self.source_text.iloc[index])
    target_text = str(self.target_text.iloc[index])

    # Collapse every run of whitespace (newlines, tabs, ...) into one space.
    source_text = ' '.join(source_text.split())
    target_text = ' '.join(target_text.split())

    source = self._encode(source_text, self.source_len)
    target = self._encode(target_text, self.summ_len)

    # Drop only the leading batch dimension added by batch_encode_plus;
    # squeeze(0) keeps a length-1 sequence as a 1-D tensor.
    source_ids = source['input_ids'].squeeze(0)
    source_mask = source['attention_mask'].squeeze(0)
    target_ids = target['input_ids'].squeeze(0)

    return {
        'source_ids': source_ids.to(dtype=torch.long),
        'source_mask': source_mask.to(dtype=torch.long),
        'target_ids': target_ids.to(dtype=torch.long),
        'target_ids_y': target_ids.to(dtype=torch.long),
    }

# Import Tensorboard

In [None]:
import torch
import torchvision
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms

In [None]:
# TensorBoard writer used by train() to log the training loss.
# No log directory is passed here, so SummaryWriter uses its default location
# (presumably under ./runs, which is where the TensorBoard cell points — confirm).
writer = SummaryWriter()

# Train & Validate Function

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer, finetune):
  """
  Run one training epoch over ``loader``.

  Every 10th step the current loss is added to the rich ``training_logger``
  table and the mean loss since the previous log point is written to
  TensorBoard under the tag ``training loss {finetune}``.

  Args:
    epoch: zero-based epoch number (used for logging and the global step).
    tokenizer: tokenizer providing ``pad_token_id``.
    model: seq2seq model that returns the loss first when ``labels`` is given.
    device: device the batch tensors are moved to.
    loader: iterable of batches with ``source_ids``, ``source_mask`` and
      ``target_ids`` tensors.
    optimizer: optimizer stepping ``model``'s parameters.
    finetune: label of the fine-tuning category (used in the TensorBoard tag).
  """
  log_every = 10
  running_loss = 0.0
  steps_since_log = 0
  model.train()
  for step, data in enumerate(loader):
    ids = data['source_ids'].to(device, dtype=torch.long)
    mask = data['source_mask'].to(device, dtype=torch.long)

    # Pass the full target sequence as `labels` and let the model derive the
    # decoder inputs itself (shift right + prepend the decoder start token).
    # Supplying a hand-shifted `decoder_input_ids` meant the decoder was never
    # trained from the start token that `generate()` begins with.
    # Padding positions are set to -100 so the loss ignores them; clone first
    # so the batch tensor coming from the loader is not modified.
    labels = data['target_ids'].to(device, dtype=torch.long).clone()
    labels[labels == tokenizer.pad_token_id] = -100

    outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
    loss = outputs[0]

    if step % log_every == 0:
      # Log the scalar value, not the tensor repr (device/grad_fn noise).
      training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
      console.print(training_logger)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    steps_since_log += 1
    if step % log_every == 0:
      # Average over the steps actually accumulated (1 at step 0, 10 afterwards)
      # instead of always dividing by 10.
      writer.add_scalar(f'training loss {finetune}', running_loss / steps_since_log, epoch * len(loader) + step)
      running_loss = 0.0
      steps_since_log = 0
  # Flush rather than close: the same writer is reused by the next epoch.
  writer.flush()

In [None]:
def validate(epoch, tokenizer, model, device, loader, max_lengths):
  """
  Generate summaries for every batch in ``loader`` and decode them to text.

  Args:
    epoch: accepted for interface compatibility; not used here.
    tokenizer: tokenizer used to decode token ids back to strings.
    model: model exposing ``eval()`` and ``generate()``.
    device: device the batch tensors are moved to.
    loader: iterable of batches with ``source_ids``, ``source_mask`` and
      ``target_ids`` tensors.
    max_lengths: ``max_length`` passed to ``model.generate``.

  Returns:
    Tuple ``(predictions, actuals, texts)`` of decoded strings: generated
    output, reference target and source input, in loader order.
  """

  def _decode_rows(token_rows):
    # Decode each row of ids, dropping special tokens and tidying spaces.
    return [
        tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for row in token_rows
    ]

  texts, predictions, actuals = [], [], []
  model.eval()
  with torch.no_grad():
    for batch in loader:
      target_ids = batch['target_ids'].to(device, dtype=torch.long)
      source_ids = batch['source_ids'].to(device, dtype=torch.long)
      source_mask = batch['source_mask'].to(device, dtype=torch.long)

      generated_ids = model.generate(
          input_ids=source_ids,
          attention_mask=source_mask,
          max_length=max_lengths,
          num_beams=2,
          repetition_penalty=2.5,
          length_penalty=1.0,
          early_stopping=True,
      )

      texts += _decode_rows(source_ids)
      predictions += _decode_rows(generated_ids)
      actuals += _decode_rows(target_ids)
  return predictions, actuals, texts

# Set Model Parameters

In [None]:
# Per-category cap (in tokens) on the length of the target summary.
_max_target_lengths = dict(
    Mixed=189,
    business=181,
    entertainment=171,
    politics=217,
    sport=172,
    tech=211,
)

# Hyper-parameters and settings consumed by T5Trainer.
model_params = dict(
    MODEL="t5-base",                              # model type: t5-base / t5-large
    TRAIN_BATCH_SIZE=4,                           # training batch size
    VALID_BATCH_SIZE=4,                           # validation batch size
    TRAIN_EPOCHS=50,                              # number of training epochs
    VAL_EPOCHS=1,                                 # number of validation epochs
    LEARNING_RATE=1e-4,                           # learning rate
    MAX_SOURCE_TEXT_LENGTH=512,                   # max length of source text (article)
    MAX_TARGET_TEXT_LENGTH=_max_target_lengths,   # max length of target text (summary), per category
    SEED=42,                                      # seed for reproducibility
)

In [None]:
def T5Trainer(dataframe, valid, finetune, categories, source_text, target_text, model_params, output_dir="./outputs/" ):
  """
  Fine-tune T5 on one category and evaluate the result on every category.

  Args:
    dataframe: DataFrame with a ``categories`` column plus the ``source_text``
      and ``target_text`` columns; only rows whose category equals
      ``finetune`` are used for training.
    valid: mapping from category name to the evaluation DataFrame for that
      category; must contain every entry of ``categories``.
    finetune: category to train on; also the key used to look up the target
      length in ``model_params["MAX_TARGET_TEXT_LENGTH"]``.
    categories: list of category names to evaluate on.
    source_text: name of the input-text column.
    target_text: name of the reference-summary column.
    model_params: configuration dict (model name, batch sizes, epochs,
      learning rate, max lengths, seed).
    output_dir: directory receiving the saved model, one predictions CSV per
      evaluated category and the console log.

  Raises:
    ValueError: if ``dataframe`` has no rows for the ``finetune`` category.
    KeyError: if ``valid`` lacks one of ``categories``.
  """

  # Set random seeds and deterministic pytorch for reproducibility
  seed = model_params["SEED"]
  torch.manual_seed(seed)  # pytorch random seed
  np.random.seed(seed)  # numpy random seed
  torch.backends.cudnn.deterministic = True

  # Make sure the output location exists before any expensive work starts.
  os.makedirs(output_dir, exist_ok=True)

  # logging
  console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # Tokenizer for encoding the text
  tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

  # Load the pretrained conditional-generation model and move it to the
  # selected device.
  model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
  model = model.to(device)

  # logging
  console.log(f"[Data]: Reading data...\n")

  # Select the training rows from the frame that was passed in (the mask must
  # be built from `dataframe` itself, not from a notebook-level global).
  dataframe_select = dataframe.loc[dataframe['categories'] == finetune]
  if dataframe_select.empty:
    raise ValueError(f"No rows with categories == {finetune!r} in the training dataframe")
  max_length = int(model_params["MAX_TARGET_TEXT_LENGTH"][finetune])

  # Creation of Dataset and Dataloader
  train_size = 1  # fraction of the selected rows to use (1 = all, shuffled)
  train_dataset = dataframe_select.sample(frac=train_size, random_state=seed).reset_index(drop=True)

  # Use the per-category evaluation frames supplied by the caller instead of
  # rebuilding (and overwriting) notebook-level globals.
  val_dataset_dict = {
      category: valid[category].sample(frac=train_size).reset_index(drop=True)
      for category in categories
  }

  console.print(f"FULL Dataset: {dataframe_select.shape}")
  console.print(f"TRAIN Dataset {finetune}: {train_dataset.shape}")
  for category in categories:
    console.print(f"TEST Dataset {category}: {val_dataset_dict[category].shape}")
  console.print("\n")

  console.print(f'[MAX_TARGET_TEXT_LENGTH ({finetune})] = {max_length}...\n')

  # Wrap the frames in tokenizing datasets. Note that every evaluation set uses
  # the target length of the fine-tuning category.
  max_source_length = model_params["MAX_SOURCE_TEXT_LENGTH"]
  training_set = YourDataSetClass(train_dataset, tokenizer, max_source_length, max_length, source_text, target_text)
  val_set_dict = {
      category: YourDataSetClass(val_dataset_dict[category], tokenizer, max_source_length, max_length, source_text, target_text)
      for category in categories
  }

  # Defining the parameters for creation of dataloaders
  train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      'shuffle': True,
      'num_workers': 0
      }

  val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }

  # Dataloaders for the training stage and for each evaluation category.
  training_loader = DataLoader(training_set, **train_params)
  val_loader_dict = {
      category: DataLoader(val_set_dict[category], **val_params)
      for category in categories
  }

  # Optimizer used to tune the weights of the network during training.
  optimizer = torch.optim.Adam(params=model.parameters(), lr=model_params["LEARNING_RATE"])

  # Training loop
  console.log(f'[Initiating Fine Tuning]...\n')

  epochs = model_params["TRAIN_EPOCHS"]
  for epoch in range(epochs):
    train(epoch, tokenizer, model, device, training_loader, optimizer, finetune)
  console.log(f"[Train {epochs} epochs with {finetune} is Completed.]\n")

  console.log(f"[Saving Model]...\n")
  # Saving the model after training
  path = os.path.join(output_dir, f"model_files_{finetune}")
  model.save_pretrained(path)
  tokenizer.save_pretrained(path)

  # Evaluating test dataset
  console.log(f"[Initiating Validation]...\n")

  for category in categories:
    for epoch in range(model_params["VAL_EPOCHS"]):
      predictions, actuals, texts = validate(epoch, tokenizer, model, device, val_loader_dict[category], max_length)
      # Remove the task prefix from the references if they carry it.
      actuals_clean = [s.replace("summarize: ", "") for s in actuals]
      final_df = pd.DataFrame({'Full Text':texts, 'Generated Text':predictions, 'Actual Text':actuals_clean, 'Finetune':finetune, 'Category':category})
      final_df.to_csv(os.path.join(output_dir,f'predictions_{finetune}_{category}.csv'), index=False)
    console.log(f"[Validation with {category} is Completed.]\n")

  # Persist everything the recording console has printed so far.
  console.save_text(os.path.join(output_dir,f'logs_{finetune}_finetune.txt'))

# Start Training & Evaluation

In [None]:
# Fine-tune on the "tech" articles, then evaluate on every category.
T5Trainer(
    dataframe=dataset_finetune,
    valid=dataset_dict,
    finetune='tech',
    categories=categories,
    source_text="articles",
    target_text="summaries",
    model_params=model_params,
    output_dir="./outputs/",
)

# Show Train Loss on Tensorboard

In [None]:
# `%reload_ext` is used instead of `%load_ext` so that re-running this cell
# reloads the TensorBoard extension; the dashboard then reads the `runs` directory.
%reload_ext tensorboard
%tensorboard --logdir runs

In [None]:
# import torch
# torch.cuda.empty_cache()