**Check the GPU**

In [None]:
# Print the GPU(s) visible to this runtime; the trainer further down is configured with accelerator="gpu".
!nvidia-smi

**Select model and target language**

In [None]:
# Hugging Face checkpoint name; SummaryModel loads both the tokenizer and the
# LongT5 weights from it, and it is also used as an output sub-directory name.
MODEL_TYPE = "google/long-t5-tglobal-base"
# Language suffix used to build the batch keys ("summary_<lang>", "summary_<lang>_labels", ...)
# and the output directory name ("to_<lang>").
TARGET_LANG = "eng"
# Default maximum number of tokens the input text is padded/truncated to.
MAX_TOKEN_LEN = 4096
# Batch size handed to the data module and to every self.log(...) call in the model.
BATCH_SIZE = 4
# Passed to the trainer as limit_train_batches and embedded in the output
# directory names as "percentage_<value * 100>".
SIZE_OF_TRAINING_SET = 0.2 # 1.0 === 100%

**Install BLEURT**

In [None]:
# Install the BLEURT package from a fresh clone of its repository.
!git clone https://github.com/google-research/bleurt.git
!pip install -q ./bleurt

# Presumably a one-time download of the BLEURT-20 checkpoint into Drive (kept
# commented out); the model class later loads a checkpoint from this
# "bleurt-checkpoints" folder.
# !wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip .
# !unzip BLEURT-20.zip -d "/content/drive/MyDrive/Colab Notebooks/magistrska/models/bleurt-checkpoints"

# !rm BLEURT-20.zip

**Install packages from pip**

In [None]:
# Third-party packages imported by the cells below.
# NOTE(review): apart from ftfy, none of these are version-pinned, so a fresh
# runtime may resolve different versions than the ones this notebook was run with.
!pip install -q transformers
!pip install -qU pytorch-lightning
!pip install -q peft
!pip install -q torch
!pip install -q pandas
!pip install -q numpy
!pip install -q scikit-learn
!pip install -q rouge-score
!pip install -q spacy ftfy==4.4.3

**Import libraries**

In [None]:
import os
import json
import torch
import transformers

import numpy as np
import pandas as pd
import pytorch_lightning as pl

from bleurt import score
from rouge_score import rouge_scorer

from os.path import exists
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import AutoTokenizer, LongT5ForConditionalGeneration


**Set random seed**

In [None]:
# Fix the global random seed through PyTorch Lightning's helper so runs are repeatable.
pl.seed_everything(42)

**Load data to DataFrame**

In [None]:
DIR_PATH = "/content/drive/MyDrive/Colab Notebooks/magistrska/data"

def load_df_from_drive():
  """Read the dataset CSV stored under DIR_PATH.

  Returns:
    The contents of "dataframe-kas.csv" (decoded as UTF-8) as a pandas DataFrame.
  """
  csv_path = DIR_PATH + "/dataframe-kas.csv"
  return pd.read_csv(csv_path, encoding="utf-8")

In [None]:
# Read the whole dataset from Drive; the two prints only bracket the read so its progress is visible.
print("Loading data from drive...")
df = load_df_from_drive()
print("Finished.")

**Remove any rows that contain null values (there should be none) and reset the index**

In [None]:
# Drop every row that has a missing value, then renumber the index from 0
# (drop=True discards the old index instead of keeping it as a column).
df = df.dropna().reset_index(drop=True)

**Split into train, test and validation datasets (80 : 10 : 10)**

In [None]:
# 80% of the rows go to training; the remaining 20% are split in half into the
# test and validation sets. Both splits use a fixed random_state so they are
# reproducible.
train_df, test_and_validation_df = train_test_split(df, train_size=0.8, shuffle=True, random_state=42)
test_df, validation_df = train_test_split(test_and_validation_df, test_size=0.5, shuffle=True, random_state=42)

# The full frame is no longer needed once the three splits exist.
# Re-running this cell therefore requires re-running the loading cells first.
del df

# Keep the shapes as the LAST expression of the cell so the notebook actually
# displays them (a bare expression in the middle of a cell is discarded).
train_df.shape, test_df.shape, validation_df.shape

**Create class for dataset**

In [None]:
class SummaryDataset(Dataset):
  """Map-style dataset that tokenizes one (text, English abstract) pair per item.

  Every row of `data` must provide a "text" column (the model input) and an
  "abstract_eng" column (the reference summary).
  """

  # Value written over padding positions in the labels. Hugging Face seq2seq
  # models document -100 as the label value that is excluded from the loss.
  IGNORE_INDEX = -100

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer,
      text_max_token_len: int = MAX_TOKEN_LEN,
      summary_max_token_len: int = 256
  ):
    """Store the frame, the tokenizer and the two padding/truncation lengths.

    Args:
      data: Frame with "text" and "abstract_eng" columns.
      tokenizer: Callable tokenizer returning "input_ids" and "attention_mask" tensors.
      text_max_token_len: Length every tokenized input text is padded/truncated to.
      summary_max_token_len: Length every tokenized summary is padded/truncated to.
    """
    self.tokenizer = tokenizer
    self.data = data
    self.text_max_token_len = text_max_token_len
    self.summary_max_token_len = summary_max_token_len

    # Use the tokenizer's own padding id instead of assuming it is 0; fall back
    # to 0 (the previously hard-coded value) when the tokenizer does not expose one.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    self.pad_token_id = 0 if pad_token_id is None else pad_token_id

  def __len__(self):
    """Return the number of rows in the underlying frame."""
    return len(self.data)

  def _encode(self, text, max_length: int):
    """Tokenize `text`, padded and truncated to exactly `max_length` tokens."""
    return self.tokenizer(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )

  def __getitem__(self, index: int):
    """Tokenize the row at position `index`.

    Returns:
      A dict with the raw text and summary strings, the flattened input ids and
      attention mask of the text, the flattened summary labels (padding replaced
      by IGNORE_INDEX) and the flattened summary attention mask.
    """
    data_row = self.data.iloc[index]

    text_encoding = self._encode(data_row["text"], self.text_max_token_len)
    summary_eng_encoding = self._encode(data_row["abstract_eng"], self.summary_max_token_len)

    # Padding positions must not contribute to the loss.
    eng_labels = summary_eng_encoding["input_ids"]
    eng_labels[eng_labels == self.pad_token_id] = self.IGNORE_INDEX

    return {
        "text": data_row["text"],
        "text_input_ids": text_encoding["input_ids"].flatten(),
        "text_attention_mask": text_encoding["attention_mask"].flatten(),
        "summary_eng": data_row["abstract_eng"],
        "summary_eng_labels": eng_labels.flatten(),
        "summary_eng_attention_mask": summary_eng_encoding["attention_mask"].flatten()
    }

**Create summary data module**

In [None]:
class SummaryDataModule(pl.LightningDataModule):
  """Lightning data module that wraps the train/test/validation frames in SummaryDataset loaders."""

  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      validation_df: pd.DataFrame,
      tokenizer,
      batch_size: int = 8,
      text_max_token_len: int = MAX_TOKEN_LEN,
      summary_max_token_len: int = 256,
      num_workers: int = 11
  ):
    """Store the three splits and the settings shared by all loaders.

    Args:
      train_df: Training rows.
      test_df: Test rows.
      validation_df: Validation rows.
      tokenizer: Tokenizer handed to every SummaryDataset.
      batch_size: Batch size of every loader.
      text_max_token_len: Token length of the tokenized input texts.
      summary_max_token_len: Token length of the tokenized summaries.
      num_workers: Worker processes per loader (11 was previously hard-coded).
    """
    super().__init__()

    self.train_df = train_df
    self.test_df = test_df
    self.validation_df = validation_df

    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.text_max_token_len = text_max_token_len
    self.summary_max_token_len = summary_max_token_len
    self.num_workers = num_workers

  def _build_dataset(self, data: pd.DataFrame):
    """Wrap one split in a SummaryDataset using the shared tokenizer and lengths."""
    return SummaryDataset(
        data,
        self.tokenizer,
        self.text_max_token_len,
        self.summary_max_token_len
    )

  def _build_loader(self, dataset):
    """Create a non-shuffling DataLoader with the settings shared by all splits."""
    loader_kwargs = {
        "batch_size": self.batch_size,
        "shuffle": False,
        "pin_memory": True,
        "num_workers": self.num_workers,
    }
    # prefetch_factor and persistent_workers are only valid with worker
    # processes; DataLoader rejects them when num_workers is 0.
    if self.num_workers > 0:
      loader_kwargs["prefetch_factor"] = 2
      loader_kwargs["persistent_workers"] = True

    return DataLoader(dataset, **loader_kwargs)

  def setup(self, stage=None):
    """Build the three datasets; `stage` is accepted but not used."""
    self.train_dataset = self._build_dataset(self.train_df)
    self.test_dataset = self._build_dataset(self.test_df)
    self.validation_dataset = self._build_dataset(self.validation_df)

  def train_dataloader(self):
    """Return the training loader.

    NOTE(review): the training data is not shuffled. Together with the
    trainer's limit_train_batches this presumably keeps the training subset
    fixed across epochs -- confirm this is intended.
    """
    return self._build_loader(self.train_dataset)

  def test_dataloader(self):
    """Return the test loader."""
    return self._build_loader(self.test_dataset)

  def val_dataloader(self):
    """Return the validation loader."""
    return self._build_loader(self.validation_dataset)

**Create model class**

In [None]:
class SummaryModel(pl.LightningModule):
  """LongT5 summarizer wrapped with LoRA adapters, scored with BLEURT and ROUGE-L at test time."""

  def __init__(self, model_type, target_lang):
    """Load tokenizer, model and scorers and create the result containers.

    Args:
      model_type: Hugging Face checkpoint name passed to `from_pretrained`.
      target_lang: Language suffix used to pick the batch keys
        ("summary_<lang>", "summary_<lang>_labels", "summary_<lang>_attention_mask").
    """
    super().__init__()
    self.target_lang = target_lang
    self.model_type = model_type
    self.init_tokenizer(model_type)
    self.init_model(model_type)
    self.rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    # NOTE(review): this is a relative path, unlike the absolute
    # "/content/drive/..." paths used elsewhere in the notebook, so it only
    # resolves when the working directory is the parent of "drive".
    self.bleurt = score.BleurtScorer("drive/MyDrive/Colab Notebooks/magistrska/models/bleurt-checkpoints/BLEURT-20")
    self.validation_step_outputs = {"generated": [], "ground_truths": []}
    self.validation_step_losses = []
    # One averaged loss per validation epoch, appended in on_validation_epoch_end.
    self.validation_scores = {"loss": []}
    self.test_step_outputs = {"generated": [], "ground_truths": []}
    # Per-sample scores, filled in on_test_epoch_end.
    self.test_scores = {"bleurt": [], "rougeL_fmeasure": [], "rougeL_precision": [], "rougeL_recall": []}
    self.save_hyperparameters()


  def init_tokenizer(self, model_type):
    """Load the tokenizer that belongs to `model_type` into `self.tokenizer`."""
    self.tokenizer = AutoTokenizer.from_pretrained(model_type)


  def init_model(self, model_type):
    """Load the pretrained LongT5 model and wrap it with LoRA adapters as `self.model`."""
    mod = LongT5ForConditionalGeneration.from_pretrained(model_type, return_dict=True)

    # for name, module in mod.named_modules():
    #   print(name)

    lora_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,  # Type of task (sequence-to-sequence language modeling)
        r=4,  # Rank of the low-rank matrices
        lora_alpha=16,  # LoRA alpha
        target_modules=[
            "q", "v"
        ],  # Layers to apply LoRA (e.g., query and value projection)
        lora_dropout=0.1,  # Dropout rate for LoRA
    )
    self.model = get_peft_model(mod, lora_config)


  def generate_summaries(self, batch):
    """Generate and decode one summary per item of `batch`.

    Args:
      batch: Dict holding "text_input_ids" and "text_attention_mask".

    Returns:
      The decoded summaries with special tokens removed.
    """
    input_ids = batch["text_input_ids"]
    attention_mask = batch["text_attention_mask"]

    generated_ids = self.model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      max_length=256, # usually abstracts do not exceed 250 words
      num_beams=2,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True
    )

    predictions = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    return predictions


  def average_of_list(self, lst):
    """Return the arithmetic mean of `lst`.

    Raises:
      ZeroDivisionError: If `lst` is empty.
    """
    return sum(lst) / len(lst)


  def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
    """Run the wrapped model and return its `(loss, logits)`."""
    output = self.model(
      input_ids,
      attention_mask=attention_mask,
      labels=labels,
      decoder_attention_mask=decoder_attention_mask
    )

    return output.loss, output.logits


  def training_step(self, batch, batch_index):
    """Compute and log the training loss of one batch."""
    loss, _ = self._common_step(batch, batch_index)

    self.log("train_loss", loss, prog_bar=True, on_step=False, on_epoch=True, logger=True, batch_size=BATCH_SIZE)
    return loss


  def validation_step(self, batch, batch_index):
    """Compute and log the validation loss of one batch and remember it for the epoch average."""
    loss, _ = self._common_step(batch, batch_index)

    self.validation_step_losses.append(loss)

    self.log("validation_loss", loss, prog_bar=True, on_step=False, on_epoch=True, logger=True, batch_size=BATCH_SIZE)
    return loss


  def on_validation_epoch_end(self):
    """Append the mean of this epoch's batch losses to `validation_scores["loss"]`."""
    # Nothing to average when no validation batch ran; avoid dividing by zero.
    if self.validation_step_losses:
      self.validation_scores["loss"].append(self.average_of_list(self.validation_step_losses))
    self.validation_step_losses = []


  def on_test_epoch_start(self):
    """Drop outputs of an earlier test run so repeated `trainer.test` calls do not accumulate."""
    self.test_step_outputs = {"generated": [], "ground_truths": []}


  def test_step(self, batch, batch_index):
    """Compute the test loss of one batch and collect generated and reference summaries."""
    loss, _ = self._common_step(batch, batch_index)

    generated_predictions = self.generate_summaries(batch)

    self.test_step_outputs["generated"].extend(generated_predictions)
    self.test_step_outputs["ground_truths"].extend(batch[f"summary_{self.target_lang}"])

    self.log("test_loss", loss, prog_bar=True, on_step=False, on_epoch=True, logger=True, batch_size=BATCH_SIZE)
    return loss


  def on_test_epoch_end(self):
    """Score all collected test outputs with BLEURT and ROUGE-L.

    Per-sample scores are stored in `self.test_scores`; their averages are logged
    as "test_bleurt" and "test_rougeL_{fmeasure,precision,recall}".
    """
    generated = self.test_step_outputs["generated"]
    ground_truths = self.test_step_outputs["ground_truths"]

    bleurt_scores = self.bleurt.score(
        references=ground_truths,
        candidates=generated
      )
    avg_bleurt_score = self.average_of_list(bleurt_scores)

    rouge_scores_fmeasure = []
    rouge_scores_precision = []
    rouge_scores_recall = []
    for candidate, reference in zip(generated, ground_truths):
      # RougeScorer.score takes (target, prediction). Passing them the other way
      # round swaps the reported precision and recall.
      rouge_score = self.rouge.score(reference, candidate)["rougeL"]
      rouge_scores_fmeasure.append(rouge_score.fmeasure)
      rouge_scores_precision.append(rouge_score.precision)
      rouge_scores_recall.append(rouge_score.recall)

    avg_rouge_score_fmeasure = self.average_of_list(rouge_scores_fmeasure)
    avg_rouge_score_precision = self.average_of_list(rouge_scores_precision)
    avg_rouge_score_recall = self.average_of_list(rouge_scores_recall)

    self.test_scores["bleurt"] = bleurt_scores
    self.test_scores["rougeL_fmeasure"] = rouge_scores_fmeasure
    self.test_scores["rougeL_precision"] = rouge_scores_precision
    self.test_scores["rougeL_recall"] = rouge_scores_recall

    self.log("test_bleurt", avg_bleurt_score, prog_bar=True, on_step=False, on_epoch=True, logger=True, batch_size=BATCH_SIZE)
    self.log("test_rougeL_fmeasure", avg_rouge_score_fmeasure, prog_bar=True, on_step=False, on_epoch=True, logger=True, batch_size=BATCH_SIZE)
    self.log("test_rougeL_precision", avg_rouge_score_precision, prog_bar=True, on_step=False, on_epoch=True, logger=True, batch_size=BATCH_SIZE)
    self.log("test_rougeL_recall", avg_rouge_score_recall, prog_bar=True, on_step=False, on_epoch=True, logger=True, batch_size=BATCH_SIZE)


  def _common_step(self, batch, batch_index):
    """Run a forward pass on `batch` and return `(loss, logits)`.

    The labels and decoder attention mask are read from the batch keys that
    match `self.target_lang`; `batch_index` is accepted but not used.
    """
    input_ids = batch["text_input_ids"]
    attention_mask = batch["text_attention_mask"]
    labels = batch[f"summary_{self.target_lang}_labels"]
    labels_attention_mask = batch[f"summary_{self.target_lang}_attention_mask"]

    loss, outputs = self.forward(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=labels_attention_mask,
        labels=labels
    )

    return loss, outputs


  def configure_optimizers(self):
    """Return an AdamW optimizer over the module's parameters with a learning rate of 5e-5."""
    return AdamW(self.parameters(), lr=5e-5)


**Init tokenizer, model and data module**

In [None]:
# The model is built first because the data module reuses the tokenizer it loads.
summary_model = SummaryModel(model_type=MODEL_TYPE, target_lang=TARGET_LANG)
data_module = SummaryDataModule(
    train_df=train_df,
    test_df=test_df,
    validation_df=validation_df,
    tokenizer=summary_model.tokenizer,
    batch_size=BATCH_SIZE,
)

**Make sure that the path to where the model will be saved exists in drive**

In [None]:
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/magistrska/models"
RESULTS_PATH = "/content/drive/MyDrive/Colab Notebooks/magistrska/results"

# Create <base>/<model_type>/to_<lang>/percentage_<pct> under both base folders.
# os.makedirs is required here: the model name itself contains a "/"
# (e.g. "google/long-t5-tglobal-base"), so the directory has a parent that
# os.mkdir would not create. exist_ok=True keeps the cell safe to re-run.
for base_path in (MODEL_PATH, RESULTS_PATH):
  os.makedirs(
      f"{base_path}/{summary_model.model_type}/to_{summary_model.target_lang}/percentage_{SIZE_OF_TRAINING_SET * 100}",
      exist_ok=True
  )

**Init trainer and save best model which has the lowest validation loss**

In [None]:
# Stop training once the validation loss has failed to improve on its best
# value for 3 consecutive validation runs (min_delta=0.00: any decrease counts).
early_stop_callback = EarlyStopping(monitor="validation_loss", min_delta=0.00, patience=3, verbose=False, mode="min")

# Keep only the single checkpoint with the lowest validation loss, stored in the
# run directory below MODEL_PATH (same folder the directory-creation cell prepares).
checkpoint_callback = ModelCheckpoint(
    dirpath=f"{MODEL_PATH}/{summary_model.model_type}/to_{summary_model.target_lang}/percentage_{SIZE_OF_TRAINING_SET * 100}",
    filename='{epoch}-{validation_loss:.5f}',
    save_top_k=1,
    verbose=True,
    monitor="validation_loss",
    mode="min"
)

logger = TensorBoardLogger("lightning_logs", name=f"summary/{MODEL_TYPE}")

# No max_epochs is given, so the early-stopping callback is what is expected to end training.
trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback, TQDMProgressBar(refresh_rate=10), early_stop_callback],
    accelerator="gpu",
    devices=1,
    limit_train_batches=SIZE_OF_TRAINING_SET,  # fraction of the training batches used per epoch
    precision="bf16-mixed",
    gradient_clip_val=1.0,
    accumulate_grad_batches=16,
    num_sanity_val_steps=-1, # runs validation before training
    # max_epochs=1,
    check_val_every_n_epoch=1
)

torch.set_float32_matmul_precision("medium")

**If needed, reload the best previous model and continue training**

In [None]:
# BEST_MODEL_PATH = f"/{MODEL_PATH}/{MODEL_TYPE}/to_{TARGET_LANG}/percentage_{SIZE_OF_TRAINING_SET * 100}"

# summary_model = SummaryModel.load_from_checkpoint(f"{BEST_MODEL_PATH}/epoch=18-validation_loss=2.93664.ckpt")

**Train the model**

In [None]:
# Run fine-tuning; the data module supplies the training and validation loaders.
trainer.fit(summary_model, datamodule=data_module)

**Load best model from trainer (or drive)**

In [None]:
# Path of the best checkpoint recorded by the trainer's checkpoint callback, e.g.
# /content/drive/MyDrive/Colab Notebooks/magistrska/models/<model>/to_eng/percentage_20.0/epoch=18-validation_loss=2.93664.ckpt
BEST_MODEL_PATH = trainer.checkpoint_callback.best_model_path
# BEST_MODEL_PATH = f"/{MODEL_PATH}/{MODEL_TYPE}/to_{TARGET_LANG}/percentage_{SIZE_OF_TRAINING_SET * 100}/epoch=64-validation_loss=2.96666.ckpt"

# Without a recorded checkpoint (None or empty path) keep using the in-memory
# model; otherwise restore the best checkpoint into a new SummaryModel.
trained_model = (
    SummaryModel.load_from_checkpoint(BEST_MODEL_PATH)
    if BEST_MODEL_PATH
    else summary_model
)

**Save results for validation during training epochs**

In [None]:
# The per-epoch validation losses were accumulated on the instance that was
# actually trained (`summary_model`). `trained_model` may be a new instance
# restored from a checkpoint, whose `validation_scores` list starts out empty,
# so reading the scores from it would write an empty CSV.
# A new list is built (instead of converting in place) so the cell can be re-run.
validation_losses = [
    loss.item() if hasattr(loss, "item") else float(loss)
    for loss in summary_model.validation_scores["loss"]
]
df_with_logs = pd.DataFrame.from_dict({"loss": validation_losses})
df_with_logs.to_csv(f"{RESULTS_PATH}/{trained_model.model_type}/to_{trained_model.target_lang}/percentage_{SIZE_OF_TRAINING_SET * 100}/validation_results.csv", encoding="utf-8", index=False)

**Test the model with the trainer**

In [None]:
# Freeze parameters for testing
trained_model.freeze()

trainer.test(trained_model, data_module)

**Save test DF to drive with results**

In [None]:
trainer_metrics = trainer.callback_metrics
trained_model_generated_test_outputs = trained_model.test_step_outputs["generated"]
trained_model_test_scores = trained_model.test_scores

# Build the results in a separate frame. Overwriting data_module.test_df with a
# frame that lacks the "text" column made this cell fail on a second run and
# left the data module unable to rebuild its test dataset.
test_results_df = data_module.test_df.drop(columns=["text"])

# Every test row must have exactly one generated summary, otherwise the
# column assignments below would be misaligned.
assert len(trained_model_generated_test_outputs) == len(test_results_df), (
    f"expected {len(test_results_df)} generated summaries, "
    f"got {len(trained_model_generated_test_outputs)}"
)

METRIC_NAMES = ("bleurt", "rougeL_fmeasure", "rougeL_precision", "rougeL_recall")

test_results_df[f"generated_summary_{trained_model.target_lang}"] = trained_model_generated_test_outputs

# Per-sample scores, one value per test row.
for metric_name in METRIC_NAMES:
  test_results_df[f"testing_{metric_name}_scores"] = trained_model_test_scores[metric_name]

# Test-set averages logged by the trainer, repeated on every row.
for metric_name in METRIC_NAMES:
  test_results_df[f"testing_average_{metric_name}_score"] = trainer_metrics[f"test_{metric_name}"].item()

testing_filename_csv = f"{RESULTS_PATH}/{trained_model.model_type}/to_{trained_model.target_lang}/percentage_{SIZE_OF_TRAINING_SET * 100}/testing_results.csv"
test_results_df.to_csv(testing_filename_csv, encoding="utf-8", index=False)