# Academic Journal Summarization Modeling - T5 Model Baseline FineTuning

## Setup

In [None]:
!pip install -q sentencepiece

In [None]:
!pip install -q transformers

In [None]:
!pip install -q evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install -q rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
# Standard libraries
import os
import pandas as pd
import numpy as np

# Dataset
from datasets import Dataset

# For counting and checkpoints
import pytz
from datetime import datetime

# Modeling
import tensorflow as tf
import nltk
from nltk.tokenize import sent_tokenize

from sklearn.model_selection import train_test_split

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

import evaluate

In [None]:
# Colab
# Mount Google Drive at /content/drive; later cells read the dataset from under this
# mount point. force_remount=True requests a fresh mount even if one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Data

In [None]:
# Read the cleaned articles CSV from the mounted Drive folder.
# Later cells use its 'full_text' and 'abstract' columns.
ARTICLES_CSV_PATH = '/content/drive/MyDrive/DATASCI266/envisci_articles_clean.csv'
df = pd.read_csv(ARTICLES_CSV_PATH)

In [None]:
# Split data
# Hold out 20% of the rows for test, then take 25% of the remaining 80% as
# validation, which gives roughly a 60/20/20 train/val/test split.
SPLIT_SEED = 1
train_val, test = train_test_split(df, random_state=SPLIT_SEED, test_size=0.2)
train, val = train_test_split(train_val, random_state=SPLIT_SEED, test_size=0.25)

In [None]:
model_name = 't5_finetune_no_prompt'

# Output paths - Baseline
# Both directories are anchored on the absolute Drive mount point (the same root
# the dataset is read from). A relative './drive/...' path only resolves to Drive
# while the working directory is /content; otherwise checkpoints would be written
# to the local, non-persistent disk.
DRIVE_PROJECT_DIR = '/content/drive/MyDrive/DATASCI266'
LOCAL_OUTPUT_CHECKPOINT_PATH = f'{DRIVE_PROJECT_DIR}/model_checkpoints_t5/{model_name}'
LOCAL_OUTPUT_MODEL_PATH = f'{DRIVE_PROJECT_DIR}/model_finetuned_t5_models/{model_name}'

In [None]:
# Load the pretrained tokenizer and seq2seq model from one shared checkpoint id
PRETRAINED_CHECKPOINT = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples, max_input_length=512, max_target_length=512):
  """Tokenize one article/abstract record into seq2seq training features.

  Args:
    examples: Mapping-like record (here, a DataFrame row) providing the
      'full_text' (source) and 'abstract' (target) entries.
    max_input_length: Token cap for the source text; longer inputs are truncated.
    max_target_length: Token cap for the target abstract; longer targets are truncated.

  Returns:
    The tokenizer output for 'full_text', with an extra 'labels' entry holding
    the token ids of 'abstract'.
  """
  model_inputs = tokenizer(examples['full_text'], max_length=max_input_length, truncation=True)
  labels = tokenizer(examples['abstract'], max_length=max_target_length, truncation=True)
  # Only the target token ids are needed as labels
  model_inputs['labels'] = labels['input_ids']
  return model_inputs

# Tokenize each row of the train and validation frames (one call per row)
tokenized_datasets_train = train.apply(preprocess_function, axis=1)
tokenized_datasets_val = val.apply(preprocess_function, axis=1)

In [None]:
# Turn the per-row tokenizer outputs into Hugging Face Datasets for the trainer.
# list(...) replaces Series.to_list() so this cell can be re-run: after the first
# run these names already hold plain lists, which have no .to_list() method.
tokenized_datasets_train = list(tokenized_datasets_train)
dataset_train = Dataset.from_list(tokenized_datasets_train)

tokenized_datasets_val = list(tokenized_datasets_val)
dataset_val = Dataset.from_list(tokenized_datasets_val)

In [None]:
# https://huggingface.co/learn/nlp-course/chapter7/5?fw=pt#summarization

import numpy as np
import evaluate
import nltk
# sent_tokenize needs the Punkt sentence-splitting data. Recent NLTK releases look
# for the 'punkt_tab' resource instead of 'punkt', so fetch both so that the
# metric function works regardless of which NLTK version the runtime provides.
nltk.download('punkt')
nltk.download('punkt_tab')

# ROUGE scorer used by compute_metrics
rouge_score = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores for generated summaries against reference abstracts.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. ``predictions``
            may also be a tuple whose first element holds the generated ids.

    Returns:
        dict mapping each ROUGE metric name to its score rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs alongside the generated ids; keep only the ids
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Replace -100 padding in predictions and labels, since -100 can't be decoded
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode generated and reference summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
from transformers import Seq2SeqTrainer
# Collator that batches the tokenized examples (inputs and labels) for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Fine Tuning
# Params
learning_rate = 0.00005
batch_size = 4
per_device_eval_batch = 4
weight_decay = 0.01
save_total_limit = 3
num_epochs = 5
# Length cap for summaries generated during evaluation. Without an explicit value,
# generation falls back to the model's generation config (a 20-token default unless
# the checkpoint overrides it), so ROUGE would be scored on heavily truncated
# summaries. 512 matches the token cap applied to the reference abstracts.
generation_max_length = 512

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir=LOCAL_OUTPUT_CHECKPOINT_PATH,
   evaluation_strategy="epoch",
   learning_rate=learning_rate,
   per_device_train_batch_size=batch_size,
   per_device_eval_batch_size=per_device_eval_batch,
   weight_decay=weight_decay,
   save_total_limit=save_total_limit,
   num_train_epochs=num_epochs,
   # Evaluate by generating summaries so compute_metrics receives token ids
   predict_with_generate=True,
   generation_max_length=generation_max_length,
   push_to_hub=False
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Fine-tune, then write the final model to the Drive output directory
trainer.train()
trainer.save_model(LOCAL_OUTPUT_MODEL_PATH)



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,3.0536,2.758755,0.0886,0.0276,0.0719,0.0772
2,2.9639,2.740196,0.0891,0.0284,0.0725,0.0776
3,2.878,2.726604,0.0891,0.0288,0.0725,0.078
4,2.8836,2.721205,0.0883,0.0285,0.0718,0.077
5,2.8358,2.720796,0.0875,0.0284,0.0712,0.0764


