# Fine Tuning T5-small Model for Pubmed Dataset

##1. Install Transformers and Datasets from Hugging Face

In [1]:
# Transformers installation
! pip install -q transformers[torch] datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

##2. Load Dataset from Hugging Face

In [2]:
from datasets import load_dataset

dataset = load_dataset('ccdv/pubmed-summarization')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/779M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [3]:
# Examine the structure of dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 119924
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6633
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6658
    })
})

In [4]:
# Subset the training, validation and test sets data
# Randomly pick 1000 rows for training, and 125 rows for both validation and testing
dataset["train"] = dataset["train"].shuffle(seed=42).select(range(1000))
dataset["validation"] = dataset["validation"].shuffle(seed=42).select(range(125))
dataset["test"] = dataset["test"].shuffle(seed=42).select(range(125))

In [5]:
# Check the structure of dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 125
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 125
    })
})

##3. Preprocessing

In [6]:
# Define the tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")



tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [7]:
dataset['test']

Dataset({
    features: ['article', 'abstract'],
    num_rows: 125
})

In [8]:
def preprocess_function(examples):
    # Attach the prefix "summarize: " to instruct the T5 model on the task it needs to perform
    inputs = ["summarize: " + doc for doc in examples['article']]

    # Tokenize the input texts to convert them into a format that can be fed into the T5 model.
    # Sets a maximum token length of 1024, and truncates any text longer than this limit.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenizes the 'summary' field of the input examples to prepare the target labels for the summarization task.
    # Sets a maximum token length of 256, and truncates any text longer than this limit.
    labels = tokenizer(text_target=examples["abstract"], max_length=256, truncation=True)

    # Assigns the tokenized labels to the 'labels' field of model_inputs.
    # The 'labels' field is used during training to calculate the loss and guide model learning.
    model_inputs["labels"] = labels["input_ids"]

    # Returns the prepared inputs and labels as a single dictionary, ready for training.
    return model_inputs

In [9]:
# apply the preprocessing function over the entire dataset and set batched to True to process multiple elements of the dataset at once.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

##4. Create Batches Using Data Collator

In [10]:
# Create a batch of examples
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

##5. Define Evaluation Metrics for Training

In [11]:
! pip install -q evaluate rouge_score

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [12]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [13]:
import numpy as np

def compute_metrics(eval_pred):
    # Unpacks the evaluation predictions tuple into predictions and labels.
    predictions, labels = eval_pred

    # Decodes the tokenized predictions back to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replaces any -100 values in labels with the tokenizer's pad_token_id.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decodes the tokenized labels back to text, skipping any special tokens (e.g., padding tokens).
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE metric between the decoded predictions and decoded labels.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Calculates the length of each prediction by counting the non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    # Computes the mean length of the predictions and adds it to the result dictionary under the key "gen_len".
    result["gen_len"] = np.mean(prediction_lens)

    # Rounds each value in the result dictionary to 4 decimal places for cleaner output, and returns the result.
    return {k: round(v, 4) for k, v in result.items()}


##6. Train

In [14]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [15]:
# Load the T5-small model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [16]:
# Define training parameters
training_args = Seq2SeqTrainingArguments(
    output_dir="my_fine_tuned_t5_small_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16 = True,
)

In [17]:
# Pass the arguments to Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [18]:
# Finetune the model
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.974902,0.0946,0.0326,0.0772,0.0772,19.0
2,No log,2.826103,0.0908,0.0293,0.0751,0.0752,19.0
3,No log,2.783303,0.0905,0.0303,0.0746,0.0745,19.0
4,No log,2.772537,0.0908,0.0311,0.075,0.0748,19.0




TrainOutput(global_step=252, training_loss=3.386540004185268, metrics={'train_runtime': 281.3909, 'train_samples_per_second': 14.215, 'train_steps_per_second': 0.896, 'total_flos': 1082734411776000.0, 'train_loss': 3.386540004185268, 'epoch': 4.0})

##7. Save the Model

In [19]:
trainer.save_model("my_fine_tuned_t5_small_model")

##8. Use the Fine-Tuned Model to Summarize Text

In [20]:
from transformers import pipeline

summarizer = pipeline("summarization", model = "my_fine_tuned_t5_small_model")

In [21]:
# Split the test data to 1. article to summarise (texts) 2. reference summary (target)
texts = dataset['test']['article']
target = dataset['test']['abstract']

In [23]:
# Import tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("my_fine_tuned_t5_small_model")

In [24]:
# Import model
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("my_fine_tuned_t5_small_model")

In [25]:
import pandas as pd
# Initialize an empty list to store the summaries
summaries = []

# Loop through each text in the column
for text in texts:
    # Tokenize the text
    tokens_input = tokenizer.encode("summarize: " + text, return_tensors='pt', max_length=512, truncation=True)

    # Generate summary
    summary_ids = model.generate(tokens_input, min_length=30, max_length=512)

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Append the summary to the list of summaries
    summaries.append(summary)


In [26]:
# Store results
results = {
    'abstract': texts,
    'model summary': summaries
}
results = pd.DataFrame(results)

In [27]:
results.to_csv('pubmed_summary.csv', index = False)

In [28]:
results.to_excel('pubmed_summary.xlsx', index = False)

In [29]:
results.head()

Unnamed: 0,abstract,model summary
0,the birth of a premature infant has long been ...,the birth of a premature infant has long been ...
1,"de ridder , vanneste , and focquaert address c...","de ridder, vanneste, and focquaert address con..."
2,it is not easy to define a good health care ...,the health care system in albania has undergon...
3,the prosite database uses two kinds of signatu...,the prosite database uses two types of signatu...
4,although about two - thirds of patients with e...,despite the introduction of new aeds with a be...


##9. Use Metrics to Evaluate Results

In [30]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13


In [32]:
predictions = list(results['model summary'])
references = target

In [33]:
# Compute ROUGE score
rouge = evaluate.load("rouge")
results_rouge = rouge.compute(predictions = predictions, references = references)

print(results_rouge)

{'rouge1': 0.23811848711266545, 'rouge2': 0.07548098970548478, 'rougeL': 0.15559382659774634, 'rougeLsum': 0.20485265446747591}


In [34]:
# Compute Bertscore
bertscore = evaluate.load("bertscore")
results_bert = bertscore.compute(predictions = predictions, references = references, lang = "en")
bert_precision = results_bert['precision']
bert_recall = results_bert['recall']
bert_f1 = results_bert['f1']
results_bert_mean = {'BERTScore Precision': round(sum(bert_precision)/len(bert_precision),4), 'BERTScore Recall': round(sum(bert_recall)/len(bert_recall),4), 'BERTScore F1': round(sum(bert_f1)/len(bert_f1),4)}
print(results_bert_mean)

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'BERTScore Precision': 0.8338, 'BERTScore Recall': 0.7984, 'BERTScore F1': 0.8153}


In [35]:
# Modify the format of references
references_bleu = [[reference] for reference in references]

# Compute BLEU score
bleu = evaluate.load("bleu")
results_bleu = bleu.compute(predictions=predictions, references=references_bleu)

print(results_bleu['bleu'])

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

0.043039215698005714


In [36]:
results_df = pd.DataFrame({
    'Model':'T5',
    'ROUGE-1': [round(results_rouge['rouge1'],4)],
    'ROUGE-2': [round(results_rouge['rouge2'],4)],
    'ROUGE-L': [round(results_rouge['rougeL'],4)],
    'ROUGE-Ls': [round(results_rouge['rougeLsum'],4)],
    'BERT-Precision': [results_bert_mean['BERTScore Precision']],
    'BERT-Recall': [results_bert_mean['BERTScore Recall']],
    'BERT-F1': [results_bert_mean['BERTScore F1']],
    'BLEU Score': [round(results_bleu['bleu'],4)],
})
results_df.set_index('Model', inplace = True)

In [37]:
results_df

Unnamed: 0_level_0,ROUGE-1,ROUGE-2,ROUGE-L,ROUGE-Ls,BERT-Precision,BERT-Recall,BERT-F1,BLEU Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
T5,0.2381,0.0755,0.1556,0.2049,0.8338,0.7984,0.8153,0.043
