# Fine Tuning T5-base Model for Pubmed Dataset

##1. Install Transformers and Datasets from Hugging Face

In [1]:
# Install transformer
! pip install -q transformers[torch] datasets

##2. Load Dataset from Hugging Face

In [2]:
# Import data
from datasets import load_dataset

dataset = load_dataset('ccdv/pubmed-summarization')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/779M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [3]:
# Examine the structure of dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 119924
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6633
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6658
    })
})

In [4]:
# Subset the training, validation and test sets data
# Randomly pick 1000 rows for training, and 125 rows for both validation and testing
dataset["train"] = dataset["train"].shuffle(seed=42).select(range(1000))
dataset["validation"] = dataset["validation"].shuffle(seed=42).select(range(125))
dataset["test"] = dataset["test"].shuffle(seed=42).select(range(125))

In [5]:
# Check the structure of dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 125
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 125
    })
})

##3. Preprocessing

In [6]:
# Define the tokenizer: t5-base as tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-base")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    # Attach the prefix "summarize: " to instruct the T5 model on the task it needs to perform
    inputs = ["summarize: " + doc for doc in examples['article']]

    # Tokenise the input texts 
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenise the 'abstract' field of the inputs to prepare target labels
    labels = tokenizer(text_target=examples["abstract"], max_length=256, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
# Apply preprocessing to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

##4. Create Batches Using Data Collator

In [9]:
# Create a batch of examples
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-base")

2024-05-09 14:30:53.780767: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-09 14:30:53.780944: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-09 14:30:53.918882: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


##5. Define Evaluation Metrics for Training

In [10]:
! pip install -q evaluate rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [12]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Decode the tokenised predictions into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decodes the tokenised labels into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE scores between decoded predictions and decoded target 
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}


##6. Train

In [13]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [14]:
# Load the T5-small model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [15]:
# Define training parameters
training_args = Seq2SeqTrainingArguments(
    output_dir="fine_tuned_t5_small_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16 = True,
)

In [16]:
# Pass the arguments to Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [17]:
# Finetune the model
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.970016,0.095,0.0324,0.0783,0.0782,19.0
2,No log,2.827505,0.0921,0.0306,0.0764,0.0764,19.0
3,No log,2.784288,0.0903,0.0307,0.0751,0.0749,19.0
4,No log,2.773257,0.0905,0.0306,0.0751,0.0751,19.0




TrainOutput(global_step=252, training_loss=3.378932650127108, metrics={'train_runtime': 319.8504, 'train_samples_per_second': 12.506, 'train_steps_per_second': 0.788, 'total_flos': 1082734411776000.0, 'train_loss': 3.378932650127108, 'epoch': 4.0})

##7. Save the Model

In [18]:
trainer.save_model("fine_tuned_t5_small_model")

##8. Use the Fine-Tuned Model to Summarize Text

In [19]:
# Split the test data to 1. article to summarise (texts) 2. reference summary (target)
texts = dataset['test']['article']
target = dataset['test']['abstract']

In [20]:
type(target)

list

In [21]:
# Import tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("fine_tuned_t5_small_model")

In [22]:
# Import model
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("fine_tuned_t5_small_model")

In [23]:
import pandas as pd
# Initialise an empty list to store the summaries
summaries = []

# Loop through each text in the column
for text in texts:
    # Tokenise the text
    tokens_input = tokenizer.encode("summarize: " + text, return_tensors='pt', max_length=512, truncation=True)

    # Generate summary
    summary_ids = model.generate(tokens_input, min_length=30, max_length=512)

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Append the summary to the list of summaries
    summaries.append(summary)


In [24]:
# Store results
results = {
    'article': texts,
    'abstract': target,
    'model summary': summaries
}
results = pd.DataFrame(results, columns = ['article', 'abstract', 'model summary'])

In [25]:
results.to_csv('pubmed_summary.csv', index = False)

In [26]:
results.to_excel('pubmed_summary.xlsx', index = False)

In [27]:
results.head()

Unnamed: 0,article,abstract,model summary
0,the birth of a premature infant has long been ...,background : the purpose of this study was to ...,the birth of a premature infant has long been ...
1,"de ridder , vanneste , and focquaert address c...",our ( 2014 ) model for the regulation of cogni...,"de ridder, vanneste, and focquaert address con..."
2,it is not easy to define a good health care ...,"aim : to date , the available information rega...",the health care system in albania has undergon...
3,the prosite database uses two kinds of signatu...,the prosite database consists of a large colle...,the prosite database uses two types of signatu...
4,although about two - thirds of patients with e...,"brivaracetam ( brv ) , a high - affinity synap...",despite the introduction of new aeds with a be...


##9. Use Metrics to Evaluate Results

In [28]:
!pip install bert-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [29]:
predictions = list(results['model summary'])
references = target

In [30]:
# Compute ROUGE score
from datasets import load_metric
rouge = load_metric("rouge")
results_rouge = rouge.compute(predictions = predictions, references = references)

from typing import Dict, Any

def simplify_rouge_scores(rouge_scores: Dict[str, Any]) -> str:
    simplified_text = ""
    mean_rouge = {}
    for key, value in rouge_scores.items():
        # Extract low, mid, and high scores for each ROUGE metric
        low, mid, high = value.low, value.mid, value.high
        simplified_text += f"{key}: Precision ranges from {low.precision:.2%} to {high.precision:.2%}, "
        simplified_text += f"Recall ranges from {low.recall:.2%} to {high.recall:.2%}, "
        simplified_text += f"F1 Score ranges from {low.fmeasure:.2%} to {high.fmeasure:.2%}.\n"
        mean_rouge[f"{key}"] = [round(mid.precision,4), round(mid.recall,4), round(mid.fmeasure,4)]

    return simplified_text, mean_rouge

text, rouge_scores = simplify_rouge_scores(results_rouge)

  rouge = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [31]:
# Compute Bertscore
bertscore = evaluate.load("bertscore")
results_bert = bertscore.compute(predictions = predictions, references = references, lang = "en")
bert_precision = results_bert['precision']
bert_recall = results_bert['recall']
bert_f1 = results_bert['f1']
results_bert_mean = {'BERTScore Precision': round(sum(bert_precision)/len(bert_precision),4), 'BERTScore Recall': round(sum(bert_recall)/len(bert_recall),4), 'BERTScore F1': round(sum(bert_f1)/len(bert_f1),4)}
print(results_bert_mean)

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'BERTScore Precision': 0.8323, 'BERTScore Recall': 0.7979, 'BERTScore F1': 0.8142}


In [32]:
# Modify the format of references
references_bleu = [[reference] for reference in references]

# Compute BLEU score
bleu = evaluate.load("bleu")
results_bleu = bleu.compute(predictions=predictions, references=references_bleu)

print(results_bleu['bleu'])

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

0.04362714840156634


In [33]:
results_df = pd.DataFrame({
    'Model':'T5',
    'BERT-Precision': [results_bert_mean['BERTScore Precision']],
    'BERT-Recall': [results_bert_mean['BERTScore Recall']],
    'BERT-F1': [results_bert_mean['BERTScore F1']],
    'BLEU Score': [round(results_bleu['bleu'],4)],
})
results_df.set_index('Model', inplace = True)

In [34]:
# Show BERT and BLEU scores
print(results_df)

       BERT-Precision  BERT-Recall  BERT-F1  BLEU Score
Model                                                  
T5             0.8323       0.7979   0.8142      0.0436


In [35]:
# Show ROUGE scores (medians)
rouge_scores = pd.DataFrame(rouge_scores)
rouge_scores.index = ['Precision', "Recall", "F-Measure"]
print(rouge_scores)

           rouge1  rouge2  rougeL  rougeLsum
Precision  0.3985  0.1271  0.2644     0.3407
Recall     0.2028  0.0662  0.1327     0.1737
F-Measure  0.2360  0.0762  0.1536     0.2021


In [36]:
print(simplify_rouge_scores(results_rouge))

('rouge1: Precision ranges from 37.08% to 42.88%, Recall ranges from 18.13% to 22.75%, F1 Score ranges from 21.90% to 25.67%.\nrouge2: Precision ranges from 10.31% to 16.11%, Recall ranges from 5.10% to 8.63%, F1 Score ranges from 6.07% to 9.74%.\nrougeL: Precision ranges from 23.95% to 29.13%, Recall ranges from 11.59% to 15.29%, F1 Score ranges from 13.94% to 17.09%.\nrougeLsum: Precision ranges from 31.44% to 37.17%, Recall ranges from 15.46% to 19.57%, F1 Score ranges from 18.44% to 22.06%.\n', {'rouge1': [0.3985, 0.2028, 0.236], 'rouge2': [0.1271, 0.0662, 0.0762], 'rougeL': [0.2644, 0.1327, 0.1536], 'rougeLsum': [0.3407, 0.1737, 0.2021]})
