<a href="https://colab.research.google.com/github/ujjwalgarg/learn_hugging_face/blob/main/Fine_tune_a_pre_trained_BERT_model_on_the_PubMedQA_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers torch datasets pandas numpy scikit-learn



In [2]:
from datasets import load_dataset
load_dataset

In [3]:
dataset = load_dataset("pubmed_qa", "pqa_labeled")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 1000
    })
})


In [4]:
import pandas as pd

def preprocess(example):
    return {
        "text": f"{example['question']} {example['context']}",
        "label": {"yes": 0, "no": 1, "maybe": 2}[example["final_decision"]]
    }

data = dataset["train"].map(preprocess)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,pubid,question,context,long_answer,final_decision,text,label
0,21645374,Do mitochondria play a role in remodelling lac...,{'contexts': ['Programmed cell death (PCD) is ...,Results depicted mitochondrial dynamics in viv...,yes,Do mitochondria play a role in remodelling lac...,0
1,16418930,Landolt C and snellen e acuity: differences in...,{'contexts': ['Assessment of visual acuity dep...,"Using the charts described, there was only a s...",no,Landolt C and snellen e acuity: differences in...,1
2,9488747,"Syncope during bathing in infants, a pediatric...",{'contexts': ['Apparent life-threatening event...,"""Aquagenic maladies"" could be a pediatric form...",yes,"Syncope during bathing in infants, a pediatric...",0
3,17208539,Are the long-term results of the transanal pul...,{'contexts': ['The transanal endorectal pull-t...,Our long-term study showed significantly bette...,no,Are the long-term results of the transanal pul...,1
4,10808977,Can tailored interventions increase mammograph...,{'contexts': ['Telephone counseling and tailor...,The effects of the intervention were most pron...,yes,Can tailored interventions increase mammograph...,0


In [5]:
df.iloc[0]['text']

"Do mitochondria play a role in remodelling lace plant leaves during programmed cell death? {'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.', 'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in la

In [6]:
df.iloc[0]['question']

'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?'

In [7]:
df.iloc[0]['context']

{'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.',
  'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in late stages of PCD (LPCD). Window stage leaves were stained with the mitochondrial dye MitoT

In [8]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [10]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [11]:
tokenized_train

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 800
})

In [12]:
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [13]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,           # Short training
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,           # Standard for BERT fine-tuning
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"  # Disable WandB and other integrations
)



In [15]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    return {"accuracy": accuracy_score(labels, predictions)}

In [16]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.906826,0.62
2,No log,0.809399,0.66
3,No log,0.659816,0.77


TrainOutput(global_step=300, training_loss=0.9167625935872395, metrics={'train_runtime': 295.1528, 'train_samples_per_second': 8.131, 'train_steps_per_second': 1.016, 'total_flos': 631472202547200.0, 'train_loss': 0.9167625935872395, 'epoch': 3.0})

In [17]:
def predict(question):
    inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    outputs = model(**inputs)
    pred = outputs.logits.argmax(-1).item()
    return {0: "Yes", 1: "No", 2: "Uncertain"}[pred]


# Test
print(predict("Does diabetes cause fatigue?"))

Yes


In [18]:
print(predict("Does eating raw fruits cause cancer?"))

Yes


In [19]:
print(predict("Does smoking everyday keep you healthy?"))

Yes


In [20]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
import optuna
from transformers import Trainer, TrainingArguments

def objective(trial):
    args = TrainingArguments(
        output_dir=f"./results_trial{trial.number}",
        learning_rate=trial.suggest_float("lr", 1e-5, 5e-5, log=True),
        per_device_train_batch_size=trial.suggest_categorical("bs", [8, 16]),
        num_train_epochs=trial.suggest_int("epochs", 2, 4),
        warmup_steps=trial.suggest_int("warmup", 0, 200),
        weight_decay=trial.suggest_float("wd", 0.0, 0.1),
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="none"
    )
    trainer = Trainer(
        model=AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3),
        args=args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=lambda pred: {"accuracy": accuracy_score(pred.label_ids, pred.predictions.argmax(-1))}
    )
    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result["eval_accuracy"]

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
print(f"Best Params: {study.best_params}, Accuracy: {study.best_value}")

[I 2025-03-08 21:04:44,586] A new study created in memory with name: no-name-70c05147-320e-441e-a39b-4755351fc167
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.905596,0.61
2,No log,0.513843,0.79
3,No log,0.438746,0.795
4,No log,0.41269,0.85


[I 2025-03-08 21:11:11,496] Trial 0 finished with value: 0.85 and parameters: {'lr': 4.407159932300221e-05, 'bs': 16, 'epochs': 4, 'warmup': 63, 'wd': 0.019312956444842745}. Best is trial 0 with value: 0.85.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
