Fine-tuned BioBERT

In [None]:
# Load datasets for QA

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
import pandas as pd

# Single checkpoint id shared by the classifier and its tokenizer so the
# vocabulary always matches the encoder weights.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.1"

# Pretrained encoder with a two-way sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    BIOBERT_CHECKPOINT,
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)

# Integer class id assigned to each textual answer in `final_decision`.
label_map = dict(yes=1, no=0)

# Preprocessing
def preprocess_function(examples):
    """Tokenise a batch of question/context pairs for the classifier.

    Args:
        examples: Batch mapping that provides ``"question"`` and
            ``"context"`` entries (as passed by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        The output of the module-level ``tokenizer`` for the pairs, with
        every sequence truncated and padded to exactly 512 tokens.
    """
    # Fixed-length encoding: truncate anything longer than 512 tokens and pad
    # anything shorter up to the same length.
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    questions = examples["question"]
    contexts = examples["context"]
    return tokenizer(questions, contexts, **encoding_options)

def _load_labelled_split(path):
    """Read one Excel split and attach a clean integer ``label`` column.

    ``Series.map`` yields NaN for every ``final_decision`` value that is not a
    key of ``label_map``. A single NaN turns the whole column into floats,
    which the two-class classification head cannot consume. Such rows are
    therefore reported and dropped, and the remaining labels are cast to
    int64.

    Args:
        path: Path of the ``.xlsx`` file to read.

    Returns:
        A DataFrame with a fresh 0..n-1 index whose ``label`` column contains
        only the integer ids defined in ``label_map``.
    """
    frame = pd.read_excel(path)
    frame["label"] = frame["final_decision"].map(label_map)

    unmapped = frame["label"].isna()
    if unmapped.any():
        print(
            f"{path}: dropping {int(unmapped.sum())} rows whose final_decision "
            f"is not one of {sorted(label_map)}"
        )
        # Reset the index so Dataset.from_pandas does not carry the old index
        # along as an extra column.
        frame = frame[~unmapped].reset_index(drop=True)

    frame["label"] = frame["label"].astype("int64")
    return frame


train_df = _load_labelled_split("qa_train.xlsx")
val_df = _load_labelled_split("qa_validation.xlsx")

# Keep only the columns the model needs before converting to HF datasets.
train_dataset = Dataset.from_pandas(train_df[["question", "context", "label"]])
val_dataset = Dataset.from_pandas(val_df[["question", "context", "label"]])

tokenised_train = train_dataset.map(preprocess_function, batched=True)
tokenised_val = val_dataset.map(preprocess_function, batched=True)

In [None]:
# Find best hyperparameters

import numpy as np
import evaluate
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Score a batch of predictions for the ``Trainer``.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            arg-max of the logits along the last axis.

    Returns:
        Dict with ``"accuracy"`` and ``"f1"``, computed by scikit-learn's
        ``accuracy_score`` and ``f1_score`` with their default settings.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
    }

# Grid search over learning rate and batch size; the pair with the highest
# validation F1 is kept in `best_args`.


def _new_trial_model():
    """Return a freshly loaded pretrained classifier for one trial.

    Passing this to ``Trainer(model_init=...)`` makes every trial start from
    the same pretrained weights. Handing the shared ``model`` object to each
    Trainer would let a trial continue from whatever the previous trial
    learned, so the configurations would not be comparable.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        "dmis-lab/biobert-base-cased-v1.1", num_labels=2
    )


learning_rates = [5e-6, 3e-5, 5e-5]
batch_sizes = [4, 8]

# Start below any reachable F1 so that a configuration is always selected,
# even if every trial scores 0.0 (otherwise `best_args` would stay None).
best_f1 = float("-inf")
best_args = None

for lr in learning_rates:
    for bs in batch_sizes:
        print(f"\n Trying lr={lr}, batch_size={bs}")

        training_args = TrainingArguments(
            output_dir=f"./biobert-classification-lr{lr}-bs{bs}",
            do_eval=True,
            save_strategy="no",  # trial checkpoints are not needed
            learning_rate=lr,
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            num_train_epochs=3,
            weight_decay=0.01,
            logging_steps=50,
            disable_tqdm=False
        )

        trainer = Trainer(
            model_init=_new_trial_model,  # fresh weights for every trial
            args=training_args,
            train_dataset=tokenised_train,
            eval_dataset=tokenised_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        trainer.train()
        metrics = trainer.evaluate()

        # Trainer.evaluate prefixes metric names with "eval_"; fall back to
        # the bare key just in case.
        val_f1 = metrics.get("eval_f1", metrics.get("f1", 0.0))
        print(f"Validation F1 for lr={lr}, bs={bs}: {val_f1}")

        if val_f1 > best_f1:
            best_f1 = val_f1
            best_args = (lr, bs)

print("Best hyperparameters:", best_args)

In [None]:
# Train BioBERT using the best hyperparameters

# Final run: train with the selected hyper-parameters, evaluating and
# checkpointing once per epoch, then restore the epoch with the best F1.
best_lr, best_bs = best_args


def _fresh_biobert():
    """Load an untouched pretrained classifier for the final run.

    The final model must start from the pretrained checkpoint, not from a
    model object that earlier training runs in this notebook have already
    updated.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        "dmis-lab/biobert-base-cased-v1.1", num_labels=2
    )


# eval_steps / save_steps are intentionally not set: with both strategies set
# to "epoch" they have no effect.
final_training_args = TrainingArguments(
    output_dir="./biobert-qa-final",
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=2,
    learning_rate=best_lr,
    per_device_train_batch_size=best_bs,
    per_device_eval_batch_size=best_bs,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

trainer = Trainer(
    model_init=_fresh_biobert,
    args=final_training_args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# Keep the module-level name pointing at the model that was just trained.
model = trainer.model

metrics = trainer.evaluate()

print(metrics)

In [None]:
# Test BioBERT on test dataset

import pandas as pd
from datasets import Dataset
from transformers import Trainer, TrainingArguments

# Evaluate the trained classifier on the held-out test spreadsheet.
test_df = pd.read_excel("qa_test.xlsx")
label_map = {"yes": 1, "no": 0}
test_df["label"] = test_df["final_decision"].map(label_map)

# `map` leaves NaN for answers outside `label_map`; one NaN turns the column
# into floats, which the two-class head and the metrics cannot handle. Report
# and drop such rows, then force integer labels.
unmapped = test_df["label"].isna()
if unmapped.any():
    print(
        f"qa_test.xlsx: dropping {int(unmapped.sum())} rows whose "
        f"final_decision is not one of {sorted(label_map)}"
    )
    # A fresh 0..n-1 index keeps row positions aligned with the positions of
    # the predictions produced from this frame.
    test_df = test_df[~unmapped].reset_index(drop=True)
test_df["label"] = test_df["label"].astype("int64")

test_dataset = Dataset.from_pandas(test_df[["question", "context", "label"]])

tokenised_test = test_dataset.map(preprocess_function, batched=True)

# Evaluation-only configuration: nothing is saved and no periodic evaluation
# is scheduled.
final_training_args = TrainingArguments(
    output_dir="./biobert-qa-final",
    do_eval=True,
    eval_strategy="no",
    save_strategy="no",
    learning_rate=best_lr,
    per_device_train_batch_size=best_bs,
    per_device_eval_batch_size=best_bs,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=False,
    metric_for_best_model="f1",
    greater_is_better=True
)

eval_trainer = Trainer(
    model=trainer.model,  # use the trained BioBERT
    args=final_training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenised_test
)

metrics = eval_trainer.evaluate()
print(metrics)

In [None]:
# Print samples of QA 

# Predict on the test set and print a few question/answer samples.
predictions = eval_trainer.predict(tokenised_test)
preds = np.argmax(predictions.predictions, axis=-1)

# Map labels back to 'Yes'/'No'
reverse_label_map = {1: "Yes", 0: "No"}

sample_indices = [498, 998, 1098]

for i in sample_indices:
    # The sample positions are hard-coded; skip any that fall outside the
    # test set instead of failing part-way through the printout.
    if not 0 <= i < len(preds):
        print(f"\nSkipping sample index {i}: only {len(preds)} predictions available")
        continue

    # `preds` is positional, so read the matching row by position as well.
    row = test_df.iloc[i]
    q = row["question"]
    c = row["context"]
    true_label = reverse_label_map[int(row["label"])]
    pred_label = reverse_label_map[int(preds[i])]
    print(f"\n=== Sample {i+2} ===")
    print(f"Question: {q}")
    print(f"Context: {c[:300]}...") # only print first 300 characters
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {pred_label}")