In [1]:
import os

os.environ["HF_HOME"] = "/home/shared/.cache/huggingface"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/home/shared/.cache/huggingface/hub"

In [2]:
import json
import torch
import torch.nn.functional as F
from datasets import Dataset
from torch.optim import AdamW
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    default_data_collator
)
from torch.utils.data import DataLoader

# ✅ Step 1: Load soft labels from file
with open("soft_labels_finetuned_biogpt.json", "r") as f:
    soft_dataset = json.load(f)


In [None]:
import os
from huggingface_hub import HfApi

os.environ["HUGGINGFACE_HUB_TOKEN"] = ""  # token
api = HfApi(token=os.environ["HUGGINGFACE_HUB_TOKEN"])

In [6]:
# ✅ Step 2: Load PubMedBERT tokenizer and model with manual BERT specification

from transformers import RobertaTokenizer, RobertaForSequenceClassification

student_model_id = "michiyasunaga/BioLinkBERT-large"
student_tokenizer = RobertaTokenizer.from_pretrained(student_model_id)
student_model = RobertaForSequenceClassification.from_pretrained(student_model_id, num_labels=3)

OSError: Can't load tokenizer for 'michiyasunaga/BioLinkBERT-large'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'michiyasunaga/BioLinkBERT-large' is the correct path to a directory containing all relevant files for a RobertaTokenizer tokenizer.

In [None]:
# ✅ Step 3: Convert soft dataset to HF Dataset and tokenize
hf_dataset = Dataset.from_list(soft_dataset)

def tokenize_function(example):
    tokens = student_tokenizer(
        example["input_text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )
    tokens["labels"] = torch.tensor(example["soft_label"], dtype=torch.float)
    return tokens

tokenized_dataset = hf_dataset.map(tokenize_function, remove_columns=["input_text", "soft_label", "gold_index"])

In [None]:
# ✅ Step 4: Build DataLoader
train_loader = DataLoader(
    tokenized_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=default_data_collator
)

In [None]:
# ✅ Step 5: Training config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
student_model.to(device)

optimizer = AdamW(student_model.parameters(), lr=5e-5)
loss_fn = torch.nn.KLDivLoss(reduction="batchmean")

In [None]:
# ✅ Step 6: Distillation training loop
epochs = 3
student_model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

        loss = loss_fn(log_probs, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    print(f"✅ Epoch {epoch+1} - Avg Distillation Loss: {total_loss / len(train_loader):.4f}")