## Load necessary packages

In [1]:
import json, ast, torch, random
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification
    # AdamW,
)
from sklearn.metrics import accuracy_score
device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
print(torch.cuda.is_available())

True


## Load Data

In [6]:
import pandas as pd

file_path = "Data/PubMedQA_cleaned.json"
QA_data = pd.read_json(file_path)

In [4]:
QA_data

Unnamed: 0,id,context,question,options,gold_index
0,0,(Objective) We evaluated the usefulness of a s...,A short stay or 23-hour ward in a general and ...,"[No, Maybe, Yes]",2
1,1,(Methods) The records of 465 patients with an ...,Amblyopia: is visual loss permanent?,"[No, Maybe, Yes]",0
2,2,(Background) Radiotherapy reduces local recurr...,Does radiotherapy of the primary rectal cancer...,"[No, Maybe, Yes]",2
3,3,(Background) Pterygium is a disease of unknown...,Human papillomavirus and pterygium. Is the vir...,"[No, Maybe, Yes]",1
4,4,(Purpose) Reconstructing the natural joint lin...,Assessing joint line positions by means of the...,"[No, Maybe, Yes]",2
...,...,...,...,...,...
995,995,"(Background) ""America's Best Hospitals,"" an in...","Do ""America's Best Hospitals"" perform better f...","[No, Maybe, Yes]",2
996,996,(Background) Some patients with suspected comm...,The clinical significance of bile duct sludge:...,"[No, Maybe, Yes]",0
997,997,(Objective) To examine longitudinal patterns i...,Does obesity predict knee pain over fourteen y...,"[No, Maybe, Yes]",2
998,998,(Objectives) To assess Internet use amongst yo...,Can the Internet be used to improve sexual hea...,"[No, Maybe, Yes]",1


In [102]:
sample = QA_data.iloc[26].to_frame()
sample

Unnamed: 0,26
id,26
context,(Background) Several prospective randomized tr...
question,Are octogenarians at high risk for carotid end...
options,"[No, Maybe, Yes]"
gold_index,0


## Hugging Face Login

In [6]:
import os
from huggingface_hub import login

hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Teacher Model

In [7]:
teacher_model_name = "Henrychur/MMed-Llama-3-8B"  
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)
teacher_model = AutoModelForCausalLM.from_pretrained(teacher_model_name)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [8]:
teacher_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

## Build Prompt

In [None]:
def build_prompt(input):
    input_context = input.loc['context'].values[0]
    input_question = input.loc['question'].values[0]
    prompt = f"""Read an abstract from a PubMed paper and answer the question: {input_context}

Question: {input_question}
Instruction: Return ONLY three confidence scores over the three options ['No', 'Maybe', 'Yes'], DO NOT include any text output.
Format your answer strictly as: [score_no, score_maybe, score_yes].
DO NOT include any explanation or additional text. Only return the scores in the specified format.
Answer:
"""
    return prompt

In [None]:
prompt_text = build_prompt(sample)
print(prompt_text)

INSTRUCTION:
DO NOT include any explanation or additional text. Only return the one word answer from the three options: 'No', 'Maybe', or 'Yes'.

EXAMPLES:
Input: Read an abstract about X. Question: Does this support hypothesis Y?
Answer: Yes

Input: Read an abstract about Z. Question: Is this evidence inconclusive?
Answer: Maybe

TASK:
Read an abstract from a PubMed paper and answer the question: (Background) Several prospective randomized trials have proved carotid endarterectomy to be safe and effective for both symptomatic and asymptomatic patients younger than 80 years of age. Recently, carotid artery stenting (CAS) has been approved for use in selected high-risk patients. It has been proposed that being an octogenarian places patients in this high-risk category.
(Study design) All patients between the ages of 80 to 89 years undergoing carotid endarterectomy during a 12-year period were included in the study. Information included indications for carotid endarterectomy, associated 

## Run Model

In [11]:
teacher_model = teacher_model.half() 
teacher_model = teacher_model.to(device)

In [109]:
inputs = teacher_tokenizer(
    prompt_text,
    return_tensors="pt",
    # padding=True,
    # truncation=True,
).to(device)

In [81]:
# Teacher inference
with torch.no_grad():
    teacher_logits = teacher_model(**inputs)

In [82]:
teacher_logits

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 5.3828,  3.3457,  1.2578,  ..., -7.1992, -7.1992, -7.1992],
         [ 3.8145,  3.8008,  2.6875,  ..., -6.4961, -6.4961, -6.4961],
         [ 7.6992,  3.9766,  1.2461,  ..., -4.1719, -4.1719, -4.1719],
         ...,
         [ 6.9805,  9.3906,  9.8203,  ..., -5.4609, -5.4609, -5.4609],
         [ 7.0742,  4.0312,  5.1797,  ..., -4.2227, -4.2227, -4.2227],
         [ 6.4336,  9.4453,  9.8125,  ..., -5.3711, -5.3711, -5.3711]]],
       device='cuda:0', dtype=torch.float16), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7f3a0d76f490>, hidden_states=None, attentions=None)

In [110]:
# Generate from the model
teacher_out_ids = teacher_model.generate(
    **inputs, 
    eos_token_id=teacher_tokenizer.eos_token_id
    )

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [111]:
# Decode back to text
decoded = teacher_tokenizer.decode(teacher_out_ids[0], skip_special_tokens=False)
# decoded = decoded.split("Answer:")[1]
print(decoded)

Read an abstract from a PubMed paper and answer the question: (Background) Several prospective randomized trials have proved carotid endarterectomy to be safe and effective for both symptomatic and asymptomatic patients younger than 80 years of age. Recently, carotid artery stenting (CAS) has been approved for use in selected high-risk patients. It has been proposed that being an octogenarian places patients in this high-risk category.
(Study design) All patients between the ages of 80 to 89 years undergoing carotid endarterectomy during a 12-year period were included in the study. Information included indications for carotid endarterectomy, associated risk factors, length of stay, and hospital course. Perioperative morbidity and mortality, including neurologic events and myocardial infarction, were recorded.
(Results) A total of 103 carotid endarterectomies were performed in 95 octogenarians. Procedures were performed on 59 men and 36 women. Indications for operation included symptoma

## Calculate Accuracy

In [132]:
def eval_prompt(input):
    input_context = input.loc['context'].values[0]
    input_question = input.loc['question'].values[0]
    prompt = f"""INSTRUCTION:
DO NOT include any explanation or additional text. Only return the one word answer from the three options: 'No', 'Maybe', or 'Yes'.

EXAMPLES:
Input: Read an abstract about X. Question: Does this support hypothesis Y?
Answer: Yes

Input: Read an abstract about Z. Question: Is this evidence inconclusive?
Answer: Maybe

TASK:
Read an abstract from a PubMed paper and answer the question: {input_context}

Question: {input_question}
Answer:
"""
    return prompt

In [133]:
prompt = eval_prompt(sample)
print(prompt)

INSTRUCTION:
DO NOT include any explanation or additional text. Only return the one word answer from the three options: 'No', 'Maybe', or 'Yes'.

EXAMPLES:
Input: Read an abstract about X. Question: Does this support hypothesis Y?
Answer: Yes

Input: Read an abstract about Z. Question: Is this evidence inconclusive?
Answer: Maybe

TASK:
Read an abstract from a PubMed paper and answer the question: (Background) Several prospective randomized trials have proved carotid endarterectomy to be safe and effective for both symptomatic and asymptomatic patients younger than 80 years of age. Recently, carotid artery stenting (CAS) has been approved for use in selected high-risk patients. It has been proposed that being an octogenarian places patients in this high-risk category.
(Study design) All patients between the ages of 80 to 89 years undergoing carotid endarterectomy during a 12-year period were included in the study. Information included indications for carotid endarterectomy, associated 

In [134]:
# Tokenize the prompt
inputs = teacher_tokenizer(prompt, return_tensors="pt").to(teacher_model.device)

In [135]:
# Generate output from the teacher model
with torch.no_grad():
    teacher_out_ids = teacher_model.generate(
        **inputs,
        eos_token_id=teacher_tokenizer.eos_token_id
    )

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [136]:
# Decode the output
decoded_output = teacher_tokenizer.decode(teacher_out_ids[0], skip_special_tokens=True)

In [137]:
print(decoded_output)

INSTRUCTION:
DO NOT include any explanation or additional text. Only return the one word answer from the three options: 'No', 'Maybe', or 'Yes'.

EXAMPLES:
Input: Read an abstract about X. Question: Does this support hypothesis Y?
Answer: Yes

Input: Read an abstract about Z. Question: Is this evidence inconclusive?
Answer: Maybe

TASK:
Read an abstract from a PubMed paper and answer the question: (Background) Several prospective randomized trials have proved carotid endarterectomy to be safe and effective for both symptomatic and asymptomatic patients younger than 80 years of age. Recently, carotid artery stenting (CAS) has been approved for use in selected high-risk patients. It has been proposed that being an octogenarian places patients in this high-risk category.
(Study design) All patients between the ages of 80 to 89 years undergoing carotid endarterectomy during a 12-year period were included in the study. Information included indications for carotid endarterectomy, associated 

## Fine-Tune LoRA

In [2]:
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader

In [3]:
# Load model and tokenizer
model_name = "Henrychur/MMed-Llama-3-8B" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [4]:
model = model.half()

In [5]:
# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

In [6]:
import pandas as pd

file_path = "Data/PubMedQA_cleaned.json"
QA_data = pd.read_json(file_path)

In [7]:
# Preprocess the dataset
def preprocess_data(row):
    # Build the input prompt
    input_prompt = f"""Read an abstract from a PubMed paper and answer the question: {row['context']}

Question: {row['question']}
Answer:"""
    # The output is the gold answer (e.g., "No", "Maybe", or "Yes")
    output = row['options'][row['gold_index']]
    return input_prompt, output

# Apply preprocessing
processed_data = QA_data.apply(preprocess_data, axis=1)
inputs, outputs = zip(*processed_data)

# Assign eos_token as pad_token
tokenizer.pad_token = tokenizer.eos_token
# Tokenize the inputs and outputs
input_encodings = tokenizer(list(inputs), padding=True, truncation=True, return_tensors="pt")
output_encodings = tokenizer(list(outputs), padding=True, truncation=True, return_tensors="pt")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
# Apply LoRA to the model
model = get_peft_model(model, lora_config)



In [9]:
# Create a custom dataset
class QADataset(Dataset):
    def __init__(self, input_encodings, output_encodings):
        self.input_ids = input_encodings["input_ids"]
        self.attention_mask = input_encodings["attention_mask"]
        self.labels = output_encodings["input_ids"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

# Initialize the dataset and DataLoader
dataset = QADataset(input_encodings, output_encodings)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [13]:
# Clear CUDA cache
torch.cuda.empty_cache()

In [11]:
from torch.cuda.amp import autocast
from torch.amp import GradScaler
from torch.optim import AdamW
from tqdm import tqdm

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=5e-4)

# Initialize GradScaler for mixed precision
scaler = GradScaler()

In [12]:
# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): LlamaForCausalLM(
          (model): LlamaModel(
            (embed_tokens): Embedding(128256, 4096)
            (layers): ModuleList(
              (0-31): 32 x LlamaDecoderLayer(
                (self_attn): LlamaAttention(
                  (q_proj): lora.Linear(
                    (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=4096, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=4096, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (l

In [13]:
# Training loop
num_epochs = 3  
model.train()

for epoch in range(1, num_epochs + 1):  # Start epoch count from 1 for better readability
    print(f"Epoch {epoch}/{num_epochs}")
    loop = tqdm(dataloader, leave=True, desc=f"Training Epoch {epoch}")
    
    for step, batch in enumerate(loop, start=1):  # Add step count for better tracking
        # Move batch to GPU
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass with mixed precision
        with autocast():  # Enable mixed precision
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Backward pass with scaled loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        # Update progress bar with loss
        loop.set_postfix({
            "Step": step,
            "Loss": loss.item()
        })

    # Print epoch summary
    print(f"Epoch {epoch} completed. Final Loss: {loss.item():.4f}")

Epoch 1/3


  with autocast():  # Enable mixed precision
Training Epoch 1:   0%|          | 0/250 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 84.00 MiB. GPU 0 has a total capacity of 22.04 GiB of which 22.12 MiB is free. Including non-PyTorch memory, this process has 22.01 GiB memory in use. Of the allocated memory 21.54 GiB is allocated by PyTorch, and 247.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)