# Import

In [None]:
# import packages
import json
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay, classification_report
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, logging, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
import wandb

# Qwen baseline

In [None]:
# load data
def load_split(split_dir):
    """Load every ``*.json`` file in ``split_dir`` into a single DataFrame.

    Files are read in sorted path order so that the resulting row order is
    reproducible; ``glob`` on its own returns paths in arbitrary order.

    Args:
        split_dir: Directory holding the per-split JSON files.

    Returns:
        One DataFrame containing the rows of all files, re-indexed 0..n-1.

    Raises:
        ValueError: If ``split_dir`` contains no ``*.json`` files.
    """
    files = sorted(glob.glob(os.path.join(split_dir, "*.json")))
    if not files:
        raise ValueError(f"No JSON files found in {split_dir!r}")
    dfs = [pd.read_json(f) for f in files]
    return pd.concat(dfs, ignore_index=True)
# Read both question sets, then pull the gold answer letter out of the
# "<answer>X</answer>" markup stored in the "solution" column.
train_df = load_split("train_splits")
test_df = load_split("test_splits")
for split_df in (train_df, test_df):
    split_df["answer"] = split_df["solution"].str.extract(
        r"<answer>\s*([A-Z])\s*</answer>", expand=False
    )

# run Qwen baseline
cache_path = "/labs/bozkurtlab/tricia-projects/huggingface"
model_name = "Qwen/Qwen3-4B"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_path)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_path,
    torch_dtype="auto",
    device_map="auto",
)

def run_qwen(question):
    """Ask the globally loaded model for the answer letter to ``question``.

    The question is wrapped in a fixed instruction, rendered through the
    tokenizer's chat template with thinking disabled, and decoded greedily.

    Args:
        question: Question text inserted into the instruction prompt.

    Returns:
        The decoded text of the newly generated tokens (at most 3), with
        special tokens removed and surrounding whitespace stripped.
    """
    prompt = f"Answer the following question, responding only with the correct answer letter (e.g. A, B, C, etc.)  and nothing else. Question: {question}"
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
    # The model is placed by device_map="auto"; send the inputs to wherever
    # it actually lives instead of relying on a separately computed device.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=3, do_sample=False, temperature=0.0, top_p=1.0, top_k=0)
    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = model_inputs.input_ids.shape[1]
    output_ids = generated_ids[0][prompt_len:].tolist()
    return tokenizer.decode(output_ids, skip_special_tokens=True).strip()

# Query the un-tuned model once per test question and persist the answers.
test_df["qwen_baseline_pred"] = test_df["problem"].map(run_qwen)
test_df.to_csv(path_or_buf="tricia_results.csv", index=False)

# Qwen fine-tune

In [None]:
# load data
print("Loading data...")
def load_split(split_dir):
    """Concatenate all ``*.json`` files found in ``split_dir``.

    Paths are sorted before reading because ``glob`` returns them in
    arbitrary order, and the train/validation split below depends on row
    order.

    Args:
        split_dir: Directory holding the per-split JSON files.

    Returns:
        One DataFrame with the rows of every file and a fresh 0..n-1 index.

    Raises:
        ValueError: If no ``*.json`` file exists in ``split_dir``.
    """
    files = sorted(glob.glob(os.path.join(split_dir, "*.json")))
    if not files:
        raise ValueError(f"No JSON files found in {split_dir!r}")
    dfs = [pd.read_json(f) for f in files]
    return pd.concat(dfs, ignore_index=True)
# Training questions come from the JSON splits; the test set is re-read from
# the results CSV so new prediction columns are added next to existing ones.
train_df = load_split("train_splits")
train_df["answer"] = train_df["solution"].str.extract(
    r"<answer>\s*([A-Z])\s*</answer>", expand=False
)
test_df = pd.read_csv(filepath_or_buffer="tricia_results.csv")

# fine-tuning
cache_path = "/labs/bozkurtlab/tricia-projects/huggingface"
model_name = "Qwen/Qwen3-4B"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_path,
)
def tokenize(row):
    """Convert one training row into fixed-length causal-LM features.

    Prompt and answer are tokenized WITHOUT padding, joined back to back and
    only then right-padded to a fixed length. Padding each piece separately
    would leave pad tokens inside the concatenated sequence, and an all-ones
    attention mask would let the model attend to them.

    Args:
        row: Mapping with a ``"problem"`` entry (question text) and an
            ``"answer"`` entry (target string).

    Returns:
        dict of three 1-D tensors of length 264 (256 prompt + 8 answer slots):
        ``input_ids``, ``attention_mask`` (1 on real tokens, 0 on padding) and
        ``labels`` (-100 on prompt and padding so only answer tokens are
        scored).
    """
    # NOTE(review): run_qwen renders the prompt through the chat template at
    # inference time, whereas training uses the raw prompt text; consider
    # aligning the two formats.
    max_prompt_len, max_answer_len = 256, 8
    prompt = f"Answer the following question, responding only with the correct answer letter (e.g. A, B, C, etc.)  and nothing else. Question: {row['problem']}"
    prompt_ids = tokenizer(prompt, truncation=True, max_length=max_prompt_len)["input_ids"]
    answer_ids = tokenizer(row['answer'], truncation=True, max_length=max_answer_len)["input_ids"]
    n_real = len(prompt_ids) + len(answer_ids)
    n_pad = max_prompt_len + max_answer_len - n_real
    input_ids = torch.tensor(prompt_ids + answer_ids + [tokenizer.pad_token_id] * n_pad)
    attention_mask = torch.tensor([1] * n_real + [0] * n_pad)
    labels = torch.tensor([-100] * len(prompt_ids) + answer_ids + [-100] * n_pad)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
print("Tokenizing...")
# Imported here because the notebook's import cell does not provide it.
from transformers import default_data_collator

# Hold out 20% of the training questions for validation (fixed seed).
train_hf, val_hf = train_test_split(train_df, test_size=0.20, random_state=42)
train_hf = Dataset.from_pandas(train_hf)
val_hf = Dataset.from_pandas(val_hf)
train_hf = train_hf.map(tokenize, batched=False)
val_hf = val_hf.map(tokenize, batched=False)
# DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from
# `input_ids`, which throws away the -100 prompt masking produced by
# tokenize(). The examples already have equal length, so the default collator
# (which only stacks the features) keeps the labels exactly as tokenized.
data_collator = default_data_collator
output_dir = "/labs/bozkurtlab/tricia-projects/bmi534_project/qwen3-lora-finetuned"
automodel = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    cache_dir=cache_path
)
# LoRA adapters on the attention query/value projections only.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(automodel, lora_config)
# Effective batch size is 1 x 8 through gradient accumulation.
training_args = TrainingArguments(
    output_dir=output_dir,
    run_name="qwen3-lora-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=3e-4,
    num_train_epochs=1,
    fp16=True,
    logging_steps=20,
    save_strategy="epoch",
    report_to="wandb"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hf,
    eval_dataset=val_hf,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# train
trainer.train()
model.save_pretrained(output_dir)

# test
def run_qwen(question):
    """Ask the globally loaded (fine-tuned) model for an answer letter.

    Args:
        question: Question text inserted into the instruction prompt.

    Returns:
        The decoded text of the newly generated tokens (at most 3), with
        special tokens removed and surrounding whitespace stripped.
    """
    # Keep this instruction byte-identical (including the double space before
    # "and") to the one tokenize() trains on and the baseline run uses, so the
    # fine-tuned model is queried with the wording it was trained with.
    prompt = f"Answer the following question, responding only with the correct answer letter (e.g. A, B, C, etc.)  and nothing else. Question: {question}"
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
    # Send the inputs to the device the model actually lives on.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=3, do_sample=False, temperature=0.0, top_p=1.0, top_k=0)
    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = model_inputs.input_ids.shape[1]
    output_ids = generated_ids[0][prompt_len:].tolist()
    return tokenizer.decode(output_ids, skip_special_tokens=True).strip()
# Training leaves the model in train mode, and generate() does not switch it
# back; without eval() the LoRA dropout (p=0.1) stays active and the
# "greedy" predictions become noisy and non-reproducible.
model.eval()
test_df["qwen_finetuned_pred"] = test_df["problem"].apply(run_qwen)
test_df.to_csv("tricia_results.csv", index=False)

# Tune learning rate (5-fold CV)

In [None]:
# load data
print("Loading data...")
def load_split(split_dir):
    """Read all ``*.json`` files under ``split_dir`` into one DataFrame.

    The file list is sorted first: ``glob`` yields paths in arbitrary order,
    and the cross-validation folds below are defined by row position.

    Args:
        split_dir: Directory holding the per-split JSON files.

    Returns:
        One DataFrame with the rows of every file and a fresh 0..n-1 index.

    Raises:
        ValueError: If no ``*.json`` file exists in ``split_dir``.
    """
    files = sorted(glob.glob(os.path.join(split_dir, "*.json")))
    if not files:
        raise ValueError(f"No JSON files found in {split_dir!r}")
    dfs = [pd.read_json(f) for f in files]
    return pd.concat(dfs, ignore_index=True)
# Training questions with their gold answer letter, plus the test table that
# the per-fold predictions will be attached to.
train_df = load_split("train_splits")
train_df["answer"] = train_df["solution"].str.extract(
    r"<answer>\s*([A-Z])\s*</answer>", expand=False
)
test_df = pd.read_csv(filepath_or_buffer="tricia_results_clean.csv")

# methods
cache_path = "/labs/bozkurtlab/tricia-projects/huggingface"
model_name = "Qwen/Qwen3-4B"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_path,
)
def tokenize(row):
    """Convert one training row into fixed-length causal-LM features.

    Prompt and answer are tokenized WITHOUT padding, joined back to back and
    only then right-padded to a fixed length. Padding each piece separately
    would leave pad tokens inside the concatenated sequence, and an all-ones
    attention mask would let the model attend to them.

    Args:
        row: Mapping with a ``"problem"`` entry (question text) and an
            ``"answer"`` entry (target string).

    Returns:
        dict of three 1-D tensors of length 281 (256 prompt + 25 answer
        slots): ``input_ids``, ``attention_mask`` (1 on real tokens, 0 on
        padding) and ``labels`` (-100 on prompt and padding so only answer
        tokens are scored).
    """
    # NOTE(review): run_qwen renders the prompt through the chat template at
    # inference time, whereas training uses the raw prompt text; consider
    # aligning the two formats.
    max_prompt_len, max_answer_len = 256, 25
    prompt = f"Answer the following question, responding only with the correct answer letter (e.g. A, B, C, etc.)  and nothing else. Question: {row['problem']}"
    prompt_ids = tokenizer(prompt, truncation=True, max_length=max_prompt_len)["input_ids"]
    answer_ids = tokenizer(row['answer'], truncation=True, max_length=max_answer_len)["input_ids"]
    n_real = len(prompt_ids) + len(answer_ids)
    n_pad = max_prompt_len + max_answer_len - n_real
    input_ids = torch.tensor(prompt_ids + answer_ids + [tokenizer.pad_token_id] * n_pad)
    attention_mask = torch.tensor([1] * n_real + [0] * n_pad)
    labels = torch.tensor([-100] * len(prompt_ids) + answer_ids + [-100] * n_pad)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
def run_qwen(question, model):
    """Ask ``model`` for the answer letter to ``question``.

    Args:
        question: Question text inserted into the instruction prompt.
        model: Causal-LM (optionally PEFT-wrapped) used for generation.

    Returns:
        The decoded text of the newly generated tokens (at most 3), with
        special tokens removed and surrounding whitespace stripped.
    """
    # Keep this instruction byte-identical (including the double space before
    # "and") to the one tokenize() trains on, so each fold's model is queried
    # with the wording it was trained with.
    prompt = f"Answer the following question, responding only with the correct answer letter (e.g. A, B, C, etc.)  and nothing else. Question: {question}"
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
    # Follow the model that was passed in rather than a global device string.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=3, do_sample=False, temperature=0.0, top_p=1.0, top_k=0)
    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = model_inputs.input_ids.shape[1]
    output_ids = generated_ids[0][prompt_len:].tolist()
    return tokenizer.decode(output_ids, skip_special_tokens=True).strip()

# 5-fold CV
# Imported here because the notebook's import cell does not provide them.
import gc

from transformers import default_data_collator

learning_rates = [1e-4, 5e-5, 3e-5]
# Shuffle with a fixed seed: the rows are a concatenation of per-file splits,
# so contiguous unshuffled folds would mirror file boundaries. The integer
# seed makes every split() call (one per learning rate) yield the same folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = {}      # lr -> validation loss per fold
saved_models = {}    # lr -> adapter output directory per fold
test_preds = {}      # lr -> list (per fold) of test-set predictions
# DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from
# `input_ids`, discarding the prompt masking from tokenize() and making the
# validation loss mostly a measure of prompt reconstruction. The examples are
# already equal-length, so the default collator keeps the labels as tokenized.
data_collator = default_data_collator
for lr in learning_rates:
    cv_results[lr] = []
    saved_models[lr] = []
    test_preds[lr] = []
    for fold, (train_i, val_i) in enumerate(kfold.split(train_df)):
        print(f"Working on fold {fold} for learning rate {lr}...")
        train_hf = Dataset.from_pandas(train_df.iloc[train_i])
        val_hf = Dataset.from_pandas(train_df.iloc[val_i])
        train_hf = train_hf.map(tokenize, batched=False)
        val_hf = val_hf.map(tokenize, batched=False)
        output_dir = f"/labs/bozkurtlab/tricia-projects/bmi534_project/qwen3-lr{lr}-fold{fold}"
        # Fresh base weights for every fold so no adapter state leaks across.
        base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16,
            cache_dir=cache_path
        )
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=16,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj"]
        )
        model = get_peft_model(base_model, lora_config)
        training_args = TrainingArguments(
            output_dir=output_dir,
            run_name=f"qwen3-lr{lr}-fold{fold}",
            per_device_train_batch_size=1,
            gradient_accumulation_steps=8,
            learning_rate=lr,
            num_train_epochs=1,
            fp16=True,
            logging_steps=20,
            save_strategy="epoch",
            report_to="wandb"
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_hf,
            eval_dataset=val_hf,
            tokenizer=tokenizer,
            data_collator=data_collator
        )
        trainer.train()
        model.save_pretrained(output_dir)
        eval_metrics = trainer.evaluate()
        # Close this fold's W&B run so the next fold logs to a new one.
        wandb.finish()
        val_loss = eval_metrics["eval_loss"]
        cv_results[lr].append(val_loss)
        saved_models[lr].append(output_dir)
        print(f"Fold {fold} val loss: {val_loss:.4f}")

        # generate predictions
        fold_test_preds = [run_qwen(question, model) for question in test_df["problem"]]
        test_preds[lr].append(fold_test_preds)
        print("------------------------------------------")

        # The Trainer also references the model; drop it too, otherwise the
        # old weights stay on the GPU while the next fold's model is loaded.
        del model, base_model, trainer
        gc.collect()
        torch.cuda.empty_cache()
print("\n----------------Done with CV------------------\n")

# calculate best learning rate
# Plain floats keep the printed dictionary readable.
mean_losses = {lr: float(np.mean(losses)) for lr, losses in cv_results.items()}
print(f"Mean losses: {mean_losses}")
# Lowest mean validation loss across folds wins.
best_lr = min(mean_losses, key=mean_losses.get)
print(f"Selected learning rate: {best_lr}")

# save predictions
best_fold_preds = test_preds[best_lr]
# Column names are derived from the folds that were actually produced, so the
# vote below stays correct if the number of folds changes.
fold_cols = []
for i, fold_preds in enumerate(best_fold_preds):
    fold_col = f"fold_{i}_pred"
    test_df[fold_col] = fold_preds
    fold_cols.append(fold_col)

# ensemble majority vote
# mode(axis=1) returns the most frequent value(s) per row in sorted order;
# column 0 is therefore the majority answer, with ties broken by sort order.
test_df["ensemble_pred"] = test_df[fold_cols].mode(axis=1)[0]
test_df.to_csv("tricia_cv_results.csv", index=False)

# Make plots

In [None]:
# load data
combined_df = pd.read_csv("final_results.csv")

# plot accuracy by body system
model_cols = ["qwen3_llm", "qwen3_llm_finetuned", "qwen3_llm_ensemble", "qwen3_vlm", 
              "qwen3_vlm_finetuned", "qwen25_vlm", "qwen25_vlm_finetuned"]
rows = []
# The loop variable is deliberately NOT called `model`: in a notebook that
# name is the loaded network, and rebinding it to a string breaks later cells.
for model_col in model_cols:
    # Row-wise hit/miss; a missing prediction compares unequal and counts as
    # wrong. Grouping the boolean Series directly avoids the deprecated
    # DataFrameGroupBy.apply-over-grouping-columns path.
    is_correct = combined_df[model_col] == combined_df["true_label"]
    acc = (
        is_correct
        .groupby(combined_df["body_system"])
        .mean()
        .reset_index(name="accuracy")
    )
    acc["model"] = model_col
    rows.append(acc)
plot_df = pd.concat(rows, ignore_index=True)
# Wide table: one row per body system, one column per model (fixed order).
pivot_df = plot_df.pivot(index="body_system", columns="model", values="accuracy")[model_cols]
pivot_df.plot(kind="bar", figsize=(10,4), width=0.8)
plt.ylabel("Accuracy")
plt.xlabel("Body System")
plt.legend(title='Model', bbox_to_anchor=(1.01, 1), loc='upper left')
plt.xticks(rotation=35, ha="right")
plt.tight_layout()
plt.savefig("accuracy_by_body_system.pdf", format="pdf")
plt.show()

In [None]:
# plot accuracy by medical task and question type
model_cols = ["qwen3_llm", "qwen3_llm_finetuned", "qwen3_llm_ensemble", "qwen3_vlm", 
              "qwen3_vlm_finetuned", "qwen25_vlm", "qwen25_vlm_finetuned"]
fig, axes = plt.subplots(1, 2, figsize=(10,4))
# One panel per grouping column.
for i, col in enumerate(["medical_task", "question_type"]):
    rows = []
    # The loop variable is deliberately NOT called `model`: in a notebook that
    # name is the loaded network, and rebinding it to a string breaks later
    # cells.
    for model_col in model_cols:
        # Row-wise hit/miss; a missing prediction compares unequal and counts
        # as wrong. Grouping the boolean Series directly avoids the deprecated
        # DataFrameGroupBy.apply-over-grouping-columns path.
        is_correct = combined_df[model_col] == combined_df["true_label"]
        acc = (
            is_correct
            .groupby(combined_df[col])
            .mean()
            .reset_index(name="accuracy")
        )
        acc["model"] = model_col
        rows.append(acc)
    plot_df = pd.concat(rows, ignore_index=True)
    # Wide table: one row per category, one column per model (fixed order).
    pivot_df = plot_df.pivot(index=col, columns="model", values="accuracy")[model_cols]
    pivot_df.plot(kind="bar", ax=axes[i], width=0.8, legend=False)
    axes[i].set_xlabel(col.replace("_", " ").title())
    axes[i].set_ylabel("Accuracy")
    axes[i].tick_params(axis='x', rotation=0)
plt.tight_layout()
plt.savefig("accuracy_other.pdf", format="pdf")
plt.show()

# Calculate metrics

In [None]:
final_label_df = pd.read_csv("combined_results.csv")
# Every column whose name starts with "qwen" is treated as a prediction column.
pred_cols = [c for c in final_label_df.columns if c.startswith("qwen")]
results = {}
for col in pred_cols:
    # Score each model only on the rows where it produced a prediction.
    temp_df = final_label_df.dropna(subset=[col])
    y_true = temp_df["true_label"]
    y_pred = temp_df[col]
    # zero_division=0 scores classes that were never predicted as 0 instead
    # of emitting warnings. (The confusion matrix that used to be computed
    # here was never read, so it is no longer built.)
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    # Precision, recall and F1 are macro averages over the classes.
    results[col] = {
        "accuracy": report['accuracy'],
        "precision": report['macro avg']['precision'],
        "recall": report['macro avg']['recall'],
        "f1_macro": report['macro avg']['f1-score']
    }
# One row per model, one column per metric.
metrics_df = pd.DataFrame(results).T
metrics_df