### Install Packages

In [1]:
!pip install "datasets>=2.14.0" "torch>=2.0.0" --quiet
!pip install bitsandbytes evaluate huggingface_hub transformers --quiet
!pip install -U peft --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m87.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

### Imports

In [2]:
from kagglehub import model_download, KaggleDatasetAdapter, load_dataset
from kaggle_secrets import UserSecretsClient
import copy
import evaluate
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, set_seed, TrainingArguments

from datasets import Dataset

2025-11-10 04:13:42.257515: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762748022.436645      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762748022.488805      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Set Seed

In [3]:
SEED = 42

### Choose Device

In [4]:
if torch.cuda.is_available():
    device = "cuda"
    print(f"Using CUDA GPU: {torch.cuda.get_device_name()}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")

else:
    device = "cpu"
    print("Using CPU - you will need to use a GPU to train models")

Using CUDA GPU: Tesla P100-PCIE-16GB
GPU memory: 17.1GB


### Load Dataset

In [5]:
TEST_SET_PATH = "bingxuanchia/dsa4213-medquad-processed-dataset"

# Load and verify dataset
medquad_test = load_dataset(
    KaggleDatasetAdapter.HUGGING_FACE,
    TEST_SET_PATH,
    "test.csv",
)

  medquad_test = load_dataset(


In [6]:
# Convert to pandas for grouping
medquad_test_pd = medquad_test.to_pandas()

# Group by question and aggregate answers into lists
medquad_test_pd_grouped = medquad_test_pd.groupby("question")["answer"].apply(list).reset_index()

# Convert back to Hugging Face dataset
medquad_test = Dataset.from_pandas(medquad_test_pd_grouped)

### Configurations

In [7]:
hf_token = UserSecretsClient().get_secret("HF_TOKEN")
os.environ["HF_TOKEN"] = hf_token

### Helpers

In [8]:
def get_response_to_query(generator, question):
    set_seed(SEED)

    messages = [{"role": "user", "content": question}]
    output = generator(messages, max_new_tokens=256, return_full_text=False)[0]

    return output["generated_text"]

In [9]:
def generate_responses(batch, generator, max_new_tokens=256):
    # set seed before generation
    set_seed(42)
    # Format each question into a chat message list
    formatted_inputs = [[{"role": "user", "content": q}] for q in batch["question"]]
    
    # Generate batched outputs
    outputs = generator(
        formatted_inputs,
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        batch_size=len(formatted_inputs)
    )
    
    # Extract the generated text from each result
    batch["responses"] = [out[0]["generated_text"] for out in outputs]
    return batch


def get_testset_with_responses(testset, model_names):
    results_dataset = copy.deepcopy(testset)

    for model_name in model_names:
        print(f"Generating responses for {model_name}...")
        # Quantization configurations
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        
        if model_name == "HuggingFaceTB/SmolLM2-1.7B-Instruct-Quantized":  
            quantized_model = AutoModelForCausalLM.from_pretrained(
                "HuggingFaceTB/SmolLM2-1.7B-Instruct",
                quantization_config=quantization_config,
            )

            tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
            
            generator = pipeline(
                "text-generation",
                model=quantized_model,
                tokenizer=tokenizer,
            )
        else:
            quantized_model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
            )
            
            generator = pipeline(
                "text-generation",
                model=model_name,
                device='cuda'
            )

        # Use dataset.map for batched parallel processing
        test_with_responses = results_dataset.map(
            lambda batch: generate_responses(batch, generator),
            batched=True,
            batch_size=32,   # adjust for GPU memory
        )

        model_label = model_name.split("/")[-1]
        model_col_name = f"{model_label}_responses"
        results_dataset = test_with_responses.rename_column("responses", model_col_name)

    return results_dataset

### Response Generation

In [10]:
# Set model names
final_model_names = [
    "HuggingFaceTB/SmolLM2-1.7B-Instruct-Quantized",
    "Jiahao123/SmolLM2-1.7B-Instruct-MediLite-QA-Rank8-Quantized-HighLR", # Best finetuned model
    "Jiahao123/medilite-grpo-v1" # RL model
]

In [None]:
final_results_dataset = get_testset_with_responses(
    testset=medquad_test,
    model_names=final_model_names
)

# NOTE: uncomment to push when running
HF_REPO_ID = "Cowboygarage/MediLite-QA-Response-Evaluation"
final_results_dataset.push_to_hub(HF_REPO_ID)

Generating responses for HuggingFaceTB/SmolLM2-1.7B-Instruct-Quantized...


config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

Device set to use cuda:0


Map:   0%|          | 0/1383 [00:00<?, ? examples/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Generating responses for Jiahao123/SmolLM2-1.7B-Instruct-MediLite-QA-Rank8-Quantized-HighLR...


adapter_config.json:   0%|          | 0.00/839 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

Device set to use cuda


Map:   0%|          | 0/1383 [00:00<?, ? examples/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Generating responses for Jiahao123/medilite-grpo-v1...


adapter_config.json:   0%|          | 0.00/888 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/919 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/368 [00:00<?, ?B/s]

Device set to use cuda


Map:   0%|          | 0/1383 [00:00<?, ? examples/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/532 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/Cowboygarage/MediLite-QA-Response-Evaluation/commit/fa6be8b96e3abe41ce17285531e00080097c1b6d', commit_message='Upload dataset', commit_description='', oid='fa6be8b96e3abe41ce17285531e00080097c1b6d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Cowboygarage/MediLite-QA-Response-Evaluation', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Cowboygarage/MediLite-QA-Response-Evaluation'), pr_revision=None, pr_num=None)