### Install Packages

In [5]:
!pip install "trl>=0.7.0" "datasets>=2.14.0" "torch>=2.0.0" --quiet
!pip install "accelerate>=0.24.0" "peft>=0.7.0" "trackio==0.5.0" --quiet
!pip install bitsandbytes --quiet
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m106.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB

### Imports

In [None]:
from kagglehub import model_download, KaggleDatasetAdapter, load_dataset
import os
from peft import LoraConfig, PeftModel
import torch
import trackio
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed, TrainingArguments
from trl import SFTTrainer, SFTConfig

from kaggle_secrets import UserSecretsClient
secret_label = "HF_TOKEN" # your huggingface token label in a Kaggle secret
HF_TOKEN = UserSecretsClient().get_secret(secret_label)

2025-11-03 10:32:04.527228: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762165924.738518      57 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762165924.807156      57 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Set Seed

In [7]:
SEED = 42

# Set seed for reproducibility
set_seed(SEED)

### Choose Device

In [8]:
if torch.cuda.is_available():
    device = "cuda"
    print(f"Using CUDA GPU: {torch.cuda.get_device_name()}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")

else:
    device = "cpu"
    print("Using CPU - you will need to use a GPU to train models")

Using CUDA GPU: Tesla P100-PCIE-16GB
GPU memory: 17.1GB


### Load Dataset

In [9]:
DATASET_PATH = "bingxuanchia/dsa4213-medquad-processed-dataset"

# Load and verify dataset
medquad_train = load_dataset(
    KaggleDatasetAdapter.HUGGING_FACE,
    DATASET_PATH,
    "train.csv",
)

medquad_val = load_dataset(
    KaggleDatasetAdapter.HUGGING_FACE,
    DATASET_PATH,
    "val.csv",
)

medquad_test = load_dataset(
    KaggleDatasetAdapter.HUGGING_FACE,
    DATASET_PATH,
    "test.csv",
)

  medquad_train = load_dataset(
  medquad_val = load_dataset(
  medquad_test = load_dataset(


### Format Dataset

- Load pre-processed training set from Kaggle.
- Convert training set into a conversational format. This is a list of message pairs, where each pair consists of a question from the user and an answer from the assistant.

In [10]:
def format_qa_dataset(example, question_col, answer_col):
    """Formats QA datasets into chat format"""
    formatted_message_pairs = []
    
    for question, answer in zip(example[question_col], example[answer_col]):
        messages = [
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer}
        ]
        formatted_message_pairs.append(messages)
    
    return {"messages": formatted_message_pairs}

In [11]:
medquad_train_formatted = medquad_train.map(
    format_qa_dataset,
    batched=True,
    fn_kwargs={"question_col": "question", "answer_col": "answer"},
    remove_columns=medquad_train.column_names
)

medquad_val_formatted = medquad_val.map(
    format_qa_dataset,
    batched=True,
    fn_kwargs={"question_col": "question", "answer_col": "answer"},
    remove_columns=medquad_val.column_names
)

medquad_test_formatted = medquad_test.map(
    format_qa_dataset,
    batched=True,
    fn_kwargs={"question_col": "question", "answer_col": "answer"},
    remove_columns=medquad_val.column_names
)

Map:   0%|          | 0/11156 [00:00<?, ? examples/s]

Map:   0%|          | 0/1394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1396 [00:00<?, ? examples/s]

In [12]:
# 'messages' here are formatted QA pairs of user and 
# assistant messages for SFT
medquad_train_formatted["messages"][0]

[{'content': 'What are the treatments for Refsum Disease ?', 'role': 'user'},
 {'content': 'The primary treatment for ARD is to restrict or avoid foods that contain phytanic acid, including dairy products; beef and lamb; and fatty fish such as tuna, cod, and haddock. Some individuals may also require plasma exchange (plasmapheresis) in which blood is drawn, filtered, and reinfused back into the body, to control the buildup of phytanic acid.',
  'role': 'assistant'}]

### Fine-Tuning

#### Select Models

In [13]:
# Choose base model
# model_name = "HuggingFaceTB/SmolLM2-1.7B"

# Choose instruct model which will be used to make ChatTemplate
instruct_model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

# Set name of finetuned model
finetuned_model_name = "SmolLM2-1.7B-Instruct-MediLite-QA-Rank8-Quantized-HighLR"

#### Training Configurations

In [14]:
# Path which will store the saved weights
MODEL_WEIGHTS_PATH = "weights"
os.makedirs(MODEL_WEIGHTS_PATH, exist_ok=True)

In [None]:
# To conduct experiment tracking with trackio
# Dashboard viewable on Hugging Face Spaces
os.environ["HF_TOKEN"] = HF_TOKEN
os.environ["TRACKIO_SPACE_ID"] = "Jiahao123/MediLiteQA"
os.environ["TRACKIO_PROJECT"] = "medilite-finetuning"

HF_REPO_ID = f"Jiahao123/{finetuned_model_name}"

In [16]:
# Quantization configurations
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Configure training parameters
training_config = SFTConfig(
    # Model and data
    output_dir=os.path.join(MODEL_WEIGHTS_PATH, finetuned_model_name),
    dataset_text_field="messages",
    max_length=2048,
    chat_template_path=instruct_model_name,
    
    # Training hyperparameters
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    gradient_accumulation_steps=8,
    learning_rate=5e-5, # set higher learning rate
    num_train_epochs=5,
    seed=SEED,
    
    # Optimization
    warmup_steps=50,
    weight_decay=0.01,
    optim="adamw_torch",
    
    # Logging and saving
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    
    # Memory optimization
    dataloader_num_workers=0,
    group_by_length=True,  # Group similar length sequences

    # Hugging Face Hub integration
    push_to_hub=True,  # Set to True to upload to Hub
    hub_model_id=HF_REPO_ID,

    # Experiment tracking
    report_to=["trackio"],
    run_name=finetuned_model_name,   # Set run name
)

print("Training configuration set!")
print(f"Effective batch size: {training_config.per_device_train_batch_size * training_config.gradient_accumulation_steps}")

print("Training configuration set!")
print(f"Effective batch size: {training_config.per_device_train_batch_size * training_config.gradient_accumulation_steps}")

In [None]:
# LoRA configurations
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

#### Load Model

#### Training

In [None]:
# Load base model
print(f"Loading {instruct_model_name}...")

model = AutoModelForCausalLM.from_pretrained(
    instruct_model_name,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(instruct_model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_size = "right"

print(f"Base model {instruct_model_name} loaded! Parameters: {model.num_parameters():,}")

In [None]:
# Create SFTTrainer with LoRA enabled
trainer = SFTTrainer(
    model=model,
    train_dataset=medquad_train_formatted,
    eval_dataset=medquad_val_formatted,
    args=training_config,
    peft_config=peft_config
)

print("Number of trainable parameters after LoRA:")
trainer.model.print_trainable_parameters()

In [None]:
print("Starting fine tuning...")
trainer.train()

Starting fine tuning...


The `TRACKIO_PROJECT` environment variable is deprecated and will be removed in a future version. Use TrainingArguments.project instead.
The `TRACKIO_SPACE_ID` environment variable is deprecated and will be removed in a future version. Use TrainingArguments.trackio_space_id instead.


* Trackio project initialized: medilite-finetuning
* Trackio metrics will be synced to Hugging Face Dataset: Jiahao123/MediLiteQA-dataset
* Found existing space: https://huggingface.co/spaces/Jiahao123/MediLiteQA
* View dashboard by going to: https://Jiahao123-MediLiteQA.hf.space/


* Created new run: SmolLM2-1.7B-Instruct-MediLite-QA-Rank8-Quantized-HighLR


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,1.2233,1.279015,1.256274,3172757.0,0.696983
2,1.0496,1.173237,1.137791,6345514.0,0.718449
3,0.9626,1.148381,1.127664,9518271.0,0.72256
4,0.8778,1.135756,1.116395,12691028.0,0.725041
5,0.8948,1.133617,1.116278,15863785.0,0.725572


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


* Run finished. Uploading logs to Trackio (please wait...)


TrainOutput(global_step=1745, training_loss=1.2323178169720495, metrics={'train_runtime': 39578.5132, 'train_samples_per_second': 1.409, 'train_steps_per_second': 0.044, 'total_flos': 1.54503750494208e+17, 'train_loss': 1.2323178169720495, 'epoch': 5.0})

In [None]:
trainer.push_to_hub()
tokenizer.push_to_hub(HF_REPO_ID)

#### Zip Outputs for Download

In [None]:
!zip -r finetuning_outputs.zip /kaggle/working

### Determine Testing Loss

In [17]:
HF_REPO_ID

'Jiahao123/SmolLM2-1.7B-Instruct-MediLite-QA-Rank8-Quantized-HighLR'

In [18]:
model = AutoModelForCausalLM.from_pretrained(
    instruct_model_name,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True
)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(instruct_model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load LoRA weights
model = PeftModel.from_pretrained(model, HF_REPO_ID)

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/839 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

In [19]:
os.environ["WANDB_DISABLED"] = "true"

# Configure training parameters
training_config = SFTConfig(
    # Model and data
    output_dir=os.path.join(MODEL_WEIGHTS_PATH, finetuned_model_name),
    dataset_text_field="messages",
    max_length=2048,
    chat_template_path=instruct_model_name,
    
    # Training hyperparameters
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    gradient_accumulation_steps=8,
    learning_rate=5e-5, # set higher learning rate
    num_train_epochs=5,
    seed=SEED,
    
    # Optimization
    warmup_steps=50,
    weight_decay=0.01,
    optim="adamw_torch",
    
    # Logging and saving
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    
    # Memory optimization
    dataloader_num_workers=0,
    group_by_length=True,  # Group similar length sequences

    # Hugging Face Hub integration
    push_to_hub=False,  # Set to True to upload to Hub
    # hub_model_id=HF_REPO_ID,

    # Experiment tracking
    report_to=["none"],
    run_name=finetuned_model_name,   # Set run name
)

trainer = SFTTrainer(
    model=model,
    train_dataset=medquad_train_formatted,
    eval_dataset=medquad_test_formatted, # use test set as eval dataset
    args=training_config,
    # peft_config=peft_config
)

Tokenizing train dataset:   0%|          | 0/11156 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/11156 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1396 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1396 [00:00<?, ? examples/s]

In [20]:
trainer.evaluate()

{'eval_loss': 1.1401904821395874,
 'eval_model_preparation_time': 0.0024,
 'eval_runtime': 331.11,
 'eval_samples_per_second': 4.216,
 'eval_steps_per_second': 0.529,
 'eval_entropy': 1.1269219660758971,
 'eval_num_tokens': 0.0,
 'eval_mean_token_accuracy': 0.7233400535583496}