In [2]:
from unsloth import FastModel
import torch
import torch.nn as nn
from datasets import load_dataset
import re
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM,
    TextStreamer
)
from typing import Dict, List
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm
from datasets import Dataset as HFDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import datetime
import time

from transformers import TrainingArguments
from trl import DPOTrainer, DPOConfig
from unsloth import is_bfloat16_supported
# One must patch the DPO Trainer first!
from unsloth import PatchDPOTrainer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-24 00:14:38 [__init__.py:256] Automatically detected platform cuda.


In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [3]:
# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Load the first 1% of the train split of the stanfordnlp/shp dataset.
dataset = load_dataset(
    path="stanfordnlp/shp",
    split="train[:1%]",
)
dataset

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Dataset({
    features: ['post_id', 'domain', 'upvote_ratio', 'history', 'c_root_id_A', 'c_root_id_B', 'created_at_utc_A', 'created_at_utc_B', 'score_A', 'score_B', 'human_ref_A', 'human_ref_B', 'labels', 'seconds_difference', 'score_ratio'],
    num_rows: 3487
})

In [5]:
# Format the dataset for the DPO trainer: every row gains a "prompt" column
# plus a "chosen" / "rejected" pair of responses.
def _to_preference_pair(example):
    """Build the prompt/chosen/rejected fields for one dataset row.

    The prompt is the row's `history`. `human_ref_A` is the chosen response
    when `labels == 1` and the rejected response when `labels == 0`; in every
    other case `human_ref_B` fills that slot.
    """
    label = example["labels"]

    chosen = example["human_ref_B"]
    if label == 1:
        chosen = example["human_ref_A"]

    rejected = example["human_ref_B"]
    if label == 0:
        rejected = example["human_ref_A"]

    return {"prompt": example["history"], "chosen": chosen, "rejected": rejected}


dataset = dataset.map(_to_preference_pair)
dataset[0]

{'post_id': 'himc90',
 'domain': 'askacademia_train',
 'upvote_ratio': 0.99,
 'history': 'In an interview right before receiving the 2013 Nobel prize in physics, Peter Higgs stated that he wouldn\'t be able to get an academic job today, because he wouldn\'t be regarded as productive enough. > By the time he retired in 1996, he was uncomfortable with the new academic culture. "After I retired it was quite a long time before I went back to my department. I thought I was well out of it. It wasn\'t my way of doing things any more. Today I wouldn\'t get an academic job. It\'s as simple as that. I don\'t think I would be regarded as productive enough."  Another interesting quote from the article is the following:  > He doubts a similar breakthrough could be achieved in today\'s academic culture, because of the expectations on academics to collaborate and keep churning out papers. He said: "It\'s difficult to imagine how I would ever have enough peace and quiet in the present sort of climate 

In [6]:
# Load in Gemma 3
# Configuration for model loading and for the sequence-length limits used later.
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+ (not passed to from_pretrained below)
# Single source of truth for 4bit loading. Kept at False so the cell loads the
# model exactly as before (the call previously hard-coded False while this
# variable claimed True). Set to True to use 4bit quantization and reduce memory.
load_in_4bit = False
max_prompt_length = 256
max_seq_length = 1024

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-1b-it",
    max_seq_length = max_seq_length, # Choose any for long context!
    load_in_4bit = load_in_4bit,  # Driven by the config variable above
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

==((====))==  Unsloth 2025.3.17: Fast Gemma3 patching. Transformers: 4.50.0.dev0. vLLM: 0.8.1.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.999 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


In [7]:
# Wrap the loaded model with LoRA adapters; only the text side is fine-tuned.
model = FastModel.get_peft_model(
    model,
    # Which parts of the network receive adapters.
    finetune_vision_layers=False,     # text-only training, vision layers stay off
    finetune_language_layers=True,    # should stay on
    finetune_attention_modules=True,  # adapt the attention modules
    finetune_mlp_modules=True,        # should always stay on
    # LoRA hyperparameters.
    r=8,             # larger = higher accuracy, but might overfit
    lora_alpha=8,    # recommended: alpha at least equal to r
    lora_dropout=0,
    bias="none",
    random_state=3407,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [8]:
# Apply Unsloth's DPO trainer patch; this runs before the DPOTrainer is constructed.
PatchDPOTrainer()

In [None]:
# Build the DPO trainer on the LoRA-wrapped model.
# `beta`, `max_length` and `max_prompt_length` are DPOConfig fields, so they are
# set on the config instead of being passed as DPOTrainer keyword arguments, and
# the tokenizer is passed as `processing_class` (the `tokenizer` keyword is
# deprecated in TRL).
dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,  # no separate reference model is supplied
    args = DPOConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 2,
        warmup_ratio = 0.1,
        num_train_epochs = 1,
        learning_rate = 5e-6,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
        beta = 0.1,
        max_length = max_seq_length,            # limit for prompt + response
        max_prompt_length = max_prompt_length,  # limit for the prompt alone
    ),
    train_dataset = dataset,
    # No eval_dataset is passed: only the train split is loaded in this notebook.
    processing_class = tokenizer,
)

Applying chat template to train dataset (num_proc=12):   0%|          | 0/3487 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=12):   0%|          | 0/3487 [00:00<?, ? examples/s]

In [None]:
# Start training; the value returned by train() is shown as this cell's output.
dpo_trainer.train()

In [None]:
# Save the model to a local directory
# The trainer object in this notebook is named `dpo_trainer`; the previous
# `trainer` name was never defined and raised a NameError.
output_path = "gemma_dpo"
dpo_trainer.save_model(output_path)
print(f"Model saved to {output_path}")