<a href="https://colab.research.google.com/github/yblee110/smol-course-modulabs/blob/main/study/yblee110/2_preference_alignment/notebooks/dpo_finetuning_example_yblee.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the requirements in Google Colab
!pip install transformers datasets trl huggingface_hub

# Authenticate to Hugging Face

from huggingface_hub import login

login()

# for convenience you can create an environment variable containing your hub token as HF_TOKEN

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-p

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import DPOTrainer, DPOConfig

In [3]:
# Load dataset

# TODO: 🦁🐕 change the dataset to one of your choosing
dataset = load_dataset(path="trl-lib/ultrafeedback_binarized", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/643 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/131M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/62135 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
# TODO: 🐕 If your dataset is not represented as conversation lists, you can use the `process_dataset` function to convert it.

In [5]:
# TODO: 🦁 change the model to the path or repo id of the model you trained in [1_instruction_tuning](../../1_instruction_tuning/notebooks/sft_finetuning_example.ipynb)

model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# Model to fine-tune
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    torch_dtype=torch.float32,
).to(device)
model.config.use_cache = False
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Set our name for the finetune to be saved &/ uploaded to
finetune_name = "SmolLM2-FT-DPO"
finetune_tags = ["smol-course", "module_1"]

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

In [6]:
# Training arguments
training_args = DPOConfig(
    # Training batch size per GPU
    per_device_train_batch_size=4,
    # Number of updates steps to accumulate before performing a backward/update pass
    # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    gradient_accumulation_steps=4,
    # Saves memory by not storing activations during forward pass
    # Instead recomputes them during backward pass
    gradient_checkpointing=True,
    # Base learning rate for training
    learning_rate=5e-5,
    # Learning rate schedule - 'cosine' gradually decreases LR following cosine curve
    lr_scheduler_type="cosine",
    # Total number of training steps
    max_steps=200,
    # Disables model checkpointing during training
    save_strategy="no",
    # How often to log training metrics
    logging_steps=1,
    # Directory to save model outputs
    output_dir="smol_dpo_output",
    # Number of steps for learning rate warmup
    warmup_steps=100,
    # Use bfloat16 precision for faster training
    bf16=True,
    # Disable wandb/tensorboard logging
    report_to="none",
    # Keep all columns in dataset even if not used
    remove_unused_columns=False,
    # Enable MPS (Metal Performance Shaders) for Mac devices
    use_mps_device=device == "mps",
    # Model ID for HuggingFace Hub uploads
    hub_model_id=finetune_name,
    # DPO-specific temperature parameter that controls the strength of the preference model
    # Lower values (like 0.1) make the model more conservative in following preferences
    beta=0.1,
    # Maximum length of the input prompt in tokens
    max_prompt_length=1024,
    # Maximum combined length of prompt + response in tokens
    max_length=1536,
)

In [7]:
trainer = DPOTrainer(
    # The model to be trained
    model=model,
    # Training configuration from above
    args=training_args,
    # Dataset containing preferred/rejected response pairs
    train_dataset=dataset,
    # Tokenizer for processing inputs
    processing_class=tokenizer,
    # DPO-specific temperature parameter that controls the strength of the preference model
    # Lower values (like 0.1) make the model more conservative in following preferences
    # beta=0.1,
    # Maximum length of the input prompt in tokens
    # max_prompt_length=1024,
    # Maximum combined length of prompt + response in tokens
    # max_length=1536,
)

Extracting prompt in train dataset:   0%|          | 0/62135 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/62135 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/62135 [00:00<?, ? examples/s]

In [8]:
trainer.train()

# Save the model
trainer.save_model(f"./{finetune_name}")

# Save to the huggingface hub if login (HF_TOKEN is set)
if os.getenv("HF_TOKEN"):
    trainer.push_to_hub(tags=finetune_tags)

Step,Training Loss
1,0.6931
2,0.6931
3,0.6866
4,0.6915
5,0.6926
6,0.6787
7,0.6847
8,0.6932
9,0.6966
10,0.7016


------

In [1]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
!pip install transformers datasets trl huggingface_hub

import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import DPOTrainer, DPOConfig

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-p

In [4]:
# Load dataset

# TODO: 🦁🐕 change the dataset to one of your choosing
dataset = load_dataset(path="kuotient/orca-math-korean-dpo-pairs", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/162M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/192848 [00:00<?, ? examples/s]

In [5]:
print(dataset['question'][0])

정국이 5위입니다. 정국보다 결승선을 먼저 통과한 사람의 수를 찾아보세요.


In [6]:
model_name = "google/gemma-2-2b-it"

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# Model to fine-tune
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    torch_dtype=torch.float32,
).to(device)
model.config.use_cache = False
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Set our name for the finetune to be saved &/ uploaded to
finetune_name = "gemma2-FT-DPO"
finetune_tags = ["smol-course", "module_1"]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [7]:
def process_dataset(sample):
    question = sample['question']
    chosen = sample['chosen']
    rejected = sample['rejected']
    sample_chosen_messages = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": chosen},
    ]
    sample_rejected_messgaes = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": rejected},
    ]

    chosen_text = tokenizer.apply_chat_template(sample_chosen_messages,tokenize=False)
    rejected_text = tokenizer.apply_chat_template(sample_rejected_messgaes,tokenize=False)
    sample['chosen'] = chosen_text
    sample['rejected'] = rejected_text
    return sample

ds = dataset.map(process_dataset)

Map:   0%|          | 0/192848 [00:00<?, ? examples/s]

In [8]:
# Training arguments
training_args = DPOConfig(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=50,
    save_strategy="no",
    logging_steps=1,
    output_dir="smol_dpo_output",
    warmup_steps=100,
    bf16=True,
    report_to="none",
    remove_unused_columns=False,
    use_mps_device=device == "mps",
    hub_model_id=finetune_name,
    beta=0.1,
    max_prompt_length=1024,
    max_length=1536,
)

In [9]:
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    processing_class=tokenizer,
)

Extracting prompt in train dataset:   0%|          | 0/192848 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/192848 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/192848 [00:00<?, ? examples/s]

In [10]:
trainer.train()

It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.73 GiB. GPU 0 has a total capacity of 39.56 GiB of which 5.13 GiB is free. Process 52139 has 34.42 GiB memory in use. Of the allocated memory 33.37 GiB is allocated by PyTorch, and 567.48 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)