In [None]:
# Based on https://github.com/peremartra/Large-Language-Model-Notebooks-Course/blob/main/P2-MHF/Aligning_DPO_phi3.ipynb

MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL = "models/phi-3-mini-dpo"

## Load datasets

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
dataset_original =  load_dataset(
    "argilla/distilabel-capybara-dpo-7k-binarized",
    split='train[300:2500]'
)
dataset_eval = load_dataset(
    "argilla/distilabel-capybara-dpo-7k-binarized",
    split='train[:300]'
)

original_columns = dataset_original.column_names
dataset_original

Dataset({
    features: ['source', 'conversation', 'original_response', 'generation_prompt', 'raw_generation_responses', 'new_generations', 'prompt', 'chosen', 'rejected', 'rating_chosen', 'rating_rejected', 'chosen_model', 'rejected_model'],
    num_rows: 2200
})

In [5]:
def filter_data(dataset):
    return dataset.filter(
        lambda r: r["rating_chosen"]>=4.5 and r["rating_rejected"] <= 2.5
    ).map(
        lambda r: {"messages": len(r["chosen"])}
    ).filter(
        lambda r: (
            r["messages"] < 3
            and len(r["prompt"]) + len(r["chosen"]) + len(r["rejected"]) < 3800
        )
    )

dataset_filtered = filter_data(dataset_original)
dataset_filtered

Dataset({
    features: ['source', 'conversation', 'original_response', 'generation_prompt', 'raw_generation_responses', 'new_generations', 'prompt', 'chosen', 'rejected', 'rating_chosen', 'rating_rejected', 'chosen_model', 'rejected_model', 'messages'],
    num_rows: 169
})

In [6]:
dataset_eval_filtered = filter_data(dataset_eval)
dataset_eval_filtered

Dataset({
    features: ['source', 'conversation', 'original_response', 'generation_prompt', 'raw_generation_responses', 'new_generations', 'prompt', 'chosen', 'rejected', 'rating_chosen', 'rating_rejected', 'chosen_model', 'rejected_model', 'messages'],
    num_rows: 28
})

In [7]:
def chatml_format(example):
    # get everything except the last message as input
    prompt = tokenizer.apply_chat_template(
        example["chosen"][:-1],
        tokenize=False,
        add_generation_prompt=True
    )
    # get the last assistant responses
    chosen = example["chosen"][-1]["content"] + "<|end|>\n"
    rejected = example["rejected"][-1]["content"] + "<|end|>\n"

    return {
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected,
    }

dataset = dataset_filtered.map(chatml_format, remove_columns=original_columns)
display(dataset)
dataset[12]

Dataset({
    features: ['prompt', 'chosen', 'rejected', 'messages'],
    num_rows: 169
})

{'prompt': '<|user|>\nSolve 36146684-304553543134. Only respond with math and no words.<|end|>\n<|assistant|>\n',
 'chosen': '36146684 - 304553543134 = -304517396450<|end|>\n',
 'rejected': '(36146684 / 3134) * (36146684 mod 3134) + (30455354 / 17) * (30455354 mod 17) = 11415845286790903\nAlternatively, using prime factorization and the Chinese Remainder Theorem:\n36146684 = 2^5 * 9573, 30455354 = 2 * 29 * 4171\n36146684 mod 9573 = 4332, 30455354 mod 29 = 13, 30455354 mod 4171 = 3965\n(36146684 / 9573) * 4332 + (30455354 / 29) * 13 + (30455354 / 4171) * 3965 = 11415845286790903<|end|>\n',
 'messages': 2}

In [8]:
dataset_eval = dataset_eval_filtered.map(chatml_format, remove_columns=original_columns)
display(dataset_eval)
dataset_eval[2]

Dataset({
    features: ['prompt', 'chosen', 'rejected', 'messages'],
    num_rows: 28
})

{'prompt': '<|user|>\nAssist me in calculating 9319357631 plus 595. Numbers and symbols only, please.<|end|>\n<|assistant|>\n',
 'chosen': 'The sum of 9319357631 and 595 is 9319358226.<|end|>\n',
 'rejected': 'The result of adding 9319357631 and 595 is 9319363626.<|end|>\n',
 'messages': 2}

## Finetuning with `DPOTrainer`

In [9]:
import torch
from peft import LoraConfig
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import DPOConfig, DPOTrainer

In [10]:
peft_config = LoraConfig(
    r=16,                       # ≤8 recommended for small models
    lora_alpha=32,              # 2*r recommended for DPO
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # target_modules=['o_proj', 'qkv_proj'] #phi-3
    target_modules="all-linear"
)


In [11]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config
)
model.config.use_cache = False

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
training_args = DPOConfig(
    # training args
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    remove_unused_columns=True,
    optim="paged_adamw_32bit",
    learning_rate=5.0e-06,
    bf16=True,
    num_train_epochs=1,
    lr_scheduler_type="cosine",
    warmup_steps=2,
    logging_strategy="steps",
    logging_steps=1,
    # eval_strategy="steps",
    # eval_steps=20,
    save_strategy="epoch",
    output_dir=NEW_MODEL,
    report_to="none",

    # DPO params
    beta=0.1,
    max_prompt_length=2048,
    max_length=2048,
)

In [13]:
trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset_eval,
    processing_class=tokenizer,
    peft_config=peft_config,
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
1,0.6931
2,0.6931
3,0.7198
4,0.6627
5,0.6664
6,0.6684
7,0.7088
8,0.7251


KeyboardInterrupt: 