In [1]:
model_id = "microsoft/Phi-3-mini-128k-instruct"
output_model_path = "model_artifacts/Phi-3-mini-dpo-sardukar"

train_split: str = "train[300:]"
eval_split: str = "train[:300]"

## Load datasets

In [2]:
from datasets import load_dataset
from more_itertools import first

dataset_train =  load_dataset("sardukar/physiology-mcqa-8k", split=train_split)
dataset_eval = load_dataset("sardukar/physiology-mcqa-8k", split=eval_split)

def preprocess_dpo_data(sample: dict) -> dict:
    new_sample = {
        "prompt": [first(filter(lambda msg: msg["role"] == "user", sample["prompt"]))],
        "chosen": [first(filter(lambda msg: msg["role"] == "assistant", sample["chosen"]))],
        "rejected": [first(filter(lambda msg: msg["role"] == "assistant", sample["rejected"]))],
    }
    return new_sample

dataset_train = dataset_train.map(preprocess_dpo_data)
dataset_eval = dataset_eval.map(preprocess_dpo_data)

# Dataset is already in prompt / chosen / rejected format
display(dataset_train)
display(dataset_eval)

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 8530
})

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 300
})

## Finetuning with `DPOTrainer`

In [3]:
import os
import torch
from dotenv import load_dotenv
from huggingface_hub import login
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import DPOConfig, DPOTrainer

load_dotenv("../.env")
hf_token = os.environ["HF_TOKEN"]
print(hf_token[:8] + "*" * (len(hf_token) - 13) + hf_token[-5:])

login(hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


hf_AAlNN************************hCQBo


In [4]:
peft_config = LoraConfig(
    r=16,                       # ≤8 recommended for small models
    lora_alpha=32,              # 2*r recommended for DPO
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # target_modules=['o_proj', 'qkv_proj'] #phi-3
    target_modules="all-linear"
)


In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
# tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    attn_implementation=("eager" if "gemma" in model_id else None),
)
model.config.use_cache = False

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
training_args = DPOConfig(
    # training args
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    remove_unused_columns=True,
    optim="paged_adamw_32bit",
    learning_rate=5.0e-06,
    label_smoothing_factor=0.1,
    bf16=True,
    num_train_epochs=1,
    lr_scheduler_type="cosine",
    warmup_steps=2,
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="epoch",
    output_dir=output_model_path,
    report_to="none",

    # DPO params
    beta=0.1,
    max_prompt_length=2048,
    max_length=2048,
)

In [7]:
trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval,
    processing_class=tokenizer,
    peft_config=peft_config,
)

trainer.train()

Applying chat template to train dataset:   0%|          | 0/8530 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/8530 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
100,0.1545,0.065987,3.150877,-3.505524,0.96,6.6564,-362.174164,-147.673141,18.626652,11.920962
200,0.0535,0.061781,3.893556,-4.132151,0.956667,8.025707,-354.747375,-153.939407,18.694159,12.053234
300,0.0447,0.063764,4.152649,-4.487573,0.953333,8.640222,-352.156494,-157.493652,18.722061,12.079783
400,0.0527,0.058817,4.720963,-4.433333,0.963333,9.154296,-346.473297,-156.951218,18.685066,11.998507
500,0.0403,0.052966,5.405282,-3.568981,0.973333,8.974262,-339.630096,-148.307709,18.638611,11.704249
600,0.0351,0.048772,5.380713,-4.460271,0.98,9.840984,-339.875793,-157.220612,18.714254,11.873462
700,0.0254,0.048096,5.470206,-5.119447,0.976667,10.589653,-338.980896,-163.812363,18.741816,11.990218
800,0.0344,0.051967,5.353115,-5.6459,0.973333,10.999015,-340.151764,-169.076904,18.74119,11.971866
900,0.0298,0.049517,5.143085,-6.151467,0.973333,11.294552,-342.252106,-174.132584,18.726143,11.943316
1000,0.0255,0.048269,5.128032,-6.374474,0.973333,11.502508,-342.402618,-176.36264,18.733818,11.843888


TrainOutput(global_step=2132, training_loss=0.03905861152381432, metrics={'train_runtime': 3969.6542, 'train_samples_per_second': 2.149, 'train_steps_per_second': 0.537, 'total_flos': 0.0, 'train_loss': 0.03905861152381432, 'epoch': 0.9997655334114889})

In [8]:
trainer.model.save_pretrained("model_artifacts/temp")
tokenizer.save_pretrained("model_artifacts/temp")

('model_artifacts/temp/tokenizer_config.json',
 'model_artifacts/temp/special_tokens_map.json',
 'model_artifacts/temp/tokenizer.json')

In [9]:
import gc

try: del trainer
except: print("trainer not found")

try: del model
except: print("model not found")

gc.collect()
torch.cuda.empty_cache()

In [10]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    return_dict=True,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
from peft import PeftModel

model = PeftModel.from_pretrained(base_model, "model_artifacts/temp")
model = model.merge_and_unload()

In [12]:
model.save_pretrained(output_model_path)
tokenizer.save_pretrained(output_model_path)

('model_artifacts/Phi-3-mini-dpo-sardukar/tokenizer_config.json',
 'model_artifacts/Phi-3-mini-dpo-sardukar/special_tokens_map.json',
 'model_artifacts/Phi-3-mini-dpo-sardukar/tokenizer.json')