In [None]:
model_id = "microsoft/Phi-3-mini-128k-instruct"
output_model_path = "model_artifacts/Phi-3-mini-sardukar-dpo"

train_split: str = "train[30:]"
eval_split: str = "train[:30]"

## Load datasets

In [2]:
from datasets import load_dataset
from more_itertools import first

dataset_train =  load_dataset("sardukar/physiology-mcqa-8k", split=train_split)
dataset_eval = load_dataset("sardukar/physiology-mcqa-8k", split=eval_split)

def preprocess_dpo_data(sample: dict) -> dict:
    new_sample = {
        "prompt": [first(filter(lambda msg: msg["role"] == "user", sample["prompt"]))],
        "chosen": [first(filter(lambda msg: msg["role"] == "assistant", sample["chosen"]))],
        "rejected": [first(filter(lambda msg: msg["role"] == "assistant", sample["rejected"]))],
    }
    return new_sample

dataset_train = dataset_train.map(preprocess_dpo_data)
dataset_eval = dataset_eval.map(preprocess_dpo_data)

# Dataset is already in prompt / chosen / rejected format
display(dataset_train)
display(dataset_eval)

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 8800
})

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 30
})

## Finetuning with `DPOTrainer`

In [3]:
import os
import torch
from dotenv import load_dotenv
from huggingface_hub import login
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import DPOConfig, DPOTrainer

load_dotenv("../.env")
hf_token = os.environ["HF_TOKEN"]
print(hf_token[:8] + "*" * (len(hf_token) - 13) + hf_token[-5:])

login(hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


hf_AAlNN************************hCQBo


In [4]:
peft_config = LoraConfig(
    r=16,                       # ≤8 recommended for small models
    lora_alpha=32,              # 2*r recommended for DPO
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # target_modules=['o_proj', 'qkv_proj'] #phi-3
    target_modules="all-linear"
)


In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
# tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    attn_implementation=("eager" if "gemma" in model_id else None),
)
model.config.use_cache = False

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
training_args = DPOConfig(
    # training args
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    remove_unused_columns=True,
    optim="paged_adamw_32bit",
    learning_rate=5.0e-06,
    bf16=True,
    num_train_epochs=1,
    lr_scheduler_type="cosine",
    warmup_steps=2,
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="epoch",
    output_dir=output_model_path,
    report_to="none",

    # DPO params
    beta=0.1,
    max_prompt_length=2048,
    max_length=2048,
)

In [7]:
trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval,
    processing_class=tokenizer,
    peft_config=peft_config,
)

trainer.train()

Applying chat template to train dataset:   0%|          | 0/8800 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/8800 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/30 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/30 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
100,0.1606,0.047525,2.825475,-3.90789,0.966667,6.733365,-367.076324,-153.204193,18.744724,11.917149
200,0.0414,0.039516,3.900815,-4.59355,0.966667,8.494366,-356.322876,-160.060806,18.797651,12.062738
300,0.0454,0.031118,4.823029,-4.061334,1.0,8.884363,-347.100739,-154.738632,18.799124,12.105014
400,0.0385,0.020127,5.179964,-4.35808,1.0,9.538044,-343.531433,-157.706085,18.813494,12.157629
500,0.0343,0.018343,5.631683,-4.200355,1.0,9.832037,-339.014191,-156.128876,18.752104,11.992146
600,0.0465,0.021722,5.67398,-4.590253,1.0,10.264233,-338.591248,-160.027832,18.786751,11.929935
700,0.0429,0.027271,5.627846,-5.416978,1.0,11.044826,-339.052551,-168.295105,18.819242,12.07717
800,0.0211,0.023972,5.58098,-5.842592,1.0,11.423571,-339.521179,-172.551224,18.823626,11.893851
900,0.0483,0.034623,5.666755,-6.099165,0.966667,11.765919,-338.663513,-175.116959,18.790663,11.796355
1000,0.0137,0.029784,5.9215,-5.487065,0.966667,11.408566,-336.116028,-168.995941,18.776817,11.840657


TrainOutput(global_step=2200, training_loss=0.04010157373818484, metrics={'train_runtime': 3122.1534, 'train_samples_per_second': 2.819, 'train_steps_per_second': 0.705, 'total_flos': 0.0, 'train_loss': 0.04010157373818484, 'epoch': 1.0})

In [8]:
trainer.model.save_pretrained("model_artifacts/temp")
tokenizer.save_pretrained("model_artifacts/temp")

('model_artifacts/temp/tokenizer_config.json',
 'model_artifacts/temp/special_tokens_map.json',
 'model_artifacts/temp/tokenizer.json')

In [9]:
import gc

try: del trainer
except: print("trainer not found")

try: del model
except: print("model not found")

gc.collect()
torch.cuda.empty_cache()

In [10]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    return_dict=True,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
from peft import PeftModel

model = PeftModel.from_pretrained(base_model, "model_artifacts/temp")
model = model.merge_and_unload()

In [12]:
model.save_pretrained(output_model_path)
tokenizer.save_pretrained(output_model_path)

('model_artifacts/Phi-3-mini-dpo/tokenizer_config.json',
 'model_artifacts/Phi-3-mini-dpo/special_tokens_map.json',
 'model_artifacts/Phi-3-mini-dpo/tokenizer.json')