In [None]:
!pip install transformers datasets accelerate bitsandbytes peft trl evaluate wandb -q

In [1]:
import os
import math
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import DPOConfig, DPOTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


Device: cuda


In [48]:
ds_comp = load_dataset("Intel/orca_dpo_pairs")
ds_comp
subset = ds_comp["train"].select(range(1000))

ds = subset.train_test_split(test_size=0.1, seed=42)
ds["valid"] = ds.pop("test")

ds["train"]

Dataset({
    features: ['system', 'question', 'chosen', 'rejected'],
    num_rows: 900
})

In [38]:
model_name = "unsloth/Qwen2.5-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token 

In [52]:
def format_chat_prompt(user_input, system_message=None, default_system="You are a helpful assistant."):
    parts = []
    sys_msg = (system_message or default_system or "").strip()
    if sys_msg:
        parts.append(f"<|im_start|>system\n{sys_msg}<|im_end|>\n")
    parts.append(f"<|im_start|>user\n{(user_input or '').strip()}<|im_end|>\n")
    parts.append("<|im_start|>assistant\n")  
    return "".join(parts)


In [53]:
def to_dpo_rows(ex):
    prompt = format_chat_prompt(
        user_input=ex.get("question", ""),
        system_message=ex.get("system", None),
    )
    return {
        "prompt": prompt,
        "chosen": (ex.get("chosen") or "").strip(),         
        "rejected": (ex.get("rejected") or "").strip(),      
    }

split = split.map(to_dpo_rows, remove_columns=[c for c in split["train"].column_names if c not in {"prompt","chosen","rejected"}])
print(split)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected', 'prompt'],
        num_rows: 900
    })
    valid: Dataset({
        features: ['chosen', 'rejected', 'prompt'],
        num_rows: 100
    })
})


In [55]:
ft_model_name = model_name.split('/')[1].replace("Instruct", "DPO")

training_args = DPOConfig(
    output_dir=ft_model_name,
    logging_steps=25,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    bf16=True,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy="epoch",
    eval_strategy="epoch",
    eval_steps=1,
    report_to="none"
)

device = torch.device('cuda')


In [56]:
trainer = DPOTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=split['train'],
    eval_dataset=split['valid'],
)
trainer.train()


Extracting prompt in train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
1,0.6561,0.63389,-0.018608,-0.143236,0.9,0.124627,-304.779175,-339.773468,-1.832836,-2.162092
2,0.5698,0.563502,-0.056189,-0.344709,0.95,0.288521,-305.154999,-341.788208,-1.828329,-2.15134
3,0.522,0.530544,-0.071664,-0.44442,0.95,0.372755,-305.309692,-342.785339,-1.826217,-2.147258




TrainOutput(global_step=339, training_loss=0.60055751997461, metrics={'train_runtime': 1469.2139, 'train_samples_per_second': 1.838, 'train_steps_per_second': 0.231, 'total_flos': 0.0, 'train_loss': 0.60055751997461, 'epoch': 3.0})

In [58]:
trainer.save_model("./qwen_dpo_orca")
ft_model = trainer.model

In [63]:
from transformers import pipeline

model_path = "./qwen_dpo_orca"
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

prompt = split["valid"][0]["prompt"]

inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

print("Prompt:\n", prompt)
print("\nModel Output:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))
print("\nChosen Reference:\n", example["chosen"])
print("\nRejected Reference:\n", example["rejected"])


Prompt:
 <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|im_end|>
<|im_start|>assistant


Model Output:
 system
You are a helpful assistant.
user

assistant
I'm here to help! What would you like assistance with?

Chosen Reference:
 1. South Africans were waiting for the announcement of election results in which the ruling African National Congress (ANC) was expected to win in a landslide, with ANC leader Jacob Zuma being the likely next president.
2. With one-third of the ballots counted, South African media reported that the ANC had garnered around 65% of the vote, while the opposition Democratic Alliance seemed poised to take control of Western Cape province.
3. Jacob Zuma addressed supporters at ANC headquarters in Johannesburg, cautioning against early celebrations but claiming the party to be like an "elephant" that could not be toppled.
4. Final election results were anticipated on Saturday, with manual vote counting being the norm in the country.
5. 

In [64]:
eval_results = trainer.evaluate()
print(eval_results)


{'eval_loss': 0.5305436849594116, 'eval_runtime': 20.4304, 'eval_samples_per_second': 4.895, 'eval_steps_per_second': 2.447, 'eval_rewards/chosen': -0.07166408747434616, 'eval_rewards/rejected': -0.4444195032119751, 'eval_rewards/accuracies': 0.949999988079071, 'eval_rewards/margins': 0.3727554380893707, 'eval_logps/chosen': -305.3096923828125, 'eval_logps/rejected': -342.78533935546875, 'eval_logits/chosen': -1.8262170553207397, 'eval_logits/rejected': -2.1472580432891846, 'epoch': 3.0}


In [66]:
from torch.nn import functional as F
import numpy as np

def score_prompt(model, tokenizer, prompt, response):
    text = prompt + response
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss.item()
    return -loss  # امتیاز بالاتر بهتر است

scores = []
for ex in split["valid"].select(range(50)):  # روی ۵۰ نمونه تست می‌کنیم
    s_chosen = score_prompt(model, tokenizer, ex["prompt"], ex["chosen"])
    s_rejected = score_prompt(model, tokenizer, ex["prompt"], ex["rejected"])
    scores.append(s_chosen > s_rejected)

accuracy = np.mean(scores)
print(f"Preference accuracy on validation: {accuracy*100:.2f}%")


Preference accuracy on validation: 16.00%
