# Load Preference Dataset

In [12]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, load_from_disk, DatasetDict
from torch.utils.data import DataLoader
from torch import amp
import os
import random

# Set random seed for reproducibility
random.seed(42)
torch.manual_seed(42)

# Set device (use MPS for Apple Silicon if available, else CPU)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [13]:
# Load dataset from local disk if exists, otherwise download and sample
local_path = "./orca_dpo_sample_1000"

if os.path.exists(local_path):
    print("Loading dataset from local disk...")
    dataset = load_from_disk(local_path)
else:
    print("Downloading dataset from HuggingFace...")
    full_dataset = load_dataset("Intel/orca_dpo_pairs", split="train")
    sampled_dataset = full_dataset.shuffle(seed=42).select(range(1000))
    train_size = int(0.8 * len(sampled_dataset))
    val_size = int(0.1 * len(sampled_dataset))
    train_dataset = sampled_dataset.select(range(train_size))
    val_dataset = sampled_dataset.select(range(train_size, train_size + val_size))
    test_dataset = sampled_dataset.select(range(train_size + val_size, len(sampled_dataset)))
    dataset = DatasetDict({
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset
    })
    dataset.save_to_disk(local_path)
    print(f"Dataset saved locally at: {local_path}")

print(f"Train size: {len(dataset['train'])}")
print(f"Validation size: {len(dataset['validation'])}")
print(f"Test size: {len(dataset['test'])}")
print("\nExample sample:")
print(dataset["train"][0])

Loading dataset from local disk...
Train size: 800
Validation size: 100
Test size: 100

Example sample:
{'system': 'You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.', 'question': 'This is some data: CBS PLAY-BY-PLAY Chris Schenkel (first half) and Ray Scott (second half); 1962 NETWORK CBS.\n\nGenerate a detailed description of this data', 'chosen': "Okay, imagine you are watching a fun game on TV with your family. In this case, the game happened in 1962. Now, on TV, there are people who talk to us and tell us what is happening in the game. They help us understand the game better, just like how I'm helping you understand things right now.\n\nIn this data, there are two people who talked about the game in 1962. The first person, Chris Schenkel, talked about the game in the first half. The second person, Ray Scott, talked about the game in the second half. Both of them worked for a big TV company called CBS. So, this sentence is

# Load Model

In [14]:
# Load GPT-2 model and tokenizer
model_name = "openai-community/gpt2-medium"
policy_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device.type == "mps" else torch.float32,
    device_map="auto"
)
reference_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device.type == "mps" else torch.float32,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [15]:
# Preprocess data: format and tokenize
def format_dpo_data(example):
    # Map 'question' to 'prompt' and include 'system' in the prompt for context
    prompt = f"{example['system']}\n\n{example['question']}"
    return {
        "prompt": prompt,
        "chosen": example["chosen"],
        "rejected": example["rejected"]
    }

In [16]:
def tokenize_data(example):
    prompt_ids = tokenizer(example["prompt"],padding=True, truncation=True, max_length=256)["input_ids"]
    chosen_ids = tokenizer(example["chosen"], padding=True, truncation=True, max_length=512)["input_ids"]
    rejected_ids = tokenizer(example["rejected"], padding=True, truncation=True, max_length=512)["input_ids"]
    return {
        "prompt_ids": prompt_ids,
        "chosen_ids": chosen_ids,
        "rejected_ids": rejected_ids
    }


In [17]:
dataset = dataset.map(format_dpo_data)
print(f"first training example formatted: {dataset['train'][0]}")

dataset = dataset.map(tokenize_data)
print(f"first training example tokenized: {dataset['train'][0]}")

dataset.set_format(type="torch", columns=["prompt_ids", "chosen_ids", "rejected_ids"])
print(f"first training example torch: {dataset['train'][0]}")


first training example formatted: {'system': 'You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.', 'question': 'This is some data: CBS PLAY-BY-PLAY Chris Schenkel (first half) and Ray Scott (second half); 1962 NETWORK CBS.\n\nGenerate a detailed description of this data', 'chosen': "Okay, imagine you are watching a fun game on TV with your family. In this case, the game happened in 1962. Now, on TV, there are people who talk to us and tell us what is happening in the game. They help us understand the game better, just like how I'm helping you understand things right now.\n\nIn this data, there are two people who talked about the game in 1962. The first person, Chris Schenkel, talked about the game in the first half. The second person, Ray Scott, talked about the game in the second half. Both of them worked for a big TV company called CBS. So, this sentence is just telling us who talked about the game on TV and when they did it.

Map: 100%|██████████| 100/100 [00:00<00:00, 1044.27 examples/s]

first training example tokenized: {'system': 'You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.', 'question': 'This is some data: CBS PLAY-BY-PLAY Chris Schenkel (first half) and Ray Scott (second half); 1962 NETWORK CBS.\n\nGenerate a detailed description of this data', 'chosen': "Okay, imagine you are watching a fun game on TV with your family. In this case, the game happened in 1962. Now, on TV, there are people who talk to us and tell us what is happening in the game. They help us understand the game better, just like how I'm helping you understand things right now.\n\nIn this data, there are two people who talked about the game in 1962. The first person, Chris Schenkel, talked about the game in the first half. The second person, Ray Scott, talked about the game in the second half. Both of them worked for a big TV company called CBS. So, this sentence is just telling us who talked about the game on TV and when they did it.




In [18]:
# Custom collate function to ensure consistent tensor shapes
def collate_fn(batch):
    prompt_ids = [item["prompt_ids"] for item in batch]
    chosen_ids = [item["chosen_ids"] for item in batch]
    rejected_ids = [item["rejected_ids"] for item in batch]

    # 注意：你需要提前传入 tokenizer 实例
    prompt_ids = tokenizer.pad({"input_ids": prompt_ids}, padding=True, return_tensors="pt")["input_ids"]
    chosen_ids = tokenizer.pad({"input_ids": chosen_ids}, padding=True, return_tensors="pt")["input_ids"]
    rejected_ids = tokenizer.pad({"input_ids": rejected_ids}, padding=True, return_tensors="pt")["input_ids"]

    return {
        "prompt_ids": prompt_ids,
        "chosen_ids": chosen_ids,
        "rejected_ids": rejected_ids
    }


In [19]:
# Create DataLoader with custom collate function
train_loader = DataLoader(dataset["train"], batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(dataset["validation"], batch_size=2, shuffle=False, collate_fn=collate_fn)

DPO loss

In [20]:
# Define DPO loss computation for a single batch
def compute_dpo_loss_batch(batch, policy_model, reference_model, beta):
    # Move input tensors to device
    prompt_ids = batch["prompt_ids"].to(device)
    chosen_ids = batch["chosen_ids"].to(device)
    rejected_ids = batch["rejected_ids"].to(device)

    # Compute attention masks
    chosen_mask = (chosen_ids != tokenizer.pad_token_id).to(device)
    rejected_mask = (rejected_ids != tokenizer.pad_token_id).to(device)

    # Compute logits for chosen and rejected responses
    with torch.no_grad():
        ref_chosen_outputs = reference_model(chosen_ids, attention_mask=chosen_mask)
        ref_rejected_outputs = reference_model(rejected_ids, attention_mask=rejected_mask)
    with amp.autocast(device_type="mps" if device.type == "mps" else "cpu"):
        policy_chosen_outputs = policy_model(chosen_ids, attention_mask=chosen_mask)
        policy_rejected_outputs = policy_model(rejected_ids, attention_mask=rejected_mask)

    # Compute log probabilities
    chosen_logprobs = -F.log_softmax(policy_chosen_outputs.logits, dim=-1).mean(dim=-1)
    rejected_logprobs = -F.log_softmax(policy_rejected_outputs.logits, dim=-1).mean(dim=-1)
    ref_chosen_logprobs = -F.log_softmax(ref_chosen_outputs.logits, dim=-1).mean(dim=-1)
    ref_rejected_logprobs = -F.log_softmax(ref_rejected_outputs.logits, dim=-1).mean(dim=-1)

    # Compute DPO loss
    log_ratio_chosen = chosen_logprobs - ref_chosen_logprobs
    log_ratio_rejected = rejected_logprobs - ref_rejected_logprobs
    logits = beta * (log_ratio_chosen - log_ratio_rejected)
    loss = -F.logsigmoid(logits).mean()

    # Compute rewards for monitoring
    chosen_rewards = beta * log_ratio_chosen
    rejected_rewards = beta * log_ratio_rejected

    return loss, chosen_rewards.mean(), rejected_rewards.mean()

In [21]:
#Define DPO loss computation for entire DataLoader
def compute_dpo_loss_loader(data_loader, policy_model, reference_model, beta, num_batches=None):
    # Initialize accumulators
    total_loss, total_chosen_rewards, total_rejected_rewards = 0., 0., 0.
    if len(data_loader) == 0:
        return float("nan"), float("nan"), float("nan")

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    # Set model to evaluation mode
    policy_model.eval()
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            if i >= num_batches:
                break
            loss, chosen_rewards, rejected_rewards = compute_dpo_loss_batch(
                batch=batch,
                policy_model=policy_model,
                reference_model=reference_model,
                beta=beta
            )
            total_loss += loss.item()
            total_chosen_rewards += chosen_rewards.item()
            total_rejected_rewards += rejected_rewards.item()

    # Compute averages
    total_loss /= num_batches
    total_chosen_rewards /= num_batches
    total_rejected_rewards /= num_batches
    return total_loss, total_chosen_rewards, total_rejected_rewards

# Train loop

In [22]:
# Initialize optimizer
optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5)
num_epochs = 3
beta = 0.1

for epoch in range(num_epochs):
    # Set model to training mode
    policy_model.train()
    total_loss, total_chosen_rewards, total_rejected_rewards = 0., 0., 0.
    for batch in train_loader:
        optimizer.zero_grad()
        with amp.autocast(device_type="mps" if device.type == "mps" else "cpu"):
            loss, chosen_rewards, rejected_rewards = compute_dpo_loss_batch(
                batch=batch,
                policy_model=policy_model,
                reference_model=reference_model,
                beta=beta
            )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_chosen_rewards += chosen_rewards.item()
        total_rejected_rewards += rejected_rewards.item()

    # Compute average metrics
    avg_loss = total_loss / len(train_loader)
    avg_chosen_rewards = total_chosen_rewards / len(train_loader)
    avg_rejected_rewards = total_rejected_rewards / len(train_loader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_loss:.4f}, Chosen Rewards: {avg_chosen_rewards:.4f}, Rejected Rewards: {avg_rejected_rewards:.4f}")

    # Validation
    val_loss, val_chosen_rewards, val_rejected_rewards = compute_dpo_loss_loader(
        val_loader, policy_model, reference_model, beta
    )
    print(f"Validation Loss: {val_loss:.4f}, Chosen Rewards: {val_chosen_rewards:.4f}, Rejected Rewards: {val_rejected_rewards:.4f}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: The size of tensor a (225) must match the size of tensor b (185) at non-singleton dimension 1

In [23]:
import torch
print(torch.__version__)

2.7.0


# Save model and test

In [None]:
# Save model and tokenizer
policy_model.save_pretrained("./gpt2_medium_dpo_final")
tokenizer.save_pretrained("./gpt2_medium_dpo_final")

In [None]:
# Test set evaluation
test_loader = DataLoader(dataset["test"], batch_size=2, shuffle=False)
test_loss, test_chosen_rewards, test_rejected_rewards = compute_dpo_loss_loader(
    test_loader, policy_model, reference_model, beta
)
print(f"Test Loss: {test_loss:.4f}, Chosen Rewards: {test_chosen_rewards:.4f}, Rejected Rewards: {test_rejected_rewards:.4f}")

In [None]:
# Example inference
policy_model.eval()
prompt = dataset["test"][0]["prompt"]
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = policy_model.generate(**inputs, max_length=512)
print("\nExample generation:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))