# Does Profiling Preference Data Improve DPO Training?

**Hypothesis**: Filtering low-quality pairs (easy pairs, near-duplicates, low-contrast) from a preference dataset before DPO training yields better reward accuracy than training on the full unfiltered dataset.

**Setup**: Qwen2.5-1.5B-Instruct, LoRA (r=16), UltraFeedback 5K subset, T4 GPU (Colab free tier).

**Method**: Profile with preflight → filter flagged pairs → train DPO on full vs. filtered → compare reward accuracy + loss on held-out eval set.

In [None]:
!pip install git+https://github.com/yudduy/preflight.git -q
!pip install trl transformers datasets peft accelerate bitsandbytes -q

## 1. Load Dataset + Train/Eval Split

In [None]:
from datasets import load_dataset
import json

# Pull the first 5,000 preference pairs of the binarized UltraFeedback train split.
ds = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split="train_prefs[:5000]")
print(f"Loaded {len(ds)} preference pairs")

# Hold out 200 for evaluation: the first 200 rows become the eval set,
# everything after them is used for training.
eval_indices = range(200)
train_indices = range(200, len(ds))
ds_eval = ds.select(eval_indices)
ds_train = ds.select(train_indices)
print(f"Train: {len(ds_train)}, Eval: {len(ds_eval)}")

# Save train split as JSONL for preflight: one JSON object per line, written
# in ds_train order and restricted to the three preference fields.
jsonl_fields = ("prompt", "chosen", "rejected")
with open("ultrafeedback_train.jsonl", "w") as f:
    f.writelines(
        json.dumps({field: row[field] for field in jsonl_fields}) + "\n"
        for row in ds_train
    )

## 2. Run Preflight Audit

In [None]:
!preflight audit ultrafeedback_train.jsonl --output audit_report.json

import json
# Read the JSON report from audit_report.json into `report`.
with open("audit_report.json") as report_file:
    report = json.load(report_file)

# Headline numbers, one line per check.
print("\n=== Audit Summary ===")
print(f"Samples: {report['metadata']['n_samples']}")

length_bias = report["length_bias"]
print(f"Length biased: {length_bias['biased']} (p={length_bias['p_value']:.4f})")

print(f"Low-contrast pairs: {report['embedding_similarity']['low_contrast_count']}")
print(f"Easy pairs: {report['easy_pairs']['count']}")

dedup_stats = report["dedup"]
print(f"Exact duplicates: {dedup_stats['exact_duplicate_count']}")
print(f"Near-duplicate pairs: {dedup_stats['near_duplicate_count']}")
print()

# Recommendations are optional in the report; print nothing if absent.
recommendations = report.get("recommendations", [])
for recommendation in recommendations:
    print(f"  → {recommendation}")

## 3. Filter Dataset Based on Profiler Findings

In [None]:
from datasets import Dataset

# Cosine-similarity cutoff above which a chosen/rejected pair is treated as
# low-contrast and dropped.
LOW_CONTRAST_THRESHOLD = 0.9

# Collect all flagged indices.
# NOTE(review): assumes preflight indices are 0-based row positions in
# ultrafeedback_train.jsonl, which was written in ds_train order — confirm.
remove = set()
remove.update(report.get("easy_pairs", {}).get("indices", []))

# Low-contrast pairs (similarity > LOW_CONTRAST_THRESHOLD)
es = report.get("embedding_similarity", {})
if es.get("low_contrast_count", 0) > 0:
    # Re-run to get per-pair data since the report only has aggregate stats
    from preflight.loader import load_dataset as pf_load
    from preflight.embeddings import analyze_embedding_similarity
    import numpy as np

    samples, _, _ = pf_load("ultrafeedback_train.jsonl")
    # Only the per-pair similarities are needed; the other return values
    # (including the embeddings) are discarded.
    _, _, _, sims = analyze_embedding_similarity(samples)
    # np.asarray makes the comparison work even if sims comes back as a
    # plain Python sequence rather than an ndarray.
    low_contrast_idx = np.where(np.asarray(sims) > LOW_CONTRAST_THRESHOLD)[0]
    remove.update(int(i) for i in low_contrast_idx)

print(f"Flagged indices: {len(remove)} / {len(ds_train)}")
print(f"  Easy pairs: {report.get('easy_pairs', {}).get('count', 0)}")
print(f"  Low-contrast: {es.get('low_contrast_count', 0)}")
print(f"  (some may overlap)")

# Use select() instead of rebuilding from a list of rows: it keeps the
# original features/schema (even when every row is filtered out) and avoids
# materialising each row in Python.
keep_idx = [i for i in range(len(ds_train)) if i not in remove]
ds_filtered = ds_train.select(keep_idx)
print(f"\nFiltered: {len(ds_filtered)} pairs (removed {len(ds_train) - len(ds_filtered)})")

## 4. DPO Training Setup

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer
from peft import LoraConfig
import torch

# Base checkpoint used for both training runs.
MODEL = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Fall back to the EOS token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter settings shared by both runs: rank-16 adapters on the
# q_proj / v_proj modules only.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
)

def make_dpo_config(output_dir):
    """Build the DPOConfig used for a training run.

    Every setting except ``output_dir`` is fixed here, so runs created through
    this helper differ only in where they write.

    Args:
        output_dir: Directory passed through to ``DPOConfig(output_dir=...)``.

    Returns:
        A ``DPOConfig`` for one epoch with evaluation every 50 steps and
        checkpoint saving disabled.
    """
    # Probe once; exactly one of bf16 / fp16 ends up enabled.
    use_bf16 = torch.cuda.is_bf16_supported()

    settings = {
        "num_train_epochs": 1,
        "per_device_train_batch_size": 2,
        "per_device_eval_batch_size": 2,
        "gradient_accumulation_steps": 4,
        "learning_rate": 5e-5,
        "beta": 0.1,
        "logging_steps": 10,
        "eval_strategy": "steps",
        "eval_steps": 50,
        "save_strategy": "no",
        "bf16": use_bf16,
        "fp16": not use_bf16,
        "report_to": "none",
    }
    return DPOConfig(output_dir=output_dir, **settings)

In [None]:
# --- Train on FULL dataset ---
import gc

# Load the base model in bf16 when the GPU supports it, otherwise fp16.
model_full = AutoModelForCausalLM.from_pretrained(
    MODEL, torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    device_map="auto",
)

trainer_full = DPOTrainer(
    model=model_full,
    args=make_dpo_config("./dpo_full"),
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    processing_class=tokenizer,
    peft_config=lora_config,
)

trainer_full.train()
# Keep only the log history; the model and trainer are released below.
full_logs = trainer_full.state.log_history
print(f"Full dataset training complete — {len(ds_train)} pairs")

# Free GPU memory before the next model is loaded. `del` only drops the names;
# objects still held in reference cycles survive until the garbage collector
# runs, and empty_cache() can only release blocks that are no longer
# referenced, so collect first.
del model_full, trainer_full
gc.collect()
torch.cuda.empty_cache()

In [ ]:
# --- Train on FILTERED dataset ---
import gc

# Load a fresh copy of the base model in bf16 when supported, otherwise fp16.
model_filt = AutoModelForCausalLM.from_pretrained(
    MODEL, torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    device_map="auto",
)

trainer_filt = DPOTrainer(
    model=model_filt,
    args=make_dpo_config("./dpo_filtered"),
    train_dataset=ds_filtered,
    eval_dataset=ds_eval,
    processing_class=tokenizer,
    peft_config=lora_config,
)

trainer_filt.train()
# Keep only the log history; the model and trainer are released below.
filtered_logs = trainer_filt.state.log_history
print(f"Filtered dataset training complete — {len(ds_filtered)} pairs")

# Free GPU memory. `del` only drops the names; objects still held in reference
# cycles survive until the garbage collector runs, and empty_cache() can only
# release blocks that are no longer referenced, so collect first.
del model_filt, trainer_filt
gc.collect()
torch.cuda.empty_cache()

In [None]:
import matplotlib.pyplot as plt

def get_series(logs, key):
    """Return ``(step, value)`` pairs for every log entry that contains ``key``.

    Args:
        logs: Iterable of log-entry dicts; entries containing ``key`` must
            also have a ``"step"`` field.
        key: Metric name to extract.

    Returns:
        List of ``(entry["step"], entry[key])`` tuples in log order.
    """
    points = []
    for entry in logs:
        if key in entry:
            points.append((entry["step"], entry[key]))
    return points

def get_eval_series(logs, key):
    """Return ``(step, value)`` pairs for ``key``, but only for eval metrics.

    Args:
        logs: Iterable of log-entry dicts; entries containing ``key`` must
            also have a ``"step"`` field.
        key: Metric name to extract. Nothing is returned unless the name
            contains the substring ``"eval"``.

    Returns:
        List of ``(entry["step"], entry[key])`` tuples in log order.
    """
    series = []
    for entry in logs:
        if key not in entry:
            continue
        if "eval" not in key:
            continue
        series.append((entry["step"], entry[key]))
    return series

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Label each run with its actual training-set size so the legend stays
# correct if the subset size, hold-out size, or filter changes.
runs = [
    (f"Full ({len(ds_train)})", full_logs),
    (f"Filtered ({len(ds_filtered)})", filtered_logs),
]

# One row per panel: (log key, title, y-axis label, extra plot kwargs).
panels = [
    ("eval_rewards/accuracies", "Eval Reward Accuracy", "Accuracy", {"marker": "o"}),
    ("eval_loss", "Eval Loss", "Loss", {"marker": "o"}),
    ("loss", "Train Loss", "Loss", {"alpha": 0.8}),
]

for ax, (key, title, ylabel, plot_kwargs) in zip(axes, panels):
    has_data = False
    for name, logs in runs:
        data = get_series(logs, key)
        if data:
            steps, vals = zip(*data)
            ax.plot(steps, vals, label=name, **plot_kwargs)
            has_data = True
    ax.set_title(title)
    ax.set_xlabel("Step")
    ax.set_ylabel(ylabel)
    # Skip the legend on an empty panel to avoid matplotlib's
    # "no artists with labels" warning.
    if has_data:
        ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle("DPO Training: Full vs. Preflight-Filtered Dataset", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("comparison.png", dpi=150, bbox_inches="tight")
plt.show()

In [ ]:
# Final metrics summary
def last_eval(logs, key):
    """Return the value of ``key`` from the last log entry that contains it.

    Args:
        logs: Iterable of log-entry dicts.
        key: Metric name to look up.

    Returns:
        The most recent logged value for ``key``, or ``None`` when no entry
        contains it.
    """
    latest = None
    for entry in logs:
        if key in entry:
            latest = entry[key]
    return latest

# Final eval metrics for each run; any of these is None when the run logged
# no entry for that metric.
full_acc = last_eval(full_logs, "eval_rewards/accuracies")
filt_acc = last_eval(filtered_logs, "eval_rewards/accuracies")
full_loss = last_eval(full_logs, "eval_loss")
filt_loss = last_eval(filtered_logs, "eval_loss")


def _fmt_metric(value):
    """Format a metric into an 8-wide column, or 'n/a' when it is missing."""
    return f"{'n/a':>8}" if value is None else f"{value:>8.4f}"


print("=" * 50)
print("RESULTS SUMMARY")
print("=" * 50)
print(f"{'Metric':<30} {'Full':>8} {'Filtered':>8}")
print("-" * 50)
print(f"{'Train samples':<30} {len(ds_train):>8} {len(ds_filtered):>8}")
print(f"{'Removed pairs':<30} {'—':>8} {len(ds_train) - len(ds_filtered):>8}")
# Print a row whenever at least one run has the metric; a missing value on
# either side is shown as 'n/a' instead of raising a TypeError on None.
if full_acc is not None or filt_acc is not None:
    print(f"{'Eval reward accuracy':<30} {_fmt_metric(full_acc)} {_fmt_metric(filt_acc)}")
    # The delta is only meaningful when both runs reported an accuracy.
    if full_acc is not None and filt_acc is not None:
        delta = filt_acc - full_acc
        print(f"{'Δ accuracy':<30} {'':>8} {delta:>+8.4f}")
if full_loss is not None or filt_loss is not None:
    print(f"{'Eval loss':<30} {_fmt_metric(full_loss)} {_fmt_metric(filt_loss)}")
print("=" * 50)

## Interpretation

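**What this shows**: Whether removing preflight-flagged pairs from UltraFeedback before DPO training improves reward accuracy and loss on a held-out eval set. Note that the filtering cell removes only easy and low-contrast pairs; exact and near-duplicates are reported by the audit but are not removed.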

**What to look for**:
- Higher eval reward accuracy for filtered → profiling helps
- Lower eval loss for filtered → better generalization
- If filtered trains on fewer samples but matches or beats full → data quality > data quantity

**Limitations at this scale**:
- Single dataset (UltraFeedback), single model (1.5B), single seed
- Small held-out set (200 pairs) — eval variance is high
- 1 epoch of training — longer training may change the picture

**With more compute, extend to**:
- Multiple datasets (UltraFeedback, Nectar, HH-RLHF)
- Multiple model sizes (1.5B, 7B)
- Multiple seeds + confidence intervals
- MT-Bench / AlpacaEval generation quality eval