# DPO Data Quality Validation Experiment
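This notebook tests whether preflight's data quality profiling improves DPO training outcomes: it audits a 5,000-pair preference dataset, removes the pairs the audit flags as "easy", and compares DPO training on the full dataset against training on the filtered one.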

**Setup:** Run on Google Colab with T4/A100 GPU runtime.

In [None]:
!pip install preflight sentence-transformers trl transformers datasets peft accelerate bitsandbytes -q

## Step 1 - Load Dataset

In [None]:
from datasets import load_dataset

# Take the first 5000 rows of the train_prefs split as the experiment's dataset.
ds = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split="train_prefs[:5000]")
print(f"Loaded {len(ds)} preference pairs")

# Export for preflight as JSONL: one object per line, carrying only the
# prompt / chosen / rejected fields, in the same order as the rows of `ds`.
import json
with open("ultrafeedback_5k.jsonl", "w") as jsonl_file:
    for example in ds:
        record = {
            "prompt": example["prompt"],
            "chosen": example["chosen"],
            "rejected": example["rejected"],
        }
        jsonl_file.write(json.dumps(record) + "\n")

## Step 2 - Run Preflight Audit

In [None]:
!preflight audit ultrafeedback_5k.jsonl --output audit_report.json

import json
with open("audit_report.json") as f:
    report = json.load(f)

print("\n=== Key Findings ===")
for rec in report.get("recommendations", []):
    print(f"  - {rec}")

## Step 3 - Create Filtered Dataset

In [None]:
import numpy as np

# Row positions the audit flagged as "easy". A report without an
# "easy_pairs" -> "indices" entry yields an empty set, i.e. nothing is removed.
# NOTE(review): assumes the indices are 0-based line positions in the exported
# JSONL (which was written in `ds` order) -- confirm against preflight's output.
easy_indices = set(report.get("easy_pairs", {}).get("indices", []))
print(f"Removing {len(easy_indices)} easy pairs")
if not easy_indices:
    # With nothing flagged, both training runs would see identical data and
    # the comparison below would say nothing about filtering.
    print("WARNING: the audit report lists no easy pairs; the filtered dataset equals the full dataset")

# Positions to keep, in original order.
keep_indices = [i for i in range(len(ds)) if i not in easy_indices]

from datasets import Dataset
# select() subsets by position on the existing dataset, so rows are not copied
# into a Python list and rebuilt (which would re-infer the schema from the
# data and cannot infer any columns when the list is empty).
ds_filtered: Dataset = ds.select(keep_indices)
print(f"Filtered dataset: {len(ds_filtered)} pairs (from {len(ds)})")

## Step 4 - DPO Training (Full vs Filtered)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer
from peft import LoraConfig
import torch

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the EOS token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Precision selection. bf16 is native only on CUDA devices with compute
# capability >= 8 (Ampere and newer, e.g. A100). On older CUDA GPUs such as the
# T4, hard-coding bf16 either fails or runs through slow emulation depending on
# the installed library versions, so fp16 is used there instead.
# Without CUDA, and on Ampere+, the settings are the original bf16 ones.
use_fp16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8
model_dtype = torch.float16 if use_fp16 else torch.bfloat16

# LoRA config for memory efficiency: adapters on the attention q/v projections only.
lora_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

# Training config. Effective batch size per device is 2 * 4 = 8 pairs per
# optimizer step; nothing is checkpointed (save_strategy="no").
training_args = DPOConfig(
    output_dir="./dpo_full",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    beta=0.1,
    logging_steps=10,
    save_strategy="no",
    bf16=not use_fp16,
    fp16=use_fp16,
    report_to="none",
)

# Load the base weights in the same precision the trainer will compute in.
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=model_dtype, device_map="auto",
)

trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    processing_class=tokenizer,
    peft_config=lora_config,
)

trainer.train()
print("Full dataset training complete")
# Keep the trainer's whole log history (not just a final value); the comparison
# cell reads the "loss" and "rewards/accuracies" entries from it.
full_metrics = trainer.state.log_history

In [None]:
# Reset model: drop the first run's model and trainer so the second run starts
# from fresh base weights.
import gc

del model, trainer
# Collect first: memory still referenced through reference cycles is not
# released by empty_cache() until the objects themselves are collected.
gc.collect()
torch.cuda.empty_cache()

# Load in the precision the shared training config computes in, so the model
# dtype always matches training_args rather than being hard-coded.
model_dtype = torch.bfloat16 if training_args.bf16 else torch.float16
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=model_dtype, device_map="auto",
)

# Reuse the first run's hyperparameters unchanged; only the output directory differs.
training_args.output_dir = "./dpo_filtered"

trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_filtered,
    processing_class=tokenizer,
    peft_config=lora_config,
)

trainer.train()
print("Filtered dataset training complete")
# Whole log history of the filtered run, read by the comparison cell.
filtered_metrics = trainer.state.log_history

## Step 5 - Compare Results

In [None]:
import matplotlib.pyplot as plt

def extract_metric(logs, key):
    """Collect ``(step, value)`` pairs for one metric from a list of log dicts.

    Args:
        logs: Iterable of dict log entries.
        key: Metric name to look up in each entry.

    Returns:
        A list of ``(entry["step"], entry[key])`` tuples, in log order, for
        every entry that contains ``key``. Entries without ``key`` are skipped.

    Raises:
        KeyError: If an entry contains ``key`` but has no ``"step"`` field.
    """
    points = []
    for entry in logs:
        if key in entry:
            points.append((entry["step"], entry[key]))
    return points

# Two side-by-side panels: reward accuracy on the left, training loss on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Each run is drawn as one labelled line per panel.
runs = [("Full", full_metrics), ("Filtered", filtered_metrics)]
# (log key, panel title) for each panel, left to right.
panels = [
    ("rewards/accuracies", "Reward Accuracy"),
    ("loss", "Training Loss"),
]

for ax, (metric_key, panel_title) in zip(axes, panels):
    for run_label, run_logs in runs:
        points = extract_metric(run_logs, metric_key)
        if not points:
            continue  # this run logged no values for the metric; draw nothing
        xs, ys = zip(*points)
        ax.plot(xs, ys, label=run_label)
    ax.set_title(panel_title)
    ax.set_xlabel("Step")
    ax.legend()

# Save the figure to disk before showing it.
plt.tight_layout()
plt.savefig("comparison.png", dpi=150)
plt.show()

## Results

| Metric | Full Dataset | Filtered Dataset |
|--------|-------------|------------------|
| Final reward accuracy | _fill in_ | _fill in_ |
| Final loss | _fill in_ | _fill in_ |
| Training time | _fill in_ | _fill in_ |
| Dataset size | 5000 | _fill in_ |

### Key Finding
_Fill in after running: did filtering improve DPO training?_