# Data preparation for DPO training & evaluation

In [1]:
from datasets import load_dataset, DatasetDict, concatenate_datasets
import re
import numpy as np
from transformers import AutoTokenizer
import pandas as pd


In [2]:
# Load dataset (assumes argilla dataset structure)
dataset = load_dataset("argilla/ultrafeedback-binarized-preferences", split="train")

len(dataset)

63619

In [3]:
dataset.column_names

['source',
 'instruction',
 'chosen_response',
 'rejected_response',
 'chosen_avg_rating',
 'rejected_avg_rating',
 'chosen_model']

In [4]:
# Define STEM-related keywords

stem_keywords = [
    "physics", "chemistry", "biology", "math", "equation", "algebra", "calculus", "derive", "derivative",
    "integration", "integrate", "binary", "c++", "java", "python", "javascript", "neural", "network", "data",
    "algorithm", "complexity", "statistics", "probability", "molecule", "force", "energy", "mass", "quantum", 
    "code", "program", "machine learning", "AI", "pressure",
    "programming", "equation", "mathematics", "compute", "code", "coding", "differentiate", "differential",
    "matrix", "vector", "tensor", "function", "logarithm", "geometry", "angle", "triangle", "circle",
    "pi", "sine", "cosine", "loop", "variable", "recursion", "iteration",
    "class", "inheritance", "transformer", "dataset", "optimizer", "gradient", "loss function",
    "backpropagation", "rust", "typescript", "RAM", "computer"
]

# Filter only STEM-related prompts
def is_stem(example):
    text = example["instruction"].lower()
    return any(keyword in text for keyword in stem_keywords)

In [5]:
stem_dataset = dataset.filter(is_stem)

In [6]:
len(stem_dataset)

29495

In [7]:
def format_for_dpo(example):
    return {
        "prompt": example["instruction"],
        "chosen": example["chosen_response"],
        "rejected": example["rejected_response"],
        "dataset": example["source"]
    }

In [8]:


dpo_dataset = stem_dataset.map(format_for_dpo)

# Drop unneeded fields
dpo_dataset = dpo_dataset.remove_columns([col for col in dpo_dataset.column_names if col not in {"prompt", "chosen", "rejected", "dataset"}])

# Optional: save to disk or check size
print(f"Filtered dataset has {len(dpo_dataset)} examples.")
# dpo_dataset.save_to_disk("stem_dpo_dataset")

Filtered dataset has 29495 examples.


In [9]:
dpo_dataset.column_names

['prompt', 'chosen', 'rejected', 'dataset']

In [12]:
model_name = "Qwen/Qwen3-0.6B-Base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

text_columns = ['prompt', 'chosen', 'rejected']

def describe_prompt_tokens():
    for col in text_columns:
        max_tokens = max(len(tokenizer(text)["input_ids"]) for text in dpo_dataset[col])
        print(f"Max token count in column '{col}': {max_tokens} tokens")
    
    for col in text_columns:
        mean_tokens = np.mean([len(tokenizer(text)["input_ids"]) for text in dpo_dataset[col]])
        print(f"Mean token count in column '{col}': {mean_tokens} tokens")

In [68]:
token_lengths = {}
for col in text_columns:
    token_lengths[col] = [len(tokenizer(text)["input_ids"]) for text in dpo_dataset[col]]

token_lengths = pd.DataFrame(token_lengths)

In [69]:
token_lengths.describe()

Unnamed: 0,prompt,chosen,rejected
count,29495.0,29495.0,29495.0
mean,234.702967,324.410205,249.840007
std,284.670196,252.921766,229.513862
min,4.0,0.0,0.0
25%,59.0,93.0,59.0
50%,129.0,295.0,184.0
75%,300.0,503.0,382.0
max,5629.0,1339.0,1241.0


In [13]:
max_size = 1024
def is_small_enough(example):
    prompt_size = len(tokenizer(example['prompt'])["input_ids"])
    completion_max_size = max(len(tokenizer(example['rejected'])["input_ids"]), len(tokenizer(example['chosen'])["input_ids"]))
    
    return prompt_size + completion_max_size <= max_size

In [14]:
dataset_max_length = dpo_dataset.filter(is_small_enough)

In [15]:
len(dataset_max_length)

26616

In [74]:
# Initial split: train+valid (90%) and test (10%)
split_dataset = dataset_max_length.train_test_split(test_size=0.1, seed=42)

# Further split train+valid into train (89%) and valid (11%) of the original dataset
# So, validation will be ~10% of original dataset
train_valid = split_dataset["train"].train_test_split(test_size=0.1111, seed=42)

# 3️⃣ Final dataset dict
split_dataset = DatasetDict({
    "train": train_valid["train"],
    "valid": train_valid["test"],
    "test": split_dataset["test"]
})

In [75]:
print(len(split_dataset["train"]))
print(len(split_dataset["valid"]))
print(len(split_dataset["test"]))

21292
2662
2662


In [76]:
dataset_name = f"argilla-maxsize{max_size}"
split_dataset.push_to_hub(f"lindsaybordier/{dataset_name}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/627 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/lindsaybordier/argilla-maxsize1024/commit/b4e78767bcc92d7240849920a25814fd87320a27', commit_message='Upload dataset', commit_description='', oid='b4e78767bcc92d7240849920a25814fd87320a27', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lindsaybordier/argilla-maxsize1024', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lindsaybordier/argilla-maxsize1024'), pr_revision=None, pr_num=None)

## Remove test data from keyword filtered

## Push Model for Milestone 2

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Current repo name on the Hugging Face Hub
old_repo = "lindsaybordier/Qwen3-0.6B-DPO_argilla_ultrafeedback-binarized-preferences_keywords-filtered"

# New repo name (what you want to push it as)
new_repo = "MNLP_M2_dpo_model"

# Load model and tokenizer from the old repo
model = AutoModelForCausalLM.from_pretrained(
    old_repo,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(old_repo)

# Push to Hub under the new name
model.push_to_hub(f"lindsaybordier/{new_repo}")
tokenizer.push_to_hub(f"lindsaybordier/{new_repo}")

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lindsaybordier/MNLP_M2_dpo_model/commit/29d7ffb5c83dc87d4c9c98224feedb235c01bf7b', commit_message='Upload tokenizer', commit_description='', oid='29d7ffb5c83dc87d4c9c98224feedb235c01bf7b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/lindsaybordier/MNLP_M2_dpo_model', endpoint='https://huggingface.co', repo_type='model', repo_id='lindsaybordier/MNLP_M2_dpo_model'), pr_revision=None, pr_num=None)

## Add math dataset

In [16]:
math_dataset = load_dataset("xinlai/Math-Step-DPO-10K", split="train")

len(math_dataset)

10795

In [17]:
math_dataset.column_names

['dataset',
 'prompt',
 'initial_reason_steps',
 'chosen',
 'rejected',
 'full_chosen',
 'full_rejected',
 'answer']

In [18]:
def prepend_initial_reasoning(example):
    reasoning = example["initial_reason_steps"]
    example["chosen"] = reasoning + " " + example["chosen"]
    example["rejected"] = reasoning + " " + example["rejected"]
    return example

In [19]:
# Apply the function to the dataset (train, valid, test splits if needed)
math_dataset = math_dataset.map(prepend_initial_reasoning)

In [20]:
# Drop unneeded fields
math_dataset = math_dataset.remove_columns([col for col in math_dataset.column_names if col not in {"prompt", "chosen", "rejected", "dataset"}])

In [21]:
math_dataset.column_names

['dataset', 'prompt', 'chosen', 'rejected']

In [22]:
def format_math_for_dpo(example):
    return {
        "prompt": example["prompt"],
        "chosen": example["chosen"],
        "rejected": example["rejected"],
        "dataset": example["dataset"]
    }

In [23]:
math_dataset.map(format_math_for_dpo)

Dataset({
    features: ['dataset', 'prompt', 'chosen', 'rejected'],
    num_rows: 10795
})

In [24]:
math_dataset.column_names

['dataset', 'prompt', 'chosen', 'rejected']

In [65]:
math_token_lengths = {}
text_columns = ['prompt', 'chosen', 'rejected']
for col in text_columns:
    math_token_lengths[col] = [len(tokenizer(text)["input_ids"]) for text in math_dataset[col]]

math_token_lengths = pd.DataFrame(math_token_lengths)

In [66]:
math_token_lengths.describe()

Unnamed: 0,prompt,chosen,rejected
count,10795.0,10795.0,10795.0
mean,50.83057,202.054284,200.486799
std,29.075922,131.600274,137.512208
min,7.0,19.0,21.0
25%,33.0,109.0,106.0
50%,47.0,173.0,171.0
75%,62.0,261.0,257.0
max,717.0,1799.0,1763.0


In [25]:
math_dataset_max_length = math_dataset.filter(is_small_enough)

In [26]:
print(len(math_dataset_max_length))
print(math_dataset_max_length.column_names)

10733
['dataset', 'prompt', 'chosen', 'rejected']


## Add programming dataset

In [27]:
prog_dataset = load_dataset("Vezora/Code-Preference-Pairs", split="train")

len(prog_dataset)

54024

In [28]:
dataset.column_names

['source',
 'instruction',
 'chosen_response',
 'rejected_response',
 'chosen_avg_rating',
 'rejected_avg_rating',
 'chosen_model']

In [29]:
def format_prog_for_dpo(example):
    return {
        "prompt": example["input"],
        "chosen": example["accepted"],
        "rejected": example["rejected"],
        "dataset": "Code-Preference-Pairs"
    }

In [30]:
prog_dataset = prog_dataset.map(format_prog_for_dpo)

In [32]:
# Drop unneeded fields
prog_dataset = prog_dataset.remove_columns([col for col in prog_dataset.column_names if col not in {"prompt", "chosen", "rejected", "dataset"}])

In [33]:
prog_dataset_max_length = prog_dataset.filter(is_small_enough)

Filter:   0%|          | 0/54024 [00:00<?, ? examples/s]

In [34]:
print(len(prog_dataset_max_length))
print(prog_dataset_max_length.column_names)

52115
['rejected', 'prompt', 'chosen', 'dataset']


In [35]:
# 1️⃣ Shuffle each dataset
dataset_max_length = dataset_max_length.shuffle(seed=42)
math_dataset_max_length = math_dataset_max_length.shuffle(seed=42)
prog_dataset_max_length = prog_dataset_max_length.shuffle(seed=42)

# 2️⃣ Take 10k samples from each
dataset_max_length = dataset_max_length.select(range(10000))
math_dataset_max_length = math_dataset_max_length.select(range(10000))
prog_dataset_max_length = prog_dataset_max_length.select(range(10000))

# 3️⃣ Merge them
merged_dataset = concatenate_datasets([dataset_max_length, math_dataset_max_length, prog_dataset_max_length])

# 4️⃣ Shuffle the merged dataset
merged_dataset = merged_dataset.shuffle(seed=42)

# 5️⃣ Split into train/valid/test (e.g., 80/10/10 split)
split_dataset = merged_dataset.train_test_split(test_size=0.2, seed=42)
valid_test_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)

final_dataset = DatasetDict({
    "train": split_dataset["train"],
    "valid": valid_test_split["train"],
    "test": valid_test_split["test"]
})

In [36]:
final_dataset.column_names

{'train': ['prompt', 'chosen', 'rejected', 'dataset'],
 'valid': ['prompt', 'chosen', 'rejected', 'dataset'],
 'test': ['prompt', 'chosen', 'rejected', 'dataset']}

In [37]:
final_dataset_name = f"dpo_final_dataset_{max_size}"
final_dataset.push_to_hub(f"lindsaybordier/{final_dataset_name}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/773 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/lindsaybordier/dpo_final_dataset_1024/commit/6f507f2b20bd90da2db8db54a8be040e50be3f6f', commit_message='Upload dataset', commit_description='', oid='6f507f2b20bd90da2db8db54a8be040e50be3f6f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lindsaybordier/dpo_final_dataset_1024', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lindsaybordier/dpo_final_dataset_1024'), pr_revision=None, pr_num=None)

3000

## Push model for Milestone 3

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Current repo name on the Hugging Face Hub
old_repo = "lindsaybordier/Qwen3-0.6B-DPO_argilla_ultrafeedback-binarized-preferences_keywords-filtered"

# New repo name (what you want to push it as)
new_repo = "MNLP_M2_dpo_model"

# Load model and tokenizer from the old repo
model = AutoModelForCausalLM.from_pretrained(
    old_repo,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(old_repo)

# Push to Hub under the new name
model.push_to_hub(f"lindsaybordier/{new_repo}")
tokenizer.push_to_hub(f"lindsaybordier/{new_repo}")