# Imports

In [None]:
from datasets import load_dataset, DatasetDict
import os
import json

# Load Raw Dataset

In [2]:
def load_raw_dataset(dataset_name: str = "argilla/ultrafeedback-binarized-preferences-cleaned"):
    """Load a dataset by name through ``load_dataset``.

    Args:
        dataset_name: Identifier forwarded unchanged to ``load_dataset``.

    Returns:
        The object returned by ``load_dataset(dataset_name)``. For the default
        name, the notebook output shows a ``DatasetDict`` with a single
        ``train`` split.
    """
    raw_dataset = load_dataset(dataset_name)
    return raw_dataset

# Split Dataset

In [3]:
def split_dataset(raw_ds, train_frac: float = 0.8, eval_frac: float = 0.1, test_frac: float = 0.1, seed: int = 42) -> DatasetDict:
    """Split ``raw_ds["train"]`` into train / eval / test subsets.

    The split is done with two successive ``train_test_split`` calls that share
    the same ``seed``: the first carves off ``train_frac`` of the rows, the
    second divides the remainder between eval and test.

    Args:
        raw_ds: Mapping with a ``"train"`` entry that provides
            ``train_test_split(train_size=..., seed=...)``.
        train_frac: Fraction of all rows assigned to the train split.
        eval_frac: Fraction of all rows assigned to the eval split.
        test_frac: Fraction of all rows assigned to the test split.
        seed: Seed passed to both ``train_test_split`` calls.

    Returns:
        A ``DatasetDict`` with keys ``"train"``, ``"eval"`` and ``"test"``.

    Raises:
        ValueError: If any fraction is not strictly between 0 and 1, or if the
            three fractions do not sum to 1. Without this check the eval/test
            fractions would silently be treated as a ratio of the remainder
            instead of as fractions of the whole dataset.
    """
    fractions = {"train_frac": train_frac, "eval_frac": eval_frac, "test_frac": test_frac}
    for label, value in fractions.items():
        if not 0.0 < value < 1.0:
            raise ValueError(f"{label} must be strictly between 0 and 1, got {value!r}")

    total = train_frac + eval_frac + test_frac
    # Tolerance absorbs float rounding such as 0.7 + 0.2 + 0.1 != 1.0 exactly.
    if abs(total - 1.0) > 1e-6:
        raise ValueError(
            f"train_frac + eval_frac + test_frac must sum to 1, got {total!r}"
        )

    # The dataset has only 'train', so split that.
    first_cut = raw_ds["train"].train_test_split(train_size=train_frac, seed=seed)
    train_ds, holdout_ds = first_cut["train"], first_cut["test"]

    # Share of the held-out rows that goes to eval; the rest becomes test.
    relative_eval = eval_frac / (eval_frac + test_frac)
    second_cut = holdout_ds.train_test_split(train_size=relative_eval, seed=seed)

    return DatasetDict({"train": train_ds, "eval": second_cut["train"], "test": second_cut["test"]})

# Process Splits for DPO Alignment

In [5]:
def process_split(split_ds) -> list:
    """Flatten preference examples into prompt / chosen / rejected records.

    For every example the prompt is copied as-is, and the ``chosen`` and
    ``rejected`` values are replaced by the ``content`` of the message at
    index 1 of the respective list.

    Args:
        split_ds: Iterable of examples. Each example must provide ``"prompt"``
            plus ``"chosen"`` and ``"rejected"`` sequences whose element at
            index 1 is a mapping with a ``"content"`` key.

    Returns:
        A list with one dict per example, each holding the keys ``"prompt"``,
        ``"chosen"`` and ``"rejected"``.

    Raises:
        KeyError: If an example or message lacks one of the keys read here.
        IndexError: If a ``chosen`` / ``rejected`` list has fewer than two
            messages.
    """
    return [
        {
            "prompt": ex["prompt"],
            "chosen": ex["chosen"][1]["content"],
            "rejected": ex["rejected"][1]["content"],
        }
        for ex in split_ds
    ]

# Save Processed Splits to Disk

In [6]:
def save_splits(processed_splits: dict, output_dir: str = "./dpo_data") -> None:
    """Write each split to ``<output_dir>/<split_name>.jsonl``, one JSON object per line.

    The output directory is created if it does not exist. Files are written as
    UTF-8 with non-ASCII characters kept unescaped, and a summary line is
    printed for every split.

    Note: the default ``output_dir`` here ("./dpo_data") differs from the
    default ``data_dir`` of ``load_for_dpo`` ("./dpo_processed_data"), so pass
    the directory explicitly when using the two together.

    Args:
        processed_splits: Mapping of split name to a sized collection of
            JSON-serialisable records.
        output_dir: Directory that receives the ``.jsonl`` files.
    """
    os.makedirs(output_dir, exist_ok=True)
    for split_name in processed_splits:
        records = processed_splits[split_name]
        path = os.path.join(output_dir, f"{split_name}.jsonl")
        with open(path, "w", encoding="utf-8") as f:
            # One serialised record per line (JSON Lines).
            f.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)
        print(f"Saved {len(records)} examples to {path}")

# Load Processed Splits from Disk

In [7]:
def load_for_dpo(data_dir: str = "./dpo_processed_data") -> DatasetDict:
    """Load the train / eval / test JSONL files found in ``data_dir``.

    Args:
        data_dir: Directory expected to contain ``train.jsonl``,
            ``eval.jsonl`` and ``test.jsonl``.

    Returns:
        The result of ``load_dataset("json", data_files=...)`` with the splits
        named ``train``, ``eval`` and ``test``.
    """
    split_files = (
        ("train", "train.jsonl"),
        ("eval", "eval.jsonl"),
        ("test", "test.jsonl"),
    )
    files = {split: os.path.join(data_dir, filename) for split, filename in split_files}
    return load_dataset("json", data_files=files)

# Observe Data

In [8]:
# Load the dataset with the default name and display the returned object.
raw_ds = load_raw_dataset()
raw_ds

DatasetDict({
    train: Dataset({
        features: ['source', 'prompt', 'chosen', 'chosen-rating', 'chosen-model', 'rejected', 'rejected-rating', 'rejected-model'],
        num_rows: 60917
    })
})

In [9]:
# Show the prompt of the first raw training example.
raw_ds['train'][0]['prompt']

'Can you write a C++ program that prompts the user to enter the name of a country and checks if it borders the Mediterranean Sea? Here\'s some starter code to help you out:\n#include <iostream>\n#include <string>\nusing namespace std;\nint main() {\n    string country;\n    // prompt user for input\n    cout << "Enter the name of a country: ";\n    cin >> country;\n    // check if country borders the Mediterranean Sea\n    // [C++ code]\n    return 0;\n}'

In [10]:
# Show the message at index 1 of the first example's `chosen` list
# (the same index that process_split reads).
raw_ds['train'][0]['chosen'][1]

{'content': 'Here\'s a C++ program that prompts the user to enter the name of a country and checks if it borders the Mediterranean Sea:\n\n#include <iostream>\n#include <string>\n#include <set>\n#include <map>\n#include <algorithm>\n\nusing namespace std;\n\nint main() {\n    // store countries and their bordering seas in a map\n    map<string, set<string>> countries;\n    countries["Algeria"] = {"Mediterranean Sea", "North African Coast"};\n    countries["France"] = {"Mediterranean Sea", "English Channel"};\n    countries["Italy"] = {"Mediterranean Sea", "Adriatic Sea"};\n    countries["Malta"] = {"Mediterranean Sea", "North African Coast"};\n    countries["Spain"] = {"Mediterranean Sea", "Atlantic Ocean"};\n    countries["Tunisia"] = {"Mediterranean Sea", "North African Coast"};\n\n    string country;\n    cout << "Enter the name of a country: ";\n    cin >> country;\n\n    // check if country is in the map\n    if (countries.find(country)!= countries.end()) {\n        // check if co

In [11]:
# Show the message at index 1 of the first example's `rejected` list
# (the same index that process_split reads).
raw_ds['train'][0]['rejected'][1]

{'content': 'Sure, here is the program using the C++11 algorithm "cds::algorithm::GreaterEqual":\n#include <iostream>\n#include <string>\n#include <algorithm>\n#include <vector>\n#include <cctype>\n\nusing namespace std;\n\nint main() {\n    string country;\n    cout << "Enter the name of a country: ";\n    cin >> country;\n    std::vector<string> vec;\n    vec.push_back(country);\n    size_t index = std::find_if(vec.begin(), vec.end(), [](const string& s) {\n        return std::any_of(s.begin(), s.end(), [](const char& c) {\n            return c == \'}}\';\n    });\n    if (index != vec.end()) {\n        if ((*index)[0] == \'M\') {\n            cout << "Country is bordered by the Mediterranean Sea." << endl;\n        } else {\n            cout << "Country does not border the Mediterranean Sea." << endl;\n        }\n    } else {\n        cout << "Country is not found." << endl;\n    }\n    return 0;\n}',
 'role': 'assistant'}

In [12]:
# Split the raw data using the default fractions and seed, then display the result.
splits = split_dataset(raw_ds)
splits

DatasetDict({
    train: Dataset({
        features: ['source', 'prompt', 'chosen', 'chosen-rating', 'chosen-model', 'rejected', 'rejected-rating', 'rejected-model'],
        num_rows: 48733
    })
    eval: Dataset({
        features: ['source', 'prompt', 'chosen', 'chosen-rating', 'chosen-model', 'rejected', 'rejected-rating', 'rejected-model'],
        num_rows: 6092
    })
    test: Dataset({
        features: ['source', 'prompt', 'chosen', 'chosen-rating', 'chosen-model', 'rejected', 'rejected-rating', 'rejected-model'],
        num_rows: 6092
    })
})

In [13]:
splits = {name: process_split(ds) for name, ds in splits.items()}
len(splits)

3

In [14]:
len(splits['train'])

48733

In [15]:
save_splits(splits, output_dir="./dpo_processed_data")

Saved 48733 examples to ./dpo_processed_data/train.jsonl
Saved 6092 examples to ./dpo_processed_data/eval.jsonl
Saved 6092 examples to ./dpo_processed_data/test.jsonl


In [16]:
# Reload the JSONL files from the dpo_processed_data directory and display the resulting splits.
dpo_ds = load_for_dpo(data_dir='dpo_processed_data')
dpo_ds

Generating train split: 48733 examples [00:00, 213636.08 examples/s]
Generating eval split: 6092 examples [00:00, 201970.56 examples/s]
Generating test split: 6092 examples [00:00, 152419.16 examples/s]


DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 48733
    })
    eval: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 6092
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 6092
    })
})

In [17]:
# Display the reloaded train split on its own.
dpo_ds['train']

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 48733
})

In [18]:
# Display the first reloaded training record with all of its fields.
dpo_ds['train'][0]

{'prompt': 'Write a short story set in the 1920s, featuring a hardboiled detective tasked with solving a grisly murder at a theater festival in New York City. Incorporate clues within the setting and dialogue, and keep the tone mysterious and suspenseful throughout the narrative.',
 'chosen': 'It was a sweltering summer evening in New York City, the summer of 1920. A theater festival was taking place in the city, showcasing plays and musicals from all over the country. The festival was a hub of creativity and excitement, with crowds of people pouring in every night to catch the latest performance.\nBut amidst the glamour and excitement, a grisly murder occurred. A young actress was found dead backstage, her body battered and bruised beyond recognition. The police were called and the theater was quickly shut down, as detectives set out to solve the gruesome crime.\nEnter our hardboiled detective, Jack Ryan, a grizzled and no-nonsense investigator with a penchant for solving the toughest

In [19]:
# Display only the prompt of the first reloaded training record.
dpo_ds['train'][0]['prompt']

'Write a short story set in the 1920s, featuring a hardboiled detective tasked with solving a grisly murder at a theater festival in New York City. Incorporate clues within the setting and dialogue, and keep the tone mysterious and suspenseful throughout the narrative.'

In [20]:
# Display only the chosen response of the first reloaded training record.
dpo_ds['train'][0]['chosen']

'It was a sweltering summer evening in New York City, the summer of 1920. A theater festival was taking place in the city, showcasing plays and musicals from all over the country. The festival was a hub of creativity and excitement, with crowds of people pouring in every night to catch the latest performance.\nBut amidst the glamour and excitement, a grisly murder occurred. A young actress was found dead backstage, her body battered and bruised beyond recognition. The police were called and the theater was quickly shut down, as detectives set out to solve the gruesome crime.\nEnter our hardboiled detective, Jack Ryan, a grizzled and no-nonsense investigator with a penchant for solving the toughest cases. With his trusty detective agency by his side, Ryan was tasked with getting to the bottom of the murder.\nRyan arrived at the theater just as the police were finishing up their investigation. The backstage area was a chaotic mess, with actors and production crew members milling about, l

In [21]:
# Display only the rejected response of the first reloaded training record.
dpo_ds['train'][0]['rejected']

"The sound of the busy theater district could be heard in the distance, as the detective made their way through the throngs of people. The air was thick with the scent of popcorn and excitement as the crowds made their way to the theater festival. But as the detective approached, their ears were met with the sound of a woman's piercing scream. As they weaved through the sea of people, they discovered the source of the scream - a dead body lying in the middle of the popcorn-covered floor. The detective had a job cut out for them, as the clues were scarce but pointed towards the possibility of a larger conspiracy. With their sharp wit and deductive reasoning, the detective set out to solve the crime and uncover the truth. But as darkness crept upon the festival, the mystery seemed to deepen, and the detective's resolve was put to the test. Who was the murderer, and what was their motive? Was it all part of a larger plot? The detective needed to act fast in order to crack the case before 

In [None]:
# In the reloaded data "prompt", "chosen" and "rejected" are plain strings
# (process_split stores only the message content), so they are printed directly;
# calling .keys() on them would raise AttributeError.
example = dpo_ds["train"][3]
for field in ("prompt", "chosen", "rejected"):
    print(example[field])
    print('\n')