In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [3]:
from typing import List, Tuple

## IMBD

In [5]:
from datasets import load_dataset

def proc(row):
    l = row['label']
    prompt = f"""Question: Does the following movie review have positive or negative in sentiment?

Review: {row['text']}

Answer: The sentiment of the above review is"""
    # 0 is negative, 1 is positive
    chosen = 'positive' if l == 1 else 'negative'
    rejected = 'positive' if l == 0 else 'negative'

    return {
        'prompt': prompt,
        'chosen': chosen,
        'rejected': rejected,
    }

def load_imdb_dpo(N=None):
    dataset = load_dataset("stanfordnlp/imdb", keep_in_memory=False)

    # the dataset is not shuffled by default so
    dataset = dataset.shuffle(seeds=42)

    return dataset.map(proc).select_columns(['prompt', 'chosen', 'rejected'])

dataset = load_imdb_dpo()
dataset

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 50000
    })
})

In [6]:
dataset.push_to_hub('wassname/imdb_dpo')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/wassname/imdb_dpo/commit/516f2ef052d27fbe9ed5b919da840b3292b07823', commit_message='Upload dataset', commit_description='', oid='516f2ef052d27fbe9ed5b919da840b3292b07823', pr_url=None, pr_revision=None, pr_num=None)

## TruthfulQA

In [7]:
from datasets import load_dataset

def load_tqa_dpo(N=None):

    slice = '' if N is None else f'[:{N}]' # https://huggingface.co/docs/datasets/en/loading#slice-splits
    dataset_tqab = load_dataset("EleutherAI/truthful_qa_binary", keep_in_memory=False)
    def proc(row):
        l = row['label']
        return {
            'prompt': row['question'],
            'chosen': row['choices'][l],
            'rejected': row['choices'][~l],
        }
    return dataset_tqab.map(proc).select_columns(['prompt', 'chosen', 'rejected'])


In [8]:
dataset2 = load_tqa_dpo()
dataset2

DatasetDict({
    validation: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 817
    })
})

In [9]:
dataset2.push_to_hub('wassname/truthful_qa_dpo')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/wassname/truthful_qa_dpo/commit/4381bb21536b09d205600064876da6734f7b4e50', commit_message='Upload dataset', commit_description='', oid='4381bb21536b09d205600064876da6734f7b4e50', pr_url=None, pr_revision=None, pr_num=None)