In [5]:
import os,sys
import torch
import torch.nn as nn
import transformers
import datasets
import pandas as pd

In [2]:
ds = datasets.load_dataset("SoftAge-AI/rlhf-ranking_dataset")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 502/502 [00:00<00:00, 18117.48 examples/s]


In [71]:
df = pd.DataFrame(ds['train'])
df = df[df['Preference']!='Tied']
df = df[df['Preference']!='FLAG']
df = df[df['Preference']!='Flag']
df = df[df['Preference']!='Tie']
len(df)

345

In [72]:
def prepare_dataset(row):
    r1 = row['Response 1']
    r2 = row['Response 2']
    pref = row['Preference']
    row['instruction']=row['Prompt']
    if pref=='Response 1':
        row['chosen_response'] = r1
        row['rejected_response'] = r2
    elif pref=='Response 2':
        row['chosen_response'] = r2
        row['rejected_response'] = r1
    else:
        print(pref)
    return row
df_rlhf = df[['Prompt','Response 1','Response 2','Preference']]
df_rlhf['chosen_response']=None
df_rlhf['rejected_response']=None
df_rlhf['instruction']=None
df_rlhf = df_rlhf.apply(prepare_dataset,axis=1).drop(index=0)
df_rlhf.reset_index(drop=True,inplace=True)
df_rlhf = df_rlhf[['instruction','chosen_response','rejected_response']]

None


In [74]:
df_rlhf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   instruction        344 non-null    object
 1   chosen_response    344 non-null    object
 2   rejected_response  344 non-null    object
dtypes: object(3)
memory usage: 8.2+ KB


# Reward modelling

In [84]:
import random
import pandas as pd
from operator import itemgetter
import torch
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
import warnings
warnings.filterwarnings('ignore')
from datasets import Dataset, load_dataset
from transformers import AutoModelForSequenceClassification,AutoTokenizer,TrainingArguments
from trl import RewardTrainer,RewardConfig

cuda


In [64]:
#Model
model_name = "distilroberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [75]:
def formatting_func(examples):
    kwargs = {"padding": "max_length", "truncation": True, "max_length": 512, "return_tensors": "pt"}
    prompt_plus_chosen_response = examples["instruction"] + "\n" + examples["chosen_response"]
    prompt_plus_rejected_response = examples["instruction"] + "\n" + examples["rejected_response"]
    tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs)
    tokens_rejected = tokenizer.encode_plus(prompt_plus_rejected_response, **kwargs)
    return {
        "input_ids_chosen": tokens_chosen["input_ids"][0], "attention_mask_chosen": tokens_chosen["attention_mask"][0],
        "input_ids_rejected": tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0]
    }
rlhf_dataset = Dataset.from_pandas(df_rlhf)
formatted_dataset = rlhf_dataset.map(formatting_func)
formatted_dataset = formatted_dataset.train_test_split()

Map: 100%|██████████| 344/344 [00:00<00:00, 842.00 examples/s] 


In [None]:
# Configuring the training arguments
training_args = TrainingArguments(
    output_dir="./reward_model",
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    logging_steps=1,
    num_train_epochs = 10,
    report_to=None,
)
config = RewardConfig(
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    num_train_epochs=10,
    evaluation_strategy="steps",
    gradient_accumulation_steps=4,
    output_dir="reward_model",
    logging_steps=20,
)
# Loading the RewardTrainer from TRL
trainer = RewardTrainer(
    model=model,
    args=config,
    processing_class=tokenizer,
    train_dataset=formatted_dataset["train"],
    eval_dataset=formatted_dataset["test"],
)
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
20,0.2634,0.738462,0.670588
40,0.185,0.776742,0.658824
60,0.149,0.771021,0.729412
80,0.1073,0.899277,0.670588
