In [1]:
import torch
import wandb
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import RewardConfig, RewardTrainer

  from .autonotebook import tqdm as notebook_tqdm
2025-04-08 13:41:53.917898: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744119713.936352    7191 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744119713.942486    7191 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744119713.958535    7191 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744119713.958549    7191 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744119713.958552    7191

Training hyperparameters

In [2]:
# Hyperparameters for this run; the same dict is recorded on the W&B run below.
config = dict(
    learning_rate=5e-5,
    epochs=3,
    batch_size=16,
    max_length=512,
)

# Start a Weights & Biases run in the "trl" project with the hyperparameters attached.
wandb.init(config=config, project="trl")

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mangus27[0m ([33mvaler[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Load dataset

In [3]:
# Preference dataset from the Hugging Face Hub; later cells read its
# "train"/"test" splits and the "prompt"/"chosen"/"rejected" fields.
dataset_name = "trl-lib/lm-human-preferences-descriptiveness"
prepared_dataset = load_dataset(dataset_name)

Model and tokenizer

In [4]:
model_name = "distilroberta-base"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch before loading: the classification head is newly initialized
# rather than read from the checkpoint, so without a seed its starting
# weights would change on every run.
torch.manual_seed(42)

# num_labels=1 gives a single-logit head whose output is used as the reward score.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fallback for tokenizers that ship without a padding token: reuse EOS, and
# take the id from the tokenizer so the model config and tokenizer always agree
# (model.config.eos_token_id is not guaranteed to be set or to match).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Data preprocessing

In [5]:
def formatting_func(examples):
    """Tokenize one preference pair into the chosen/rejected inputs for reward training.

    Args:
        examples: A single dataset row (the function is mapped without
            batching) whose "prompt", "chosen" and "rejected" fields are
            concatenated as strings.

    Returns:
        dict with "input_ids_chosen", "attention_mask_chosen",
        "input_ids_rejected" and "attention_mask_rejected"; each value is the
        tokenizer output for "<prompt>\\n<response>", padded and truncated to
        config["max_length"].
    """

    def _tokenize(response):
        # Prepend the prompt and a line break to the response, then tokenize.
        # Calling the tokenizer directly replaces the deprecated `encode_plus`
        # and avoids building a batch-of-one tensor only to index it with [0].
        # NOTE(review): padding to max_length is kept as in the original; it may
        # be unnecessary if the trainer's collator pads per batch — confirm.
        return tokenizer(
            examples["prompt"] + "\n" + response,
            padding="max_length",
            truncation=True,
            max_length=config["max_length"],
        )

    tokens_chosen = _tokenize(examples["chosen"])
    tokens_rejected = _tokenize(examples["rejected"])

    return {
        "input_ids_chosen": tokens_chosen["input_ids"],
        "attention_mask_chosen": tokens_chosen["attention_mask"],
        "input_ids_rejected": tokens_rejected["input_ids"],
        "attention_mask_rejected": tokens_rejected["attention_mask"],
    }

# Apply the tokenization to every row (one example per call, no batching).
formatted_dataset = prepared_dataset.map(function=formatting_func)

RewardConfig and RewardTrainer

In [6]:
# Training arguments for the reward model. Every tunable value comes from
# `config` so that what is logged to W&B is what the trainer actually uses.
reward_config = RewardConfig(
    output_dir='./reward_model',
    max_length=config["max_length"],
    # Previously omitted: the trainer silently used the library default
    # while config["learning_rate"] was only logged.
    learning_rate=config["learning_rate"],
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["batch_size"],
    logging_dir='./logs',
    logging_steps=200,
    report_to="wandb",
)

class PatchedRewardTrainer(RewardTrainer):
    """RewardTrainer whose ``compute_loss`` accepts and ignores extra keyword arguments."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Delegate to ``RewardTrainer.compute_loss`` without forwarding ``**kwargs``.

        Only ``model``, ``inputs`` and ``return_outputs`` reach the parent
        implementation; any other keyword argument is dropped.
        """
        # NOTE(review): presumably a workaround for a caller that passes keyword
        # arguments the parent signature does not accept — confirm against the
        # installed trl/transformers versions.
        parent_compute_loss = super().compute_loss
        return parent_compute_loss(model, inputs, return_outputs=return_outputs)

# Collect the trainer arguments first, then build the patched trainer from them.
trainer_kwargs = {
    "model": model,
    "args": reward_config,
    "train_dataset": formatted_dataset["train"],
    "eval_dataset": formatted_dataset["test"],
    "tokenizer": tokenizer,
}
trainer = PatchedRewardTrainer(**trainer_kwargs)

  super().__init__(


Train the model

In [7]:
# Run training; the returned summary is displayed as the cell output.
train_output = trainer.train()
train_output

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
200,0.5533
400,0.4747
600,0.4276
800,0.2953




TrainOutput(global_step=936, training_loss=0.4167597925561106, metrics={'train_runtime': 305.8447, 'train_samples_per_second': 48.966, 'train_steps_per_second': 3.06, 'total_flos': 0.0, 'train_loss': 0.4167597925561106, 'epoch': 3.0})

Compute the score after training

In [8]:
# Derive the final checkpoint directory from the run itself instead of
# hard-coding the step count, which changes with dataset size, batch size
# and number of epochs.
# Like the original path, this assumes a checkpoint was written at the last step.
model_path = f"{reward_config.output_dir}/checkpoint-{trainer.state.global_step}"
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

def get_score(model, tokenizer, prompt, response):
    """Return the reward model's scalar score for ``response`` given ``prompt``.

    The text is built as "<prompt>\\n<response>", the same layout used when
    tokenizing the training pairs, and is padded/truncated to
    config["max_length"].

    Args:
        model: Sequence-classification model whose output exposes ``.logits``
            holding a single value.
        tokenizer: Tokenizer matching ``model``.
        prompt: Prompt text.
        response: Candidate response text to score.

    Returns:
        float: The single logit produced by the model.
    """
    encoded = tokenizer(
        prompt + "\n" + response,
        truncation=True,
        padding="max_length",
        max_length=config["max_length"],
        return_tensors="pt",
    )

    # Follow the device of the model that was passed in rather than a global,
    # so the function also works for a model living on a different device.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Score in eval mode (no dropout) and restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    return logits.item()

# Pick one held-out example. Indexing the row first reads a single record
# instead of materializing three entire columns just to take one element each.
example_index = 10
test_example = prepared_dataset["test"][example_index]

prompt = test_example["prompt"]
print(prompt)
example_less_pref_response = test_example["rejected"]
example_preferred_response = test_example["chosen"]

He fell on his back, his nose and lips bleeding. 
"Isn't it a beautiful sight? The destruction. Just look how it illuminated the sky. It is a real work of art," Ember said as she paced towards Auron.


Score for less preferred response

In [9]:
# Reward-model score for the rejected (less preferred) response.
score = get_score(
    model=loaded_model,
    tokenizer=tokenizer,
    prompt=prompt,
    response=example_less_pref_response,
)
print(example_less_pref_response, score, sep="\n")

 
"It's a beautiful sight, but I don't need your help.
-3.0786616802215576


Score for preferred response

In [10]:
# Reward-model score for the chosen (preferred) response.
score = get_score(
    model=loaded_model,
    tokenizer=tokenizer,
    prompt=prompt,
    response=example_preferred_response,
)
print(example_preferred_response, score, sep="\n")

 
"Do you know what this is?" Auron asked in a trembling voice.
1.829581618309021
