# How to train a reward model

In [4]:
from torch import optim
from torch.utils.data import DataLoader

import pytorch_lightning as pl
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

from instruct_goose.reward import RewardModel, PairwiseLoss
from instruct_goose.dataset import PairDataset

**Step 1**: Create a reward model from a pre-trained language model

In [5]:
# Load the tokenizer for the "gpt2" checkpoint and reuse its end-of-sequence
# token as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load the pre-trained causal language model from the same checkpoint.
# NOTE(review): `model_base` is not referenced by any later cell visible in
# this notebook -- confirm whether it is still needed.
model_base = AutoModelForCausalLM.from_pretrained("gpt2")

In [6]:
# Build the reward model from the "gpt2" checkpoint, the same checkpoint name
# the tokenizer was loaded from.
reward_model = RewardModel(checkpoint="gpt2")

**Step 2**: Create a pairwise dataset

In [7]:
# Load the summary-comparison dataset; the "train" split is used further down.
dataset = load_dataset(path="CarperAI/openai_summarize_comparisons")

Using custom data configuration CarperAI--openai_summarize_comparisons-e658a7d8b35ec187
Found cached dataset parquet (/Users/education/.cache/huggingface/datasets/CarperAI___parquet/CarperAI--openai_summarize_comparisons-e658a7d8b35ec187/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Wrap the "train" split and the tokenizer in a PairDataset. In the recorded
# run this step shows a progress bar over 92534 items and takes about 50 s.
pair_dataset = PairDataset(dataset["train"], tokenizer)

100%|██████████| 92534/92534 [00:50<00:00, 1815.22it/s]


In [9]:
# Batch the pairs for training. Shuffling makes each epoch visit the pairs in
# a random order instead of the order they are stored in the dataset.
dataloader = DataLoader(pair_dataset, batch_size=2, shuffle=True)

**Step 3**: Write a training loop

In [10]:
# Loss object handed to LitRewardModel, which calls it as
# loss_func(chosen_rewards, rejected_rewards) in training_step.
pairwise_loss = PairwiseLoss()

In [11]:
class LitRewardModel(pl.LightningModule):
    """Lightning module that trains a reward model on chosen/rejected pairs.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is passed to ``loss_func`` as the rewards.
        loss_func: Callable invoked as
            ``loss_func(chosen_rewards, rejected_rewards)`` that returns the
            training loss.
        lr: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, model, loss_func, lr):
        super().__init__()
        self.model = model
        self.loss_func = loss_func
        self.lr = lr

    def training_step(self, batch, batch_idx: int):
        """Score both sides of each pair and return the pairwise loss.

        Args:
            batch: Four-element sequence unpacked as
                ``(chosen_input_ids, chosen_attention_mask,
                rejected_input_ids, rejected_attention_mask)``.
            batch_idx: Index of the batch (unused).

        Returns:
            The loss computed by ``loss_func`` for this batch.
        """
        (
            chosen_input_ids,
            chosen_attention_mask,
            rejected_input_ids,
            rejected_attention_mask,
        ) = batch

        chosen_rewards = self.model(chosen_input_ids, chosen_attention_mask)
        rejected_rewards = self.model(rejected_input_ids, rejected_attention_mask)

        loss = self.loss_func(chosen_rewards, rejected_rewards)

        # Report the loss through Lightning's logging instead of print():
        # printing once per step floods the notebook output and the value
        # never reaches the trainer's logger or progress bar.
        self.log("train_loss", loss, prog_bar=True)

        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over the wrapped model's parameters."""
        return optim.Adam(self.model.parameters(), lr=self.lr)

In [12]:
# Bundle the reward model, the pairwise loss and the learning rate into the
# Lightning module defined above.
lit_model = LitRewardModel(
    model=reward_model,
    loss_func=pairwise_loss,
    lr=1e-3,
)

In [12]:
# Train for a single epoch and log metrics on every step.
trainer = pl.Trainer(
    max_epochs=1,
    log_every_n_steps=1,
)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


In [14]:
# Start training the Lightning module on the pairwise dataloader.
trainer.fit(
    model=lit_model,
    train_dataloaders=dataloader,
)


  | Name      | Type         | Params
-------------------------------------------
0 | model     | RewardModel  | 124 M 
1 | loss_func | PairwiseLoss | 0     
-------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.762   Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]