### Install Libraries

In [None]:
!pip install -q 'numpy<2' --force-reinstall --no-cache-dir
!pip install transformers==4.39.1
!pip install trl==0.8.6
!pip install datasets==2.18.0
!pip install peft==0.10.0
!pip install torch==2.6.0
!pip install tqdm==4.66.2

In [None]:
import os
# Send signal 9 to this very process, terminating the running kernel.
# Presumably done so the runtime restarts and picks up the packages installed
# above. Note that execution stops here: the cells below must be run again
# manually after the kernel comes back.
os.kill(os.getpid(), 9)

In [None]:
import numpy as np
# Sanity check that the numpy pin from the install cell is the version in use.
print("numpy version:", np.__version__)

numpy version: 1.26.4


In [None]:
import torch
import transformers
import datasets
import trl

# Report the versions actually imported so they can be compared with the pins
# in the install cell.
print("torch version:", torch.__version__)
print("transformers version:", transformers.__version__)
print("datasets version:", datasets.__version__)
print("trl version:", trl.__version__)

torch version: 2.6.0+cu124
transformers version: 4.39.1
datasets version: 2.18.0
trl version: 0.8.6


In [None]:
import os
import json
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline, AutoTokenizer, GPT2Model
from tqdm import tqdm

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, AutoModelForCausalLM
from trl.core import LengthSampler

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
# `torch` itself is imported in the version-check cell above.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


### Prompt Dataset Creation with IMDB dataset

In [None]:
def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    """Build the prompt dataset used for PPO from the train split of a review dataset.

    Args:
        config: Object exposing `model_name`; the tokenizer is loaded from it.
        dataset_name: Hub dataset to load (its `text` column is renamed to `review`).
        input_min_text_length: First argument passed to `LengthSampler`.
        input_max_text_length: Second argument passed to `LengthSampler`.

    Returns:
        The filtered dataset in torch format, with two added columns:
        `input_ids` (the truncated token ids of the review) and `query`
        (those ids decoded back to text).
    """
    prompt_tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    prompt_tokenizer.pad_token = prompt_tokenizer.eos_token

    def is_long_enough(example):
        # len() on a string counts characters, so this keeps reviews longer
        # than 200 characters (not 200 tokens).
        return len(example["review"]) > 200

    reviews = load_dataset(dataset_name, split="train")
    reviews = reviews.rename_columns({"text": "review"})
    reviews = reviews.filter(is_long_enough, batched=False)

    # Called once per review to draw how many leading tokens form the prompt.
    draw_prompt_length = LengthSampler(input_min_text_length, input_max_text_length)

    def to_prompt(example):
        # Encode the full review, then keep only the sampled-length prefix.
        token_ids = prompt_tokenizer.encode(example["review"])
        example["input_ids"] = token_ids[: draw_prompt_length()]
        example["query"] = prompt_tokenizer.decode(example["input_ids"])
        return example

    reviews = reviews.map(to_prompt, batched=False)
    reviews.set_format(type="torch")
    return reviews


def collator(data):
    """Turn a list of per-example dicts into one dict of per-key lists.

    The keys (and their order) are taken from the first example; every example
    is expected to provide all of them. No padding or stacking is performed.
    """
    batch = {}
    for key in data[0]:
        batch[key] = [example[key] for example in data]
    return batch

### Preference Dataset Creation Using TULU dataset

In [None]:
# Load the anthropic_hh split from the TULU dataset
ds_preference = load_dataset("allenai/preference-datasets-tulu", split="anthropic_hh")
# Keep at most 5000 conversations, without asking for more rows than exist.
preference_limited = ds_preference.select(range(min(5000, len(ds_preference))))


def _last_message(messages, role):
    """Return the content of the last message with the given role, or None."""
    return next((msg["content"] for msg in reversed(messages) if msg["role"] == role), None)


preference_dataset = []

for sample in preference_limited:
    chosen = sample["chosen"]
    rejected = sample["rejected"]

    # Use the LAST user/assistant turns rather than the first ones.
    # NOTE(review): in HH-style data the chosen and rejected conversations are
    # expected to share every turn except the final assistant reply, so taking
    # the first assistant message would yield identical "chosen" and
    # "rejected" texts for multi-turn samples — confirm for this split.
    # For single-turn samples first and last coincide, so nothing changes.
    prompt = _last_message(chosen, "user")
    chosen_response = _last_message(chosen, "assistant")
    rejected_response = _last_message(rejected, "assistant")

    # Only add to dataset if all parts exist and the pair is informative:
    # two identical responses always get identical rewards, so such a pair
    # contributes no gradient to the pairwise loss.
    if prompt and chosen_response and rejected_response and chosen_response != rejected_response:
        preference_dataset.append({
            "prompt": prompt,
            "candidate_0": chosen_response,
            "candidate_1": rejected_response,
            "choice": 0  # candidate_0 (chosen) is preferred
        })

print(f"Loaded {len(preference_dataset)} valid preference samples")

In [None]:
# Show one extracted record to verify its keys: prompt, candidate_0, candidate_1, choice.
print(f"preference_dataset[0] is: {preference_dataset[0]}")

preference_dataset[0] is: {'prompt': 'Why did cells originally combine together to create life?', 'candidate_0': 'Because their simple components -- chemicals -- interacted in particular ways.  And because of chemical processes involving acids and bases, certain kinds of chemicals can begin to self-organize into larger structures, like membrane-bounded compartments.  And it’s from those compartments that life eventually emerged.', 'candidate_1': 'Cells combine because they benefit from cooperation, since they can have less competition for resources by working together.', 'choice': 0}


### Preference dataset is converted into comparison dataset

In [None]:
# Re-key every preference record as (prompt, winning_response, losing_response).
# `choice` is the index (0 or 1) of the preferred candidate, so `1 - choice`
# is the index of the other one.
comparison_dataset = [
    {
        "prompt": record["prompt"],
        "winning_response": record[f"candidate_{record['choice']}"],
        "losing_response": record[f"candidate_{1 - record['choice']}"],
    }
    for record in preference_dataset
]

Define Reward Model and Dataset

In [None]:
class RewardModel(nn.Module):
    """Scalar reward model: a pretrained `distilgpt2` body plus a linear head.

    The reward of a sequence is the head applied to the hidden state of the
    sequence's last real (non-padding) token.
    """

    def __init__(self):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained("distilgpt2")
        # Maps one hidden-state vector to a single scalar reward.
        self.head = nn.Linear(self.gpt2.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        """Score a batch of token sequences.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Optional mask with 1 for real tokens and 0 for
                padding. When given, the reward is read from the last position
                whose mask is 1, so padded batches (on either side) score each
                sequence at its own final token. When omitted, the last
                position of every row is used.

        Returns:
            Tensor of shape (batch, 1) with one reward per sequence.
        """
        output = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = output.last_hidden_state

        if attention_mask is None:
            last_hidden = hidden_states[:, -1, :]
        else:
            # mask * position is largest at the last attended position of each
            # row; reading index -1 instead would pick a padding position for
            # every sequence shorter than the longest one in the batch.
            positions = torch.arange(hidden_states.size(1), device=hidden_states.device)
            mask = attention_mask.to(device=hidden_states.device, dtype=positions.dtype)
            last_index = (mask * positions).argmax(dim=1)
            rows = torch.arange(hidden_states.size(0), device=hidden_states.device)
            last_hidden = hidden_states[rows, last_index]

        reward = self.head(last_hidden)
        return reward

In [None]:
class RewardDataset(Dataset):
    """Map-style dataset over comparison records.

    Each record is a mapping with the keys `prompt`, `winning_response` and
    `losing_response`; indexing returns those three values as a tuple, in
    that order.
    """

    _FIELDS = ("prompt", "winning_response", "losing_response")

    def __init__(self, data):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        return tuple(record[field] for field in self._FIELDS)

Train Reward Model

In [None]:
# Tokenizer matching the distilgpt2 body of the reward model. The EOS token
# is reused as the padding token so that batches can be padded.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

# Fresh reward model (pretrained body, new linear head) on the chosen device.
custom_reward_model = RewardModel().to(device)

# Each batch yields (prompts, winners, losers) for 2 comparisons.
train_dataset = RewardDataset(comparison_dataset)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

In [None]:
# AdamW over all parameters of the reward model.
optimizer = torch.optim.AdamW(custom_reward_model.parameters(), lr=2e-5)

for epoch in range(6):
    custom_reward_model.train()
    total_loss = 0

    for prompts, winners, losers in tqdm(train_loader):
        # Number of comparisons in this batch; also the split point below.
        half = len(prompts)

        # Each scored text is "<prompt> <response>".
        texts_winning = [f"{prompt} {winning}" for prompt, winning in zip(prompts, winners)]
        texts_losing = [f"{prompt} {losing}" for prompt, losing in zip(prompts, losers)]

        # Winning and losing texts are tokenized (and padded) together so one
        # forward pass scores both halves.
        batch_inputs = tokenizer(
            texts_winning + texts_losing,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)

        rewards = custom_reward_model(**batch_inputs).squeeze()
        reward_winning, reward_losing = rewards[:half], rewards[half:]

        # Pairwise ranking loss: push the winning reward above the losing one.
        margin = reward_winning - reward_losing
        loss = -F.logsigmoid(margin).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}\n")

 39%|███▉      | 983/2500 [59:56<1:06:13,  2.62s/it]

Save the Trained Reward Model

In [None]:
# Save model weights (state dict only, written to the current working directory)
torch.save(custom_reward_model.state_dict(), "reward_model.pt")

# Save tokenizer files into the reward_tokenizer/ directory
tokenizer.save_pretrained("reward_tokenizer/")

Load the custom reward model and tokenizer

In [None]:
def load_reward_model():
    """Restore the trained reward model and its tokenizer from local files.

    Reads the weights from `reward_model.pt` and the tokenizer from
    `reward_tokenizer/`, both relative to the working directory.

    Returns:
        A `(reward_model, reward_tokenizer)` tuple. The model is on the global
        `device` and in eval mode; the tokenizer pads with its EOS token.
    """
    # Build the architecture first, then overwrite its weights with the saved ones.
    scorer = RewardModel()
    saved_weights = torch.load("reward_model.pt", map_location=device)
    scorer.load_state_dict(saved_weights)
    scorer.to(device).eval()

    scorer_tokenizer = AutoTokenizer.from_pretrained("reward_tokenizer/")
    scorer_tokenizer.pad_token = scorer_tokenizer.eos_token

    return scorer, scorer_tokenizer

In [None]:
# Load once and reuse: both names are passed to evaluate_reward below.
reward_model, reward_tokenizer = load_reward_model()

Test the custom trained reward model

In [None]:
def evaluate_reward(prompt, good_response, bad_response, model, tokenizer, device):
    """Print the reward the model assigns to a good and a bad response.

    Each response is scored as the text "<prompt> <response>". The model is
    moved to `device` and put in eval mode; scoring runs without gradients.
    Nothing is returned — the two rewards are printed with four decimals.
    """
    model.to(device).eval()

    scores = []
    # Good response first, then the bad one, mirroring the printed order.
    for response in (good_response, bad_response):
        encoded = tokenizer(
            f"{prompt} {response}",
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
        with torch.no_grad():
            scores.append(model(**encoded).squeeze().item())

    reward_good, reward_bad = scores

    print(f"Reward for GOOD response: {reward_good:.4f}")
    print(f"Reward for BAD response : {reward_bad:.4f}")

In [None]:
# Example usage: a trained reward model is expected to print a higher reward
# for the factually correct answer than for the incorrect one.
evaluate_reward(
    prompt="What is the purpose of photosynthesis?",
    good_response="Photosynthesis is the process by which plants convert sunlight into energy.",
    bad_response="Photosynthesis is how animals breathe underwater.",
    model=reward_model,
    tokenizer=reward_tokenizer,
    device=device
)

### Load and test the pre-trained reward model

In [None]:
# Load the pre-trained reward model from hugging face
sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
# top_k=None asks for a score for every label; function_to_apply="none"
# requests unnormalised scores (the sample output below contains negative values).
# NOTE: as the sample output below shows, the returned entries are ordered by
# score, not by label — POSITIVE is first for the positive text and second for
# the negative one — so always look a score up by its "label" field rather
# than by position.
sent_kwargs = {"top_k": None, "function_to_apply": "none", "batch_size": 16}

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Test the pre-trained reward model on custom responses

In [None]:
# Print some examples of sentiments generated by the pretrained reward model.
# For a single string the pipeline returns one flat list of {label, score}
# dicts (see the printed output).
text = "this movie was really bad!!"
print(sentiment_pipe(text, **sent_kwargs))

text = "this movie was really good!!"
print(sentiment_pipe(text, **sent_kwargs))

[{'label': 'NEGATIVE', 'score': 2.3350484371185303}, {'label': 'POSITIVE', 'score': -2.726576328277588}]
[{'label': 'POSITIVE', 'score': 2.557039976119995}, {'label': 'NEGATIVE', 'score': -2.294790029525757}]


 ### Setup PPO config

In [None]:
# Setup PPO config
config = PPOConfig(
    model_name="lvwerra/gpt2-imdb",
    learning_rate=1e-6,
    batch_size=16,
    mini_batch_size=4,
    gradient_accumulation_steps=4,
    target_kl=0.1,         # NOTE(review): presumably TRL's early-stopping KL threshold (used only with early_stopping=True), not the adaptive-KL target (`target`) — confirm
    kl_penalty="abs"       # penalise the absolute value of the KL estimate; "kl" is the alternative named here — TODO confirm semantics in PPOConfig
)

# Prompts for PPO: short prefixes of IMDB reviews, tokenized with config.model_name.
prompt_dataset = build_dataset(config)

# Load tokenizer and models
# This rebinds `tokenizer`, which until this cell referred to the distilgpt2
# tokenizer used for reward-model training.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token

# `model` is the policy that will be updated; `ref_model` is a second copy
# loaded from the same checkpoint and passed to the trainer as the reference.
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name).to(device)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name).to(device)




In [None]:
# (rows, columns) of the prompt dataset after filtering and tokenization.
print(f"shape of the dataset is: {prompt_dataset.shape}")

shape of the dataset is: (24895, 4)


In [None]:
# Inspect one record; note that it prints the full review text.
print(f"dataset[0] is: {prompt_dataset[0]}")

dataset[0] is: {'review': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes a

Define PPO Trainer (with Sentiment Reward)

In [None]:
# Initialize PPO trainer
# The trainer builds `ppo_trainer.dataloader` (iterated in the training loop)
# from `prompt_dataset`, batching examples with `collator`, i.e. each batch
# is a dict of per-key lists.
ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=prompt_dataset,
    data_collator=collator
)

 Create the PPOTrainer loop with the pre-trained sentiment reward model

In [None]:
length_sampler = LengthSampler(4, 16)

# Sampling settings shared by every generation call; only "max_new_tokens"
# changes from prompt to prompt.
generation_args = {
    "top_k": 50,
    "top_p": 0.95,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

# tqdm wraps the dataloader itself (not enumerate) so it can show a total.
for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader, desc="PPO Training")):
    query_tensors = batch["input_ids"]

    # --- Rollout: sample one continuation per prompt -----------------------
    response_tensors = []
    for query in query_tensors:
        generation_args["max_new_tokens"] = length_sampler()
        generated = ppo_trainer.generate(query, **generation_args)
        generated = generated.squeeze(0) if generated.dim() == 2 else generated
        # generate() returns prompt + continuation. PPO must only see the
        # continuation as the response, so drop the prompt tokens. Slicing by
        # prompt length stays correct when generation stops early.
        response_tensors.append(generated[query.shape[0]:])

    batch["response"] = [tokenizer.decode(r) for r in response_tensors]
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]

    # --- Reward: probability of the POSITIVE class -------------------------
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = []
    for out in pipe_outputs:
        # The pipeline returns entries ordered by score, so positions do not
        # identify labels; look both logits up by label.
        logit_by_label = {entry["label"]: entry["score"] for entry in out}
        logits = torch.tensor([logit_by_label["NEGATIVE"], logit_by_label["POSITIVE"]])
        rewards.append(F.softmax(logits, dim=0)[1])

    # --- PPO update --------------------------------------------------------
    # Best-effort: a failing batch is reported and skipped rather than
    # aborting the whole run.
    try:
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

        # The trainer reports the KL estimate under "objective/kl"; the key
        # "kl" does not exist, and formatting the resulting None raised on
        # every step.
        kl_value = stats.get("objective/kl")
        kl_value = float(kl_value) if kl_value is not None else None
        reward_mean = torch.stack(rewards).mean().item()

        kl_text = f"{kl_value:.4f}" if kl_value is not None else "n/a"
        print(f"Epoch {epoch}: Reward Mean = {reward_mean:.4f}, KL = {kl_text}")

        # The update has already been applied at this point; a negative KL
        # estimate only suppresses logging for this batch.
        if kl_value is not None and kl_value < 0:
            print(f"Negative KL detected: {kl_value:.4f}, not logging this batch.")
            continue

        ppo_trainer.log_stats(stats, batch, rewards)

    except Exception as e:
        print(f"PPO step error at epoch {epoch}: {e}")
        continue

In [None]:
# Second PPO setup. This rebinds `config`, `model`, `ref_model`, `tokenizer`
# and `ppo_trainer`: the models are loaded again from the base checkpoint, so
# nothing trained by the previous loop carries over into the next one.
# log_with="wandb" presumably requires the wandb package and a login, neither
# of which is set up in the install cell — TODO confirm before running.
config = PPOConfig(
        model_name="lvwerra/gpt2-imdb",
        learning_rate=1.41e-5,
        log_with="wandb",
    )

# This is the model we are going to fine-tune with PPO
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
# This is the reference model (frozen) for the KL divergence
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)

tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token

ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=prompt_dataset, data_collator=collator)

# Overwrites the global `device` string set in the import cell. In the
# single-process case it becomes the integer 0 (first GPU) or "cpu"; later
# cells pass this value to `.to(device)`.
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"

In [None]:
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# The configuration to generate responses (trajectories)
response_generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

# tqdm wraps the dataloader itself (not enumerate) so it can show a total.
for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    #### Phase 1: Get trajectories from the current policy
    # In this case we are only generating the responses, but not computing the log probabilities, which will be computed internally by the PPOTrainer.
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        response_generation_kwargs["max_new_tokens"] = gen_len # Upper bound on the number of tokens to generate (chosen randomly)
        response = ppo_trainer.generate(query, **response_generation_kwargs) # It returns the (query + response) tokens
        # Drop the prompt by its own length. Taking the last `gen_len` tokens
        # instead would pull prompt tokens into the response whenever
        # generation stops before `gen_len` new tokens were produced.
        response_tensors.append(response.squeeze()[query.shape[0]:])
    batch["response"] = [tokenizer.decode(r) for r in response_tensors]

    #### Phase 2: Compute rewards
    # Join the query (prompt) + response (generated tokens)
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    # Compute the reward for each of the texts (query + response)
    # shape: for every text, a list of two {label, score} dicts (POSITIVE and
    # NEGATIVE) ordered by score, so the POSITIVE entry can be at either index.
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)

    # The reward for each text is the score (logit) corresponding to the POSITIVE class,
    # looked up by label rather than by position.
    # shape: A list of scalars, one for each generated response.
    # It means we assign the reward to the whole response (not to each token).
    rewards = [
        torch.tensor(next(entry["score"] for entry in output if entry["label"] == "POSITIVE"))
        for output in pipe_outputs
    ]

    #### Phase 3: calculate the logprobs and then run the PPO update
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

    ppo_trainer.log_stats(stats, batch, rewards)


 Save the Fine-Tuned Model

In [None]:
# Save fine-tuned PPO model weights and tokenizer
# NOTE(review): the directory is named "custom-reward", but the PPO loops in
# this notebook compute rewards with the sentiment pipeline, not with the
# custom reward model trained earlier — consider renaming.
ppo_trainer.model.save_pretrained("ppo-gpt2-custom-reward")
tokenizer.save_pretrained("ppo-gpt2-custom-reward")

Test the models on sample prompts

In [None]:
# Load base model (pre-fine-tune)
# `device` here is whatever the most recently executed setup cell left in the
# global: a "cuda"/"cpu" string, or the integer 0 after the second PPO setup.
base_model = AutoModelForCausalLMWithValueHead.from_pretrained("lvwerra/gpt2-imdb").to(device)
base_tokenizer = AutoTokenizer.from_pretrained("lvwerra/gpt2-imdb")
base_tokenizer.pad_token = base_tokenizer.eos_token

# Load fine-tuned PPO model (with value head) from the directory written by
# the save cell above.
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained("ppo-gpt2-custom-reward").to(device)
ppo_tokenizer = AutoTokenizer.from_pretrained("ppo-gpt2-custom-reward")
ppo_tokenizer.pad_token = ppo_tokenizer.eos_token

In [None]:
# Sample test prompts
test_prompts = [
    "The movie was absolutely amazing because",
    "I didn't enjoy the film because",
    "The story was predictable and",
    "The acting performance was",
    "The cinematography made the film feel",
    "One thing I liked about the movie was",
    "One thing I disliked about the movie was"
]


def generate_text(model, tokenizer, prompt, max_new_tokens=40):
    """Run `model.generate` on `prompt` and return the decoded output text."""
    input_ids = tokenizer(prompt, return_tensors="pt").to(device).input_ids
    output = model.generate(input_ids, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)


def positive_score(text):
    """Return the sentiment pipeline's raw POSITIVE score for `text`.

    The text is passed as a one-element list so the result has the same
    list-of-lists shape as in the PPO loops; for a bare string the pipeline
    returns a flat list of dicts, on which `[0][1]` fails. The score is looked
    up by label because the entries are ordered by score, not by label.
    """
    label_scores = sentiment_pipe([text], **sent_kwargs)[0]
    return next(entry["score"] for entry in label_scores if entry["label"] == "POSITIVE")


# Evaluate both models
print("\n==== Base Model vs Fine-Tuned PPO Model ====\n")
for prompt in tqdm(test_prompts):
    # Base model generation
    base_text = generate_text(base_model, base_tokenizer, prompt)

    # Fine-tuned PPO model generation
    ppo_text = generate_text(ppo_model, ppo_tokenizer, prompt)

    # Get sentiment scores (POSITIVE logit of the full generated text)
    base_score = positive_score(base_text)
    ppo_score = positive_score(ppo_text)

    # Print comparison
    print(f"PROMPT: {prompt}")
    print(f"Base → {base_text} | POS Score: {base_score:.4f}")
    print(f"PPO  → {ppo_text} | POS Score: {ppo_score:.4f}")
    print("-" * 60)