In [3]:
%pip uninstall datasets -y
%pip install datasets>=3.0.0
!pip install torchdata==0.6.0
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    peft==0.3.0 --quiet

# Installing the Reinforcement Learning library directly from github.
%pip install git+https://github.com/lvwerra/trl.git@25fa1bd 
%pip install huggingface_hub[hf_xet]
%pip install openai

Found existing installation: datasets 2.11.0
Uninstalling datasets-2.11.0:
  Successfully uninstalled datasets-2.11.0
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Load a Pretrained Summarization Model

In [43]:
SFT_MODEL_PATH = "KookyGhost/GPT2-small-summarization"
model = AutoModelForCausalLM.from_pretrained(SFT_MODEL_PATH).to("cuda")  # or "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id 

### Turn Dataset into the Appropriate Format

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer

def build_dataset(model_name,
                  dataset_name,
                  input_min_text_length, 
                  input_max_text_length):
    """
    Preprocess the dataset and return train/valid/test splits with input_ids.

    Parameters:
    - model_name (str): Name or path of the tokenizer/model.
    - dataset_name (str): Name of the Hugging Face dataset.
    - input_min_text_length (int): Minimum character length of prompt.
    - input_max_text_length (int): Maximum character length of prompt.

    Returns:
    - dataset (datasets.DatasetDict): Tokenized dataset with train/valid/test splits.
    """

    # Load all splits
    dataset = load_dataset(dataset_name)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def preprocess(split):
        # Filter by character length of prompt
        split = split.filter(
            lambda x: input_min_text_length < len(x["prompt"]) <= input_max_text_length,
            batched=False
        )

        def tokenize(sample):
            prompt = f"{sample['prompt']}\n\n"
            inputs = tokenizer(prompt, truncation=True, max_length=1024)

            sample["input_ids"] = inputs["input_ids"]
            sample["query"] = tokenizer.decode(inputs["input_ids"], skip_special_tokens=True)
            return sample

        split = split.map(tokenize, batched=False)
        split.set_format(type="torch")
        return split

    dataset["train"] = preprocess(dataset["train"])
    dataset["valid"] = preprocess(dataset["valid"])
    dataset["test"]  = preprocess(dataset["test"])

    return dataset


In [6]:
dataset = build_dataset(
    model_name="gpt2",
    dataset_name="CarperAI/openai_summarize_tldr",
    input_min_text_length=200,
    input_max_text_length=1000
)

Map: 100%|██████████| 1246/1246 [00:00<00:00, 1250.77 examples/s]


In [7]:
# dataset = load_dataset("CarperAI/openai_summarize_tldr") 
dataset['train']

Dataset({
    features: ['prompt', 'label', 'input_ids', 'query'],
    num_rows: 22252
})

In [44]:
# train_prompts

### Check Trainable Parameters and Make Sure Reference Model is Not Trainable

In [8]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"\ntrainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [40]:
# ! git clone https://huggingface.co/z7ye/peft-dialogue-summary-checkpoint

In [9]:
from trl import AutoModelForCausalLMWithValueHead
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(SFT_MODEL_PATH,                                                               
                                                               torch_dtype=torch.bfloat16,
                                                               is_trainable=True)

print(f'PPO model parameters to be updated (ValueHead + 769 params):\n{print_number_of_trainable_model_parameters(ppo_model)}\n')
print(ppo_model.v_head)

PPO model parameters to be updated (ValueHead + 769 params):

trainable model parameters: 124440577
all model parameters: 124440577
percentage of trainable model parameters: 100.00%

ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=768, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


In [10]:
ref_model = create_reference_model(ppo_model)
print(f'Reference model parameters to be updated:\n{print_number_of_trainable_model_parameters(ref_model)}\n')

Reference model parameters to be updated:

trainable model parameters: 0
all model parameters: 124440577
percentage of trainable model parameters: 0.00%



### Load Reward Model from OpenAssistant

In [11]:
rw_model_name = "OpenAssistant/reward-model-deberta-v3-large-v2"
rw_tokenizer = AutoTokenizer.from_pretrained(rw_model_name)
rw_model = AutoModelForSequenceClassification.from_pretrained(rw_model_name)
rw_model.to("cuda")
print(rw_model.config.id2label)



{0: 'LABEL_0'}


### Define Reward Evaluation Function for PPO

In [34]:
def evaluate_normalized_reward_score(model, 
                                     reward_model, 
                                     reward_tokenizer, 
                                     dataset, 
                                     tokenizer, 
                                     num_samples):
    """
    Generate responses from the model and normalize their reward by reference label score.
    """
    max_new_tokens = 100
    rewards = []

    reward_model.eval()

    for i, sample in tqdm(enumerate(dataset), total=min(num_samples, len(dataset))):
        if i >= num_samples:
            break

        input_text = sample["query"]
        reference_summary = sample["label"]

        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

        generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_k=0,
            top_p=1.0,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

        # Generate model output
        response_token_ids = model.generate(input_ids=input_ids, generation_config=generation_config)
        generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens=True)

        # Prepare full inputs
        full_gen_input = f"{input_text}\n{generated_text}"
        full_ref_input = f"{input_text}\n{reference_summary}"

        # Tokenize both
        gen_inputs = rw_tokenizer(full_gen_input, return_tensors="pt", truncation=True, max_length=512).to("cuda")
        ref_inputs = rw_tokenizer(full_ref_input, return_tensors="pt", truncation=True, max_length=512).to("cuda")

        with torch.no_grad():
            gen_logits = reward_model(**gen_inputs).logits
            ref_logits = reward_model(**ref_inputs).logits

            gen_reward = sigmoid(gen_logits).item()
            ref_reward = sigmoid(ref_logits).item()

            if ref_reward == 0:
                normalized_reward = 0.0
            else:
                normalized_reward = gen_reward /ref_reward  # or: gen_reward - ref_reward

        rewards.append(normalized_reward)

    return np.mean(rewards), np.std(rewards)


In [59]:
mean, std = evaluate_normalized_reward_score(
    model, rw_model, rw_tokenizer, dataset["test"], tokenizer, num_samples=100
)
print(f"Mean normalized reward: {mean:.4f} ± {std:.4f}")


100%|██████████| 100/100 [00:35<00:00,  2.82it/s]

Mean normalized reward: -0.0335 ± 0.0497





### Define PPO Configuration and Start Training

In [13]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

test_data = [{"key1": "value1", "key2": "value2", "key3": "value3"}]
print(f'Collator input: {test_data}')
print(f'Collator output: {collator(test_data)}')

Collator input: [{'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}]
Collator output: {'key1': ['value1'], 'key2': ['value2'], 'key3': ['value3']}


In [14]:
learning_rate=1.41e-5
max_ppo_epochs=1
mini_batch_size=4
batch_size=16

config = PPOConfig(
    model_name="gpt2",    
    learning_rate=learning_rate,
    ppo_epochs=max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size,
)

ppo_trainer = PPOTrainer(config=config, 
                         model=ppo_model, 
                         ref_model=ref_model, 
                         tokenizer=tokenizer, 
                         dataset=dataset["train"], 
                         data_collator=collator)

In [35]:
import torch
from torch.nn.functional import sigmoid
from tqdm import tqdm

output_min_length = 100
output_max_length = 512
output_length_sampler = LengthSampler(output_min_length, output_max_length)

generation_kwargs = {
    "min_length": 5,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True
}

max_ppo_steps = 12

# Make sure reward_model and reward_tokenizer are loaded
# e.g.
# reward_model = AutoModelForSequenceClassification.from_pretrained(...).to("cuda")
# reward_tokenizer = AutoTokenizer.from_pretrained(...)

for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    if step >= max_ppo_steps:
        break

    prompt_tensors = batch["input_ids"]
    summary_tensors = []

    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()
        generation_kwargs["max_new_tokens"] = max_new_tokens

        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)
        summary_tensors.append(summary.squeeze()[-max_new_tokens:])

    batch["response"] = [tokenizer.decode(r, skip_special_tokens=True) for r in summary_tensors]

    # --- Compute rewards using reward model ---
    reward_texts = [q + "\n" + r for q, r in zip(batch["query"], batch["response"])]

    reward_tensors = []
    for text in reward_texts:
        inputs = rw_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to('cuda')
        with torch.no_grad():
            logits = rw_model(**inputs).logits
            reward = sigmoid(logits).item()
        reward_tensors.append(torch.tensor(reward))

    # --- PPO update ---
    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)

    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    print("-" * 80)


1it [00:05,  5.12s/it]

objective/kl: -68.79667663574219
ppo/returns/mean: 3.9242851734161377
ppo/policy/advantages_mean: -1.270785610074654e-08
--------------------------------------------------------------------------------


2it [00:10,  5.07s/it]

objective/kl: -60.34709167480469
ppo/returns/mean: 3.8933277130126953
ppo/policy/advantages_mean: 9.148468649300412e-09
--------------------------------------------------------------------------------


3it [00:14,  4.95s/it]

objective/kl: -69.88340759277344
ppo/returns/mean: 4.242911338806152
ppo/policy/advantages_mean: -4.601772740642218e-09
--------------------------------------------------------------------------------


4it [00:20,  5.07s/it]

objective/kl: -72.64474487304688
ppo/returns/mean: 4.386337757110596
ppo/policy/advantages_mean: 2.00761807178651e-08
--------------------------------------------------------------------------------


5it [00:24,  4.85s/it]

objective/kl: -65.13738250732422
ppo/returns/mean: 4.216862678527832
ppo/policy/advantages_mean: 1.7648416061888383e-08
--------------------------------------------------------------------------------


6it [00:29,  5.00s/it]

objective/kl: -60.092926025390625
ppo/returns/mean: 4.051173210144043
ppo/policy/advantages_mean: -7.311508287699553e-09
--------------------------------------------------------------------------------


7it [00:35,  5.02s/it]

objective/kl: -69.19317626953125
ppo/returns/mean: 4.194902420043945
ppo/policy/advantages_mean: -2.8518353900608417e-08
--------------------------------------------------------------------------------


8it [00:39,  5.00s/it]

objective/kl: -70.78736114501953
ppo/returns/mean: 4.426096439361572
ppo/policy/advantages_mean: -9.977169312946899e-09
--------------------------------------------------------------------------------


9it [00:44,  4.97s/it]

objective/kl: -71.92752075195312
ppo/returns/mean: 4.503462314605713
ppo/policy/advantages_mean: 9.902186626220555e-09
--------------------------------------------------------------------------------


10it [00:49,  4.91s/it]

objective/kl: -68.01797485351562
ppo/returns/mean: 4.629611015319824
ppo/policy/advantages_mean: 7.552273473265814e-09
--------------------------------------------------------------------------------


11it [00:54,  4.74s/it]

objective/kl: -66.93280029296875
ppo/returns/mean: 4.590643882751465
ppo/policy/advantages_mean: 2.4668612041978122e-08
--------------------------------------------------------------------------------


12it [00:59,  4.92s/it]

objective/kl: -71.49988555908203
ppo/returns/mean: 4.566149711608887
ppo/policy/advantages_mean: -5.170743833105007e-09
--------------------------------------------------------------------------------





### Save Model

In [63]:
ppo_trainer.model.save_pretrained("ppo_model")
ppo_trainer.tokenizer.save_pretrained("ppo_model")

('ppo_model/tokenizer_config.json',
 'ppo_model/special_tokens_map.json',
 'ppo_model/vocab.json',
 'ppo_model/merges.txt',
 'ppo_model/added_tokens.json',
 'ppo_model/tokenizer.json')

### Evaluate Reward Score after Training

In [18]:
evaluate_reward_score(ppo_model, 
                      rw_model, 
                      rw_tokenizer, 
                      dataset['test'], 
                      tokenizer, 
                      num_samples=20)

100%|██████████| 20/20 [00:05<00:00,  3.59it/s]


(np.float64(0.25533432024531066), np.float64(0.13090977081572575))

### Compare Summary Generated Before and After Training

In [45]:
print("ref_model device:", next(ref_model.parameters()).device)
print("ppo_model device:", next(ppo_model.parameters()).device)


ref_model device: cuda:0
ppo_model device: cpu


In [20]:
ppo_model = ppo_model.to("cuda")
ref_model = ref_model.to("cuda")

In [36]:
from tqdm import tqdm
import torch
from torch.nn.functional import sigmoid

batch_size = 20
compare_results = {}

df_batch = dataset["test"][:batch_size]
compare_results["query"] = df_batch["query"]
prompt_tensors = df_batch["input_ids"]

# Get devices for each model
device_ref = next(ref_model.parameters()).device
device_ppo = next(ppo_model.parameters()).device
device_reward = next(rw_model.parameters()).device

summary_tensors_ref = []
summary_tensors = []

# Generate responses from reference model and PPO model
for i in tqdm(range(batch_size)):
    gen_len = output_length_sampler()
    generation_kwargs["max_new_tokens"] = gen_len

    input_tensor_ref = torch.tensor(prompt_tensors[i], device=device_ref).unsqueeze(0)
    input_tensor_ppo = torch.tensor(prompt_tensors[i], device=device_ppo).unsqueeze(0)

    summary_ref = ref_model.generate(input_ids=input_tensor_ref, **generation_kwargs).squeeze()[-gen_len:]
    summary_ppo = ppo_model.generate(input_ids=input_tensor_ppo, **generation_kwargs).squeeze()[-gen_len:]

    summary_tensors_ref.append(summary_ref)
    summary_tensors.append(summary_ppo)

# Decode responses
compare_results["response_before"] = [tokenizer.decode(s.cpu(), skip_special_tokens=True) for s in summary_tensors_ref]
compare_results["response_after"]  = [tokenizer.decode(s.cpu(), skip_special_tokens=True) for s in summary_tensors]

# Compute reward scores using your reward model
texts_before = [q + "\n" + r for q, r in zip(compare_results["query"], compare_results["response_before"])]
texts_after  = [q + "\n" + r for q, r in zip(compare_results["query"], compare_results["response_after"])]

def get_rewards(texts):
    rewards = []
    for text in texts:
        inputs = rw_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device_reward)
        with torch.no_grad():
            logits = rw_model(**inputs).logits
            reward = sigmoid(logits).item()
        rewards.append(reward)
    return rewards

compare_results["reward_before"] = get_rewards(texts_before)
compare_results["reward_after"] = get_rewards(texts_after)


  input_tensor_ref = torch.tensor(prompt_tensors[i], device=device_ref).unsqueeze(0)
  input_tensor_ppo = torch.tensor(prompt_tensors[i], device=device_ppo).unsqueeze(0)
100%|██████████| 20/20 [00:08<00:00,  2.22it/s]


In [45]:
compare_results["response_before"][0]

"SUBREDDIT: r/relationships\nTITLE: The girl [26 F] I [22 M] have been seeing for a month didn't respond to me at all yesterday while hanging out with a friend [~30? M].\nPOST: She gets terrible service while at her house, but I texted her 3 times yesterday, 4-5 hours apart. She didn't call me until early this morning and left a voicemail that she was busy all day with a friend who showed up out of the blue.\n\nI saw that she posted a picture of the two of them out of her dead zone house on facebook before I texted her the last time.\n\nI don't mind that she hangs out with friends, and I know it's pretty early in the relationship, but am I wrong to be a little annoyed that she didn't respond until 24 hours after my first text?\nTL;DR: \n\n for some reason, ok, I texted her after I got a lot of texts late, she didn't respond even had I called on my final date. Thoughts?"

In [46]:
compare_results["response_after"][0]

"SUBREDDIT: r/relationships\nTITLE: The girl [26 F] I [22 M] have been seeing for a month didn't respond to me at all yesterday while hanging out with a friend [~30? M].\nPOST: She gets terrible service while at her house, but I texted her 3 times yesterday, 4-5 hours apart. She didn't call me until early this morning and left a voicemail that she was busy all day with a friend who showed up out of the blue.\n\nI saw that she posted a picture of the two of them out of her dead zone house on facebook before I texted her the last time.\n\nI don't mind that she hangs out with friends, and I know it's pretty early in the relationship, but am I wrong to be a little annoyed that she didn't respond until 24 hours after my first text?\nTL;DR: \n\n I saw her hanging out with a girl that she wasn't at. Now read this."

### Let a More Intelligent Model (GPT-4o) be the Judge 

In [22]:
from openai import OpenAI

#client = OpenAI(api_key='your api key")

In [23]:
def gpt4_judge(query, response_a, response_b, client, model="gpt-4o"):
    system_prompt = (
        "You are a helpful and fair assistant. You will be shown a prompt and two responses. "
        "Your job is to judge which response is better, based only on helpfulness, accuracy, and coherence."
    )

    user_prompt = f"""### Summarize this Reddit Post:
{query}

### Response A:
{response_a}

### Response B:
{response_b}

Which response is better? Reply with 'A' or 'B' and explain briefly why."""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.0
    )

    return response.choices[0].message.content


In [37]:
judgments = []

for q, r_before, r_after in zip(compare_results["query"],
                                compare_results["response_before"],
                                compare_results["response_after"]):
    
    result = gpt4_judge(q, r_before, r_after, client)
    judgments.append(result)
    print(result)
    print("\n-----------")# optional

Response A is better. Although it is not perfectly coherent, it attempts to summarize the situation by mentioning the lack of response and the timing of the texts. Response B, on the other hand, is incoherent and does not provide a meaningful summary or address the main points of the original post.

-----------
B

Response B is better because it attempts to summarize the original post, even though it is not entirely coherent. It captures the essence of the situation: the discovery of the wife's affair and the husband's current dilemma. Response A, on the other hand, introduces unrelated and confusing information about an ex-husband, which is not relevant to the original post.

-----------
B is the better response. It accurately reflects the original post's concern about whether not engaging in sexual activities is a bad thing, without introducing unrelated or unclear elements like "proselydering relationship" found in A. Response B is more coherent and directly addresses the poster's q

In [31]:
def extract_preference_from_text(judgment):
    judgment = judgment.lower()
    if "response a is better" in judgment:
        return "A"
    elif "response b is better" in judgment:
        return "B"
    else:
        return "Neither"

In [41]:
preferred = [extract_preference_from_text(j) for j in judgments[:100]]

ppo_wins = preferred.count("B")
print(f"GPT-4o preferred PPO model in {ppo_wins} out of {len(preferred)} comparisons.")

GPT-4o preferred PPO model in 12 out of 20 comparisons.


In [39]:
from collections import Counter
print(Counter(preferred)) 

Counter({'B': 12, 'A': 6, 'Neither': 2})
