In [1]:
import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

tqdm.pandas()
from transformers import pipeline
from trl import PPOConfig, PPOTrainer

  from .autonotebook import tqdm as notebook_tqdm
2025-04-08 11:08:46.338050: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744110526.356342    4951 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744110526.362389    4951 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744110526.378264    4951 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744110526.378283    4951 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744110526.378285    4951

Reward model and PPO configuration

In [2]:
# Use the first CUDA device when one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# The reward classifier and its tokenizer are loaded from the same checkpoint.
reward_model_name = "argilla/roberta-base-reward-model-falcon-dolly"
reward_pipe = pipeline(
    "text-classification",
    model=reward_model_name,
    tokenizer=reward_model_name,
    truncation=True,
    device=device,
)

# PPO settings: base model name, learning rate, and the logging backend.
config = PPOConfig(model_name="gpt2", learning_rate=1.41e-5, log_with="wandb")

Device set to use cuda:0


Report to wandb

In [3]:
# Start a wandb run for this session; no project or run name is passed here.
import wandb

wandb.init()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mangus27[0m ([33mvaler[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Model and reference model

In [4]:
# The model and the reference model are two separately loaded copies of the
# same pretrained checkpoint, both moved to the selected device.
model, ref_model = (
    AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name).to(device)
    for _copy in range(2)
)

tokenizer = AutoTokenizer.from_pretrained(config.model_name)
# The end-of-sequence token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Load dataset

In [5]:
# Load only the "train" split of the curated Dolly dataset.
dataset = load_dataset(
    path="argilla/databricks-dolly-15k-curated-en",
    split="train",
)

Preprocess dataset

In [6]:
# Prompt layout filled in with str.format(instruction=..., context=..., response=...).
# Sections are separated by one blank line; there is no trailing newline.
template = (
    "### Instruction: {instruction}\n"
    "\n"
    "### Context: {context}\n"
    "\n"
    "### Response: {response}"
)

def formatting_func(sample):
    """Build the prompt text for one dataset record.

    Reads ``sample["new-instruction"]`` and ``sample["new-context"]``. When the
    first instruction status is ``"submitted"``, ``template`` is filled with the
    first instruction value and the first 500 characters of the first context
    value; the response slot is left empty and the result is stripped.

    Returns:
        A dict holding the prompt under both ``"text"`` and ``"input_text"``,
        or ``None`` when the record is not submitted.
    """
    instruction = sample["new-instruction"]
    context = sample["new-context"]

    if instruction["status"][0] != "submitted":
        return None

    text = template.format(
        instruction=instruction["value"][0],
        context=context["value"][0][:500],
        response="",
    ).strip()
    return {"text": text, "input_text": text}


def _is_submitted(sample):
    """Return True when the record's first instruction status is "submitted"."""
    return sample["new-instruction"]["status"][0] == "submitted"


# Filter on status before mapping. A predicate applied after `map` only ever
# receives example dicts (never None), so `x is not None` keeps every row and
# cannot remove records that formatting_func declined to format.
formatted_dataset = dataset.filter(_is_submitted, load_from_cache_file=False).map(
    formatting_func, load_from_cache_file=False
)

def tokenize_func(example):
    """Tokenize one formatted prompt into a PPO query.

    Args:
        example: Record holding the prompt string under ``"text"``.

    Returns:
        A dict with ``"input_ids"`` (the prompt's token ids, truncated to at
        most 50 tokens and not padded) and ``"query"`` (those ids decoded back
        to text).
    """
    # No padding here: the tokenizer's pad token is set to EOS, so padding each
    # prompt to max_length would append EOS ids that become part of the query
    # the model continues from and of the decoded text sent to the reward model.
    encoded = tokenizer(
        example["text"],
        truncation=True,
        max_length=50,
        return_tensors="pt",
    )
    # Index the batch dimension rather than squeeze() so that a one-token
    # prompt still yields a 1-D sequence.
    input_ids = encoded["input_ids"][0]
    return {
        "input_ids": input_ids,
        "query": tokenizer.decode(input_ids),
    }


# Tokenize every formatted record (bypassing the map cache), then switch the
# dataset's output format to torch.
tokenized_dataset = formatted_dataset.map(
    function=tokenize_func,
    load_from_cache_file=False,
)
tokenized_dataset.set_format("torch")

Map: 100%|██████████| 15015/15015 [00:02<00:00, 5553.51 examples/s]
Map: 100%|██████████| 15015/15015 [00:09<00:00, 1555.92 examples/s]


Define generation params and PPOTrainer

In [7]:
# Bounds handed to the LengthSampler that picks a response length per query.
output_min_length, output_max_length = 4, 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Keyword arguments forwarded to generation; EOS is used as the pad token id.
generation_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)


def collator(data):
    """Turn a list of per-example dicts into one dict of per-key lists.

    The key set is taken from the first example, so every example must contain
    those keys. Values are gathered in example order and are not stacked.
    """
    first_example = data[0]
    return {key: [example[key] for example in data] for key in first_example}

# Wire the trainer to the model pair, tokenizer, tokenized queries and the
# list-gathering collator defined above.
ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=tokenized_dataset,
    data_collator=collator,
)



Training loop

In [8]:
# One PPO step per dataloader batch: generate a response for each query, score
# query + response with the reward pipeline, then update the policy.
for batch in tqdm(ppo_trainer.dataloader):
    query_tensors = batch["input_ids"]

    response_tensors = []
    for query in query_tensors:
        # Build per-call kwargs instead of writing max_new_tokens into the
        # shared generation_kwargs dict, so that dict is left unchanged.
        sampled_kwargs = {**generation_kwargs, "max_new_tokens": output_length_sampler()}
        generated = ppo_trainer.generate(query, **sampled_kwargs).squeeze()
        # The output is treated as the query followed by the new tokens.
        # Slicing from len(query) keeps only the new tokens, and gives an empty
        # tensor (rather than the whole sequence, as a [-0:] slice would) when
        # nothing was generated.
        response_tensors.append(generated[len(query):])
    batch["response"] = [tokenizer.decode(r) for r in response_tensors]

    # The reward model scores the decoded query and response concatenated.
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = reward_pipe(texts)
    # NOTE(review): "score" is whatever the pipeline's default post-processing
    # produces; confirm whether raw logits are intended as the reward.
    rewards = [torch.tensor(output["score"]) for output in pipe_outputs]

    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 117/117 [24:34<00:00, 12.60s/it]


Save model and tokenizer

In [9]:
# Write the model and the tokenizer to the same directory; the tokenizer call
# stays last so its return value is what the cell displays.
output_dir = "gpt2-dolly"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('gpt2-dolly/tokenizer_config.json',
 'gpt2-dolly/special_tokens_map.json',
 'gpt2-dolly/vocab.json',
 'gpt2-dolly/merges.txt',
 'gpt2-dolly/added_tokens.json',
 'gpt2-dolly/tokenizer.json')

Generate response with the reference model

In [10]:
# Reload the base tokenizer and reuse EOS as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Fill the shared prompt template, leaving the response slot empty.
prompt_fields = {
    "instruction": "Is a toad a frog?",
    "context": "Both frogs and toads are amphibians in the order Anura, which means \"without a tail.\" Toads are a sub-classification of frogs, meaning that all toads are frogs, but not all frogs are toads.",
    "response": "",
}
inputs = template.format(**prompt_fields).strip()

# Generate up to 10 new tokens with the reference model and decode the result.
encoding = tokenizer([inputs], return_tensors="pt").to(device)
outputs = ref_model.generate(max_new_tokens=10, **encoding)
output_text = tokenizer.decode(outputs[0])
print("*** REFERENCE MODEL ***", output_text, sep="\n")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


*** REFERENCE MODEL ***
### Instruction: Is a toad a frog?

### Context: Both frogs and toads are amphibians in the order Anura, which means "without a tail." Toads are a sub-classification of frogs, meaning that all toads are frogs, but not all frogs are toads.

### Response: The frog is a frog.

### Context


Generate response with the new model

In [11]:
# Reload the saved model and tokenizer from the "gpt2-dolly" directory.
model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2-dolly").to(device)
tokenizer = AutoTokenizer.from_pretrained("gpt2-dolly")
tokenizer.pad_token = tokenizer.eos_token

# Same prompt as the reference-model cell, with the response slot left empty.
prompt_fields = {
    "instruction": "Is a toad a frog?",
    "context": "Both frogs and toads are amphibians in the order Anura, which means \"without a tail.\" Toads are a sub-classification of frogs, meaning that all toads are frogs, but not all frogs are toads.",
    "response": "",
}
inputs = template.format(**prompt_fields).strip()

# Generate up to 10 new tokens with the reloaded model and decode the result.
encoding = tokenizer([inputs], return_tensors="pt").to(device)
outputs = model.generate(max_new_tokens=10, **encoding)
output_text = tokenizer.decode(outputs[0])
print("*** MODEL AFTER PPO FINE-TUNING ***", output_text, sep="\n")

Some weights of the model checkpoint at gpt2-dolly were not used when initializing GPT2LMHeadModel: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


*** MODEL AFTER PPO FINE-TUNING ***
### Instruction: Is a toad a frog?

### Context: Both frogs and toads are amphibians in the order Anura, which means "without a tail." Toads are a sub-classification of frogs, meaning that all toads are frogs, but not all frogs are toads.

### Response: The toad is a small, slender frog that
