In [1]:
# uncomment below if you want to use customized attention module for attention weights extraction before softmax
from transformers.models.qwen3 import modeling_qwen3
from modeling_qwen_attention import Qwen3Attention_v1
modeling_qwen3.Qwen3Attention = Qwen3Attention_v1
from reward_model import OutcomeRewardModel
from torch.utils.data import Dataset, DataLoader

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

dataset = None

# Load Qwen3 tokenizer and model
model_name = "Qwen/Qwen3-0.6B"
model_path = "/home/students/wli/UniHeidelberg/semster2/final_projects/models/Qwen3-0.6B-Base"
model_path = model_name
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map = "auto", attn_implementation="eager")  # Specify eager for attention before softmax

# tokenizer.padding_side = "left"
# Make sure tokenizer has pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [3]:
# test the behavior of LLM before fine tuing
prompt = "Sentence: a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films. Label:"
# prompts = ["test prompt how old are ", "test prompt 2 hello world"]

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# You can control max_length or early stopping
output_ids = model.generate(
    **inputs,
    max_new_tokens=10,  # adjust based on expected label length
    do_sample=False,  # greedy decoding
    pad_token_id=tokenizer.pad_token_id
)


# Decode generated tokens
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Sentence: a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films. Label: 1930s. The label is


# test model

In [4]:
orm = OutcomeRewardModel(model)

In [5]:
orm(**tokenizer(["i am a good man", "what are you doing, there are a lot of things to do "], return_tensors = 'pt', padding = True).to(orm.device))

tensor([2.4266, 3.3440], device='cuda:0', grad_fn=<SqueezeBackward1>)

In [6]:
from datasets import load_dataset

ds = load_dataset("Anthropic/hh-rlhf")

In [None]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from tqdm import tqdm
import gc

# Configuration
epoch = 1
eval_step = 1000
batch_size = 4
learning_rate = 1e-5
max_grad_norm = 1.0
gradient_accumulation_steps = 1  # Increase if you still have memory issues

# Setup optimizer
optimizer = AdamW(orm.parameters(), lr=learning_rate)

# Enable mixed precision training for memory efficiency
scaler = torch.cuda.amp.GradScaler()

def collate_fn(examples):
    """Optimized collate function"""
    chosen_input = tokenizer(
        [ex['chosen'] for ex in examples], 
        return_tensors='pt', 
        padding=True,
        truncation=True,  # Add truncation to limit sequence length
        max_length=512  # Adjust based on your needs
    )
    rejected_input = tokenizer(
        [ex['rejected'] for ex in examples], 
        return_tensors='pt', 
        padding=True,
        truncation=True,
        max_length=512
    )
    return chosen_input, rejected_input

# Training loop
step = 0
orm.train()  # Ensure model is in training mode

for epoch_idx in range(epoch):
    dataloader = DataLoader(
        ds['train'], 
        batch_size=batch_size, 
        collate_fn=collate_fn,
        pin_memory=True,  # Faster data transfer to GPU
        num_workers=2  # Parallel data loading
    )
    
    for batch_idx, (chosen_input, rejected_input) in enumerate(tqdm(dataloader, leave=True)):
        # Move inputs to device
        chosen_input = {k: v.to(orm.device) for k, v in chosen_input.items()}
        rejected_input = {k: v.to(orm.device) for k, v in rejected_input.items()}
        
        # Use mixed precision training
        with torch.cuda.amp.autocast():
            # Forward pass
            reward_chosen = orm(**chosen_input)
            reward_rejected = orm(**rejected_input)
            
            # Calculate loss
            diff = reward_chosen - reward_rejected
            loss = torch.nn.functional.softplus(-diff).mean()
            
            # Scale loss for gradient accumulation
            loss = loss / gradient_accumulation_steps
        
        # Backward pass with gradient scaling
        scaler.scale(loss).backward()
        
        # Update weights only after accumulation steps
        if (batch_idx + 1) % gradient_accumulation_steps == 0:
            # Gradient clipping
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(orm.parameters(), max_grad_norm)
            
            # Optimizer step
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            
            step += 1
            
            # Evaluation logging
            if step % eval_step == 0:
                print(f"Step {step}, Loss: {loss.item() * gradient_accumulation_steps:.4f}")
        
        # Critical: Delete tensors and clear cache periodically
        del chosen_input, rejected_input, reward_chosen, reward_rejected, diff, loss
        
        if batch_idx % 50 == 0:  # Clear cache every 50 batches
            torch.cuda.empty_cache()
            gc.collect()

# Save model
torch.save(orm.state_dict(), "../reward_model.pt")
print("Training completed and model saved!")

# Final cleanup
torch.cuda.empty_cache()
gc.collect()

# train rm

In [7]:
epoch = 1

from transformers import DataCollatorWithPadding
from tqdm import tqdm 
import torch 
from torch.optim import AdamW
eval_step = 1000 
optimizer = AdamW(orm.parameters(), lr = 1e-5)

collator = DataCollatorWithPadding(tokenizer = tokenizer)


def collate_fn(examples):
    # print(examples)
    chosen_input = tokenizer([ex['chosen'] for ex in examples], return_tensors = 'pt', padding = True)
    rejected_input = tokenizer([ex['rejected']for ex in examples], return_tensors = 'pt', padding = True)

    return chosen_input, rejected_input

step = 0
for i in range(epoch):
    for chosen_input, rejected_input in tqdm(DataLoader(ds['train'], batch_size = 4, collate_fn = collate_fn), leave = True):
        # print(batch_data[0])
        reward_chosen = orm(**chosen_input.to(orm.device))
        reward_rejected = orm(**rejected_input.to(orm.device))

        diff = reward_chosen - reward_rejected
        loss = torch.nn.functional.softplus(-diff).mean() # negetive log sigmoid(diff)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(orm.parameters(), 1.0)
        optimizer.step()    

        step += 1
        if step % eval_step == 0:
            print(f"loss is {loss.item()}")

torch.save(orm.state_dict(), "../reward_model.pt")

  torch.nn.utils.clip_grad_norm(orm.parameters(), 1.0)
  0%|          | 39/40200 [00:23<6:50:52,  1.63it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 144.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 141.56 MiB is free. Including non-PyTorch memory, this process has 39.35 GiB memory in use. Of the allocated memory 35.37 GiB is allocated by PyTorch, and 3.48 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# test dataset 

# test best of N


In [None]:
# Cell
MAX_LEN = 200
def score(prompt, response):
    seq = (prompt + tokenizer.eos_token + response).strip()
    enc = tokenizer([seq], truncation=True, padding=True, return_tensors="pt", max_length=MAX_LEN).to(orm.device)
    with torch.no_grad():
        return orm(enc["input_ids"], enc["attention_mask"]).item()

prompt = "Q: How to boil an egg?"
cands = ["Boil 10 min then cool.","Just dunk it."]
for c in cands:
    print(score(prompt, c), c)


In [None]:
dir()

In [None]:
globals()

In [None]:
nonlocal()

# integrate trained reward model into PPO pipeline 

test the model before and after PPO

In [None]:
# Cell: PPO setup using TRL
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from trl.core import respond_to_batch
import numpy as np

# Create policy model for PPO (value head wrapper)
policy_name = 'Qwen/Qwen3-0.6B' # the same as reward model 
policy_tokenizer = AutoTokenizer.from_pretrained(policy_name)
policy_tokenizer.pad_token = policy_tokenizer.eos_token
policy = AutoModelForCausalLMWithValueHead.from_pretrained(policy_name).to(DEVICE)
ref_model = create_reference_model(policy)  # frozen reference

# PPO config (tune these)
ppo_config = PPOConfig(
    batch_size=4,
    forward_batch_size=1,
    ppo_epochs=4,
    learning_rate=1.41e-5,
    log_with=None,
    minibatch_size=1,
)

ppo_trainer = PPOTrainer(ppo_config, policy, ref_model, tokenizer=policy_tokenizer, dataset=None)  # dataset not required here

# Reward wrapper that uses our trained RM
def rm_reward_function(prompts, responses, batch_size=8):
    # prompts: list[str], responses: list[str]
    seqs = [p + policy_tokenizer.eos_token + r for p,r in zip(prompts, responses)]
    # batch scoring
    all_scores = []
    for i in range(0, len(seqs), batch_size):
        enc = tokenizer(seqs[i:i+batch_size], truncation=True, padding=True, return_tensors="pt", max_length=MAX_LEN).to(DEVICE)
        with torch.no_grad():
            sc = rm(enc["input_ids"], enc["attention_mask"]).cpu().numpy()
        all_scores.append(sc)
    all_scores = np.concatenate(all_scores, axis=0)
    # normalize per-batch (important for PPO stability)
    all_scores = (all_scores - all_scores.mean()) / (all_scores.std() + 1e-8)
    return all_scores

# Example PPO step loop (toy)
queries = ["Write a polite greeting.", "Give a short tip about productivity."]  # prompts
# generate responses with the policy (sampled)
responses = []
for q in queries:
    encoded_q = policy_tokenizer(q, return_tensors="pt").to(DEVICE)
    out = policy.generate(**encoded_q, do_sample=True, top_k=50, top_p=0.95, max_new_tokens=64)
    gen_text = policy_tokenizer.decode(out[0][encoded_q["input_ids"].shape[-1]:], skip_special_tokens=True)
    responses.append(gen_text)

# compute rewards using RM
rewards = rm_reward_function(queries, responses)
# Run a PPO step
# The TRL PPOTrainer expects tokenized tensors; the step API will accept raw texts in recent versions:
stats = ppo_trainer.step(queries, responses, rewards)
ppo_trainer.log_stats(stats, queries, rewards)
print("PPO step done, stats:", stats)
