# Reward Model & PPO Training

**RLHF Phase:** Train a reward model on generated preference pairs, then optimize the SFT model against it with PPO.

In [None]:
!pip install -q transformers datasets peft trl accelerate wandb

In [None]:
import sys; sys.path.append('../')
import torch, json, os
from src.data.loader import DatasetLoader
from src.data.processor import DataProcessor
from src.data.generator import PreferenceGenerator
from src.model.base import load_base_model, load_tokenizer
from src.model.reward import RewardModel
from src.training.reward import RewardModelTrainer
from src.training.ppo import PPOTrainer
from src.utils.metrics import MetricsTracker

# Optional Weights & Biases tracking - no API key required.
# wandb.init is called with mode='disabled' (not 'offline'), so the later
# wandb.log / wandb.finish calls are accepted but nothing is recorded.
# NOTE(review): the status line below still prints "enabled (offline)";
# confirm whether mode='offline' was actually intended.
try:
    import wandb
    wandb.init(project='persona-chatbot-rlhf', name='reward-ppo', mode='disabled')
    USE_WANDB = True
except Exception:
    # wandb is not installed or failed to initialise: fall back to a stub so
    # the rest of the notebook can call wandb.log / wandb.finish unchanged.
    # Catching Exception (instead of a bare except) lets KeyboardInterrupt
    # and SystemExit propagate.
    USE_WANDB = False

    class wandb:
        """No-op stand-in exposing the two wandb calls used in this notebook."""

        @staticmethod
        def log(*args, **kwargs):
            pass

        @staticmethod
        def finish():
            pass

print(f'W&B: {"enabled (offline)" if USE_WANDB else "disabled"}')
print(f'GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"}')

In [None]:
# Hyperparameters for the three stages of this notebook.

# Passed to RewardModel as its config.
reward_config = {
    'num_labels': 1,
    'lora_r': 8,
    'lora_alpha': 16,
}

# Passed to RewardModelTrainer; 'output_dir' is also where the final reward model is saved.
reward_train_config = {
    'output_dir': '../models/reward',
    'num_epochs': 1,
    'per_device_batch_size': 2,
    'gradient_accumulation_steps': 8,
    'learning_rate': 1e-5,
    'warmup_steps': 100,
}

# Passed to PPOTrainer; 'output_dir' is created before PPO training starts.
ppo_config = {
    'model_name': 'gpt2-medium',
    'tokenizer_name': 'gpt2-medium',
    'output_dir': '../models/rlhf',
    'total_steps': 5000,
    'batch_size': 8,
    'learning_rate': 1.5e-5,
    'ppo_epochs': 4,
    'kl_coef': 0.2,
    'clip_range': 0.2,
    'vf_coef': 0.1,
    'gamma': 1.0,
    'lam': 0.95,
    'max_new_tokens': 150,
    'temperature': 0.9,
    'save_freq': 500,
}

## 1. Load SFT Model

In [None]:
# Load the model and tokenizer from the SFT checkpoint directory
# (produced by notebook 3).
sft_path = '../models/sft/final'
model = load_base_model(dict(name=sft_path, device_map='auto'))
tokenizer = load_tokenizer(dict(name=sft_path))
print(f'✅ SFT model loaded from {sft_path}')

## 2. Generate Preference Pairs

In [None]:
# Build preference pairs for reward-model training.
# 1) Load the train split (load_personachat is called with use_synthetic=True).
loader = DatasetLoader()
train = loader.load_personachat(split='train', use_synthetic=True)

# 2) Preprocess only the train[:5000] slice to keep this step small.
processor = DataProcessor(config=dict(base_model='gpt2-medium', max_length=512))
processed = processor.preprocess(train[:5000])

# 3) Generate the pairs, passing the loaded SFT model to the generator.
generator = PreferenceGenerator(config=dict())
pairs = generator.generate_pairs(processed, model=model)
print(f'Generated {len(pairs)} preference pairs')

## 3. Train Reward Model

In [None]:
# Initialize the reward model, train it on the preference pairs, and save it.
reward_model = RewardModel(base_model='gpt2-medium', config=reward_config)
print('Training reward model...')

# Train/eval split: keep the original cut at index 4000 whenever that leaves
# pairs for evaluation. With 4000 or fewer pairs a fixed index of 4000 would
# hand the trainer an empty evaluation slice, so use an 80/20 split instead.
split_idx = 4000 if len(pairs) > 4000 else int(0.8 * len(pairs))
reward_trainer = RewardModelTrainer(reward_model, pairs[:split_idx], pairs[split_idx:], reward_train_config)
reward_results = reward_trainer.train()

# Save under the trainer's output directory, in a 'final' subfolder.
reward_model.save(f"{reward_train_config['output_dir']}/final")
print('✅ Reward model trained')

## 4. Prepare for PPO

In [None]:
# Collect the PPO prompts from the preprocessed data.
prompts = generator.extract_prompts(processed)
print(f'Prepared {len(prompts)} prompts for PPO')

# Load a second copy of the SFT checkpoint to serve as the reference model:
# switch it to eval mode and turn off gradients on every parameter.
ref_model = load_base_model(dict(name=sft_path, device_map='auto'))
ref_model.eval()
for param in ref_model.parameters():
    param.requires_grad = False
print('✅ Reference model created')

## 5. PPO Training

In [None]:
# Make sure the PPO output directory exists before the trainer is built.
os.makedirs(ppo_config['output_dir'], exist_ok=True)

# Build the PPO trainer; the tokenizer must be passed explicitly along with
# the policy, reward, and reference models.
ppo_trainer = PPOTrainer(
    policy_model=model,
    reward_model=reward_model,
    ref_model=ref_model,
    config=ppo_config,
    tokenizer=tokenizer,
)

# Run PPO over the prepared prompts, then report and log the final reward.
print('🚀 Starting PPO training...')
ppo_results = ppo_trainer.train(prompts)
print(f'✅ PPO complete! Final reward: {ppo_results["final_reward"]:.4f}')
wandb.log({'final_ppo_reward': ppo_results['final_reward']})

In [None]:
# Close the wandb run (or call the stub's no-op finish when wandb is
# unavailable), then point to the next notebook.
wandb.finish()
print('✅ Complete! Next: 5_evaluation.ipynb')