# RL-CAI Training Notebook

This notebook implements the Reinforcement Learning stage of Constitutional AI (RL-CAI).

Steps:
1. Generate preference data using SL-CAI models
2. Train reward models
3. Fine-tune with PPO using constitutional feedback

**Important**: Run the cells in order, starting with Section 0!

## 0. Prerequisites - Run This First!

In [None]:
# Mount Google Drive for persistent storage
from google.colab import drive
import os
import sys

drive.mount('/content/drive')

# Set up paths
DRIVE_PROJECT_PATH = '/content/drive/MyDrive/Constitutional_AI_Project'
PROJECT_DIR = '/content/Constitutional_AI_Project'
GITHUB_REPO = 'https://github.com/ychleee/Constitutional_AI_Project.git'

# Clone or update repository
if not os.path.exists(PROJECT_DIR):
    print('📥 Cloning repository...')
    !git clone {GITHUB_REPO} {PROJECT_DIR}
else:
    print('📥 Updating repository...')
    !cd {PROJECT_DIR} && git pull origin main

# Add project to Python path
sys.path.append(PROJECT_DIR)

# Install required dependencies
print('📦 Installing dependencies...')
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers>=4.35.0 datasets>=2.14.0 accelerate>=0.24.0
!pip install -q peft>=0.6.0 trl>=0.7.0 bitsandbytes>=0.41.0
!pip install -q einops tensorboard wandb safetensors
!pip install -q jsonlines pandas numpy scikit-learn matplotlib seaborn tqdm rich

# Create necessary directories in Drive
directories = [
    f'{DRIVE_PROJECT_PATH}/data/red_team',
    f'{DRIVE_PROJECT_PATH}/data/helpfulness',
    f'{DRIVE_PROJECT_PATH}/data/sl_datasets',
    f'{DRIVE_PROJECT_PATH}/data/rl_datasets',
    f'{DRIVE_PROJECT_PATH}/data/evaluation',
    f'{DRIVE_PROJECT_PATH}/models/deontological/sl_cai',
    f'{DRIVE_PROJECT_PATH}/models/deontological/reward_model',
    f'{DRIVE_PROJECT_PATH}/models/deontological/rl_cai',
    f'{DRIVE_PROJECT_PATH}/models/consequentialist/sl_cai',
    f'{DRIVE_PROJECT_PATH}/models/consequentialist/reward_model',
    f'{DRIVE_PROJECT_PATH}/models/consequentialist/rl_cai',
    f'{DRIVE_PROJECT_PATH}/results/sl_training_logs',
    f'{DRIVE_PROJECT_PATH}/results/rl_training_logs',
    f'{DRIVE_PROJECT_PATH}/results/evaluation',
    f'{DRIVE_PROJECT_PATH}/results/figures'
]

for dir_path in directories:
    os.makedirs(dir_path, exist_ok=True)

print('✅ Prerequisites complete!')

In [None]:
import os
import sys
import json
import torch
from pathlib import Path

# Add project to path
# NOTE(review): the prerequisites cell clones the repository into
# '/content/Constitutional_AI_Project' (no "_v2" suffix) and creates its Drive
# folders under '.../MyDrive/Constitutional_AI_Project'. The "_v2" locations
# used below are kept as written — confirm which set of paths is intended.
PROJECT_DIR = "/content/Constitutional_AI_Project_v2"
sys.path.append(PROJECT_DIR)


def _small_gpu_config():
    """Return the Pythia-1.4B / INT8 settings used for T4 and for unrecognised GPUs."""
    return {
        "model": "EleutherAI/pythia-1.4b",
        "quantization": "int8",
        "batch_size": 2,
        "gradient_accumulation": 8,
        "max_length": 512,
        "lora_r": 16,
        "lora_alpha": 32,
        "learning_rate": 2e-5,
        "fp16": True,
        "gradient_checkpointing": True
    }


# Load configuration from setup, or create default if not found
CONFIG_PATH = '/content/current_config.json'

if os.path.exists(CONFIG_PATH):
    with open(CONFIG_PATH, 'r') as f:
        CONFIG = json.load(f)
    print(f"✅ Loaded existing config for: {CONFIG['model']}")
else:
    # Default configuration if setup notebook wasn't run
    print("⚠️ No config found, creating default configuration...")

    # Detect GPU and set appropriate config
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        if "T4" in gpu_name:
            CONFIG = _small_gpu_config()
            print("📱 Detected T4 GPU, using Pythia-1.4B with INT8")
        elif "A100" in gpu_name:
            CONFIG = {
                "model": "mistralai/Mistral-7B-Instruct-v0.2",
                "quantization": None,
                "batch_size": 4,
                "gradient_accumulation": 4,
                "max_length": 1024,
                "lora_r": 64,
                "lora_alpha": 128,
                "learning_rate": 1e-4,
                "bf16": True,
                "gradient_checkpointing": False
            }
            print("🚀 Detected A100 GPU, using Mistral-7B")
        else:
            # Default to small model for unknown GPU
            CONFIG = _small_gpu_config()
            print(f"🔧 Using default config for {gpu_name}")
    else:
        # The original message contained "\!", an invalid escape sequence that
        # printed a stray backslash; the backslash is removed here.
        print("❌ No GPU detected! This will be very slow.")
        CONFIG = {
            "model": "EleutherAI/pythia-410m",
            "quantization": None,
            "batch_size": 1,
            "gradient_accumulation": 1,
            "max_length": 256,
            "lora_r": 8,
            "lora_alpha": 16,
            "learning_rate": 2e-5,
            "fp16": False,
            "gradient_checkpointing": True
        }

    # Save config for other notebooks (only when it was freshly created)
    with open(CONFIG_PATH, 'w') as f:
        json.dump(CONFIG, f, indent=2)
    print(f"💾 Config saved to {CONFIG_PATH}")

# Paths
DRIVE_PROJECT_PATH = '/content/drive/MyDrive/Constitutional_AI_Project_v2'
DATA_PATH = f"{DRIVE_PROJECT_PATH}/data"
MODEL_PATH = f"{DRIVE_PROJECT_PATH}/models"

print(f"Using model: {CONFIG['model']}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

## 2. Generate Preference Data

In [None]:
from src.data.preference_generator import PreferenceGenerator
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import random

def load_sl_model(model_path, base_model_name):
    """Build the SL-CAI model: a base causal LM with the adapter at ``model_path`` attached.

    Args:
        model_path: Directory holding the SL-CAI adapter (LoRA) weights.
        base_model_name: Hub id or path of the base model the adapter was trained on.

    Returns:
        The ``PeftModel`` wrapping the float16 base model.
    """
    # The base network is loaded in half precision and placed by device_map="auto";
    # the adapter is then layered on top of it.
    return PeftModel.from_pretrained(
        AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        ),
        model_path,
    )

def generate_preferences_for_constitution(constitution_type, num_prompts=20, responses_per_prompt=4):
    """Generate and save preference pairs for one constitution.

    Loads the SL-CAI model for ``constitution_type``, samples several responses
    per red-team prompt, lets ``PreferenceGenerator`` turn them into preference
    pairs, and writes the pairs to
    ``{DATA_PATH}/rl_datasets/{constitution_type}_preferences.jsonl``.

    Args:
        constitution_type: Sub-folder name of the constitution
            (e.g. "deontological" or "consequentialist").
        num_prompts: How many red-team prompts to use (default 20).
        responses_per_prompt: How many responses to sample per prompt (default 4).

    Returns:
        Path of the written preference file.
    """
    import gc

    print(f"\n🔄 Generating preferences for {constitution_type} model...")

    # Load SL-CAI model
    model_path = f"{MODEL_PATH}/{constitution_type}/sl_cai"
    model = load_sl_model(model_path, CONFIG['model'])
    tokenizer = AutoTokenizer.from_pretrained(CONFIG['model'])
    # Some tokenizers ship without a pad token; fall back to EOS so that
    # generate() receives a real pad_token_id instead of None.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Initialize preference generator
    pref_generator = PreferenceGenerator(
        feedback_model_name=CONFIG['model'],  # Can use same or different model
        constitution_path=f"{PROJECT_DIR}/constitutions/{constitution_type}/principles.json",
        constitution_type=constitution_type,
        use_soft_labels=True,
        min_score_difference=0.1
    )

    # Load test prompts
    with open(f"{DATA_PATH}/red_team/sample_red_team.json", 'r') as f:
        red_team_data = json.load(f)
    prompts = [item['prompt'] for item in red_team_data['prompts'][:num_prompts]]

    def generate_response(prompt):
        """Sample one assistant reply for ``prompt`` from the SL-CAI model."""
        formatted = f"Human: {prompt}\n\nAssistant:"
        # Follow the model's own device instead of hard-coding "cuda":
        # the model was placed with device_map="auto".
        inputs = tokenizer(formatted, return_tensors="pt", truncation=True).to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                temperature=0.8 + random.random() * 0.4,  # Vary temperature
                do_sample=True,
                top_p=0.9,
                pad_token_id=tokenizer.pad_token_id
            )

        # Decode only the newly generated tokens. Splitting the full text on
        # "Assistant:" would discard the start of the reply whenever the model
        # itself emits another "Assistant:" marker.
        continuation = outputs[0][prompt_length:]
        return tokenizer.decode(continuation, skip_special_tokens=True).strip()

    try:
        # Generate preference pairs
        all_pairs = pref_generator.process_dataset(
            prompts=prompts,
            response_generator=generate_response,
            responses_per_prompt=responses_per_prompt,
            show_progress=True
        )

        # Save preference data
        output_path = f"{DATA_PATH}/rl_datasets/{constitution_type}_preferences.jsonl"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        pref_generator.save_preferences(all_pairs, output_path)

        print(f"✅ Generated {len(all_pairs)} preference pairs")
    finally:
        # Clean up even on failure so a retry does not start with the GPU
        # still occupied. Drop every local reference before emptying the cache.
        del model, pref_generator
        gc.collect()
        torch.cuda.empty_cache()

    return output_path

# Build one preference dataset per constitution; later cells consume these paths.
deont_pref_path = generate_preferences_for_constitution(
    constitution_type="deontological"
)
conseq_pref_path = generate_preferences_for_constitution(
    constitution_type="consequentialist"
)

## 3. Train Reward Models

In [None]:
from src.training.reward_model import RewardModelTrainer

def train_reward_model(constitution_type, pref_data_path):
    """Fit a reward model on one constitution's preference pairs.

    Args:
        constitution_type: Constitution sub-folder name; also selects the
            output directory ``{MODEL_PATH}/{constitution_type}/reward_model``.
        pref_data_path: Path to the preference file used as training data.

    Returns:
        The output directory handed to ``RewardModelTrainer``.
    """
    print(f"\n🎯 Training reward model for {constitution_type}...")

    output_dir = f"{MODEL_PATH}/{constitution_type}/reward_model"

    # Set up the trainer with soft preference labels enabled.
    trainer = RewardModelTrainer(
        model_name=CONFIG['model'],
        constitution_type=constitution_type,
        output_dir=output_dir,
        use_soft_labels=True,
    )

    # The reward model trains with twice the configured batch size.
    reward_batch_size = 2 * CONFIG.get('batch_size', 2)

    # No validation split is passed; two epochs at a fixed learning rate.
    trainer.train(
        train_data_path=pref_data_path,
        val_data_path=None,
        epochs=2,
        batch_size=reward_batch_size,
        learning_rate=1e-5,
    )

    print(f"✅ Reward model saved to {output_dir}")
    return output_dir

# One reward model per constitution, each trained on its own preference file.
deont_reward_path = train_reward_model(
    constitution_type="deontological",
    pref_data_path=deont_pref_path,
)
conseq_reward_path = train_reward_model(
    constitution_type="consequentialist",
    pref_data_path=conseq_pref_path,
)

## 4. PPO Training with TRL

In [None]:
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset

class PPODataset(Dataset):
    """Map-style dataset that serves raw prompts for PPO training.

    Each item is a one-key dict ``{"prompt": <prompt>}``.
    """

    def __init__(self, prompts):
        # Kept by reference; the sequence is not copied.
        self.prompts = prompts

    def __len__(self):
        """Number of prompts held."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Return the prompt at position ``idx`` wrapped in a dict."""
        return dict(prompt=self.prompts[idx])

def train_ppo(constitution_type, reward_model_path, num_prompts=50, num_epochs=2):
    """Fine-tune the SL-CAI policy with PPO, scored by the constitution's reward model.

    Args:
        constitution_type: Constitution sub-folder name; selects the SL-CAI
            starting point and the ``rl_cai`` output directory under MODEL_PATH.
        reward_model_path: Directory containing ``final_model/reward_model.pt``.
        num_prompts: Number of red-team prompts to train on (default 50).
        num_epochs: Number of passes over those prompts (default 2).

    Returns:
        Directory the RL-CAI model and tokenizer were saved to.
    """
    import gc
    import math

    print(f"\n🚀 PPO training for {constitution_type} model...")

    # Load SL-CAI model as starting point
    sl_model_path = f"{MODEL_PATH}/{constitution_type}/sl_cai"

    # PPOConfig rejects a batch_size that is not a multiple of
    # mini_batch_size * gradient_accumulation_steps (e.g. 2 vs 1 * 8).
    # With mini_batch_size=1, gcd() gives the largest accumulation count that
    # does not exceed the configured one and still divides the batch size.
    batch_size = CONFIG.get('batch_size', 2)
    mini_batch_size = 1
    grad_accum_steps = math.gcd(batch_size, CONFIG.get('gradient_accumulation', 8))

    # PPO configuration
    ppo_config = PPOConfig(
        model_name=CONFIG['model'],
        learning_rate=1e-5,
        batch_size=batch_size,
        mini_batch_size=mini_batch_size,
        gradient_accumulation_steps=grad_accum_steps,
        ppo_epochs=4,
        horizon=10000,
        target_kl=0.1,
        init_kl_coef=0.2,
        seed=42,
        use_score_scaling=True,
        use_score_norm=True,
        score_clip=10.0,
    )

    # Load models
    tokenizer = AutoTokenizer.from_pretrained(CONFIG['model'])
    tokenizer.pad_token = tokenizer.eos_token

    # Start from the SL-CAI policy. The original computed sl_model_path but
    # then loaded the plain base model. The adapter is merged into the base
    # weights so the saved RL-CAI checkpoint is a full model that
    # AutoModelForCausalLM can load.
    sl_policy = load_sl_model(sl_model_path, CONFIG['model']).merge_and_unload()
    # Attaching an adapter freezes the base weights; re-enable gradients so PPO
    # updates the policy and not only the value head.
    sl_policy.requires_grad_(True)

    # Wrap the policy with a value head for PPO
    model = AutoModelForCausalLMWithValueHead.from_pretrained(sl_policy)

    # Create PPO trainer
    ppo_trainer = PPOTrainer(
        config=ppo_config,
        model=model,
        tokenizer=tokenizer,
        dataset=None,  # We'll generate on the fly
        data_collator=None,
    )
    device = ppo_trainer.accelerator.device

    # Load reward model
    from src.training.reward_model import RewardModel
    reward_model = RewardModel(CONFIG['model'])
    reward_model.load_state_dict(
        torch.load(f"{reward_model_path}/final_model/reward_model.pt", map_location="cpu")
    )
    reward_model.eval()
    reward_model.to(device)

    # Load training prompts
    with open(f"{DATA_PATH}/red_team/sample_red_team.json", 'r') as f:
        red_team_data = json.load(f)
    prompts = [f"Human: {item['prompt']}\n\nAssistant:"
               for item in red_team_data['prompts'][:num_prompts]]

    # PPOTrainer.step only accepts exactly `batch_size` samples, so a ragged
    # final batch is skipped instead of raising.
    usable_prompts = len(prompts) - (len(prompts) % batch_size)
    if usable_prompts < len(prompts):
        print(f"⚠️ Skipping last {len(prompts) - usable_prompts} prompt(s): "
              f"not enough to fill a batch of {batch_size}")

    generation_kwargs = {
        "max_new_tokens": 150,
        "do_sample": True,
        "temperature": 0.7,
        "pad_token_id": tokenizer.pad_token_id,
    }

    # Training loop
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}")

        for batch_start in range(0, usable_prompts, batch_size):
            batch_prompts = prompts[batch_start:batch_start + batch_size]

            # Tokenize each prompt on its own: PPOTrainer works with lists of
            # unpadded 1-D tensors, not one padded 2-D batch.
            query_tensors = [
                tokenizer(prompt, return_tensors="pt", truncation=True)["input_ids"][0].to(device)
                for prompt in batch_prompts
            ]

            # Generate responses via the trainer, which pads the queries for
            # batched generation and returns only the continuation tokens.
            with torch.no_grad():
                response_tensors = ppo_trainer.generate(
                    query_tensors,
                    batch_size=batch_size,
                    return_prompt=False,
                    **generation_kwargs
                )

            # Get responses text
            responses = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

            # Compute rewards using reward model
            rewards = []
            for prompt, response in zip(batch_prompts, responses):
                # Format for reward model
                text = f"{prompt}{response}"
                encoding = tokenizer(
                    text,
                    return_tensors="pt",
                    truncation=True,
                    max_length=512
                ).to(device)

                with torch.no_grad():
                    reward = reward_model(
                        input_ids=encoding['input_ids'],
                        attention_mask=encoding['attention_mask']
                    )['logits'].squeeze().item()

                rewards.append(reward)

            # step() expects one scalar tensor per sample
            scores = [torch.tensor(reward, device=device) for reward in rewards]

            # PPO step
            ppo_trainer.step(query_tensors, response_tensors, scores)

            # Log stats
            if batch_start % 10 == 0:
                mean_reward = sum(rewards) / len(rewards)
                print(f"  Batch {batch_start}/{len(prompts)}: "
                      f"Reward: {mean_reward:.3f}")

    # Save model
    output_dir = f"{MODEL_PATH}/{constitution_type}/rl_cai"
    os.makedirs(output_dir, exist_ok=True)

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"✅ RL-CAI model saved to {output_dir}")

    # Clean up: the trainer also references the policy, so it must be dropped
    # too before the cache can actually be emptied.
    del ppo_trainer, model, sl_policy, reward_model
    gc.collect()
    torch.cuda.empty_cache()

    return output_dir

# PPO fine-tuning, once per constitution, each against its own reward model.
deont_rl_path = train_ppo(
    constitution_type="deontological",
    reward_model_path=deont_reward_path,
)
conseq_rl_path = train_ppo(
    constitution_type="consequentialist",
    reward_model_path=conseq_reward_path,
)

## 5. Quick Test of RL-CAI Models

In [None]:
def test_rl_model(model_path, prompt):
    """Generate one sampled reply from a saved RL-CAI model.

    Args:
        model_path: Directory containing the saved model and tokenizer.
        prompt: The human turn; it is wrapped as "Human: ...\\n\\nAssistant:".

    Returns:
        The generated continuation as stripped text.
    """
    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Fall back to EOS when the saved tokenizer defines no pad token, so
    # generate() is not handed pad_token_id=None.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Format prompt; follow the model's device rather than hard-coding "cuda",
    # since placement was delegated to device_map="auto".
    formatted = f"Human: {prompt}\n\nAssistant:"
    inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            pad_token_id=pad_token_id
        )

    # Decode only the new tokens: splitting the whole text on "Assistant:"
    # loses the beginning of the reply if the model emits that marker itself.
    continuation = outputs[0][prompt_length:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()

# Probe both RL-CAI models with the same ethical dilemma so the replies can be
# read side by side.
test_prompt = "Is it ever acceptable to lie to protect someone's feelings?"
print(f"Test prompt: {test_prompt}\n")

print("Deontological RL-CAI response:")
deont_response = test_rl_model(model_path=deont_rl_path, prompt=test_prompt)
print(deont_response)

# Visual divider between the two replies
print(f"\n{'=' * 50}\n")

print("Consequentialist RL-CAI response:")
conseq_response = test_rl_model(model_path=conseq_rl_path, prompt=test_prompt)
print(conseq_response)

## 6. Save Training Summary

In [None]:
import datetime

# Assemble the run summary: when and where training ran, which stages were
# completed, and where each constitution's artifacts were written.
rl_summary = {
    "training_date": datetime.datetime.now().isoformat(),
    "base_model": CONFIG['model'],
    "gpu_type": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU",
    "stages_completed": [
        "preference_generation",
        "reward_model_training",
        "ppo_training"
    ],
    "models": {
        constitution: {
            "sl_cai": f"{MODEL_PATH}/{constitution}/sl_cai",
            "reward_model": reward_dir,
            "rl_cai": rl_dir,
            "preferences": pref_file
        }
        for constitution, (reward_dir, rl_dir, pref_file) in {
            "deontological": (deont_reward_path, deont_rl_path, deont_pref_path),
            "consequentialist": (conseq_reward_path, conseq_rl_path, conseq_pref_path),
        }.items()
    },
    "config": CONFIG
}

# Persist the summary under the Drive project's results folder
summary_path = f"{DRIVE_PROJECT_PATH}/results/rl_training_summary.json"
os.makedirs(os.path.dirname(summary_path), exist_ok=True)

with open(summary_path, 'w') as summary_file:
    summary_file.write(json.dumps(rl_summary, indent=2))

# Closing banner and pointers to the follow-up work
for message in (
    f"✅ Training summary saved to {summary_path}",
    "\n🎉 RL-CAI training complete!",
    "\nNext steps:",
    "1. Run 03_evaluation_colab.ipynb to evaluate the models",
    "2. Compare responses on divergent ethical dilemmas",
    "3. Analyze chain-of-thought reasoning patterns",
):
    print(message)