# Constitutional AI - Setup Notebook

This notebook sets up the complete environment for Constitutional AI training on Google Colab.

Features:
- Automatic GPU detection and configuration
- Google Drive persistence for data and models
- GitHub repository synchronization
- Automatic data download and preprocessing

## 0. Prerequisites - Run This First!

In [None]:
# Mount Google Drive for persistent storage
from google.colab import drive
import os
import sys

drive.mount('/content/drive')

# Set up paths - UPDATED FOR V2
DRIVE_PROJECT_PATH = '/content/drive/MyDrive/Constitutional_AI_Project_v2'
PROJECT_DIR = '/content/Constitutional_AI_Project_v2'
GITHUB_REPO = 'https://github.com/ychleee/CAI_project.git'

# Clone or update repository
if not os.path.exists(PROJECT_DIR):
    print('üì• Cloning repository...')
    !git clone {GITHUB_REPO} {PROJECT_DIR}
else:
    print('üì• Updating repository...')
    !cd {PROJECT_DIR} && git pull origin main

# Add project to Python path
sys.path.append(PROJECT_DIR)

# Install required dependencies
print('üì¶ Installing dependencies...')
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers>=4.35.0 datasets>=2.14.0 accelerate>=0.24.0
!pip install -q peft>=0.6.0 trl>=0.7.0 bitsandbytes>=0.41.0
!pip install -q einops tensorboard wandb safetensors
!pip install -q jsonlines pandas numpy scikit-learn matplotlib seaborn tqdm rich

# Create necessary directories in Drive for V2.
# The layout is described per top-level area; flattening it yields the data
# folders first, then one folder per (ethical framework, training stage) pair
# under models/, then the results folders.
_drive_layout = {
    'data': ['red_team', 'helpfulness', 'sl_datasets', 'rl_datasets', 'evaluation'],
    'models': [
        f'{framework}/{stage}'
        for framework in ('deontological', 'consequentialist')
        for stage in ('sl_cai', 'reward_model', 'rl_cai')
    ],
    'results': ['sl_training_logs', 'rl_training_logs', 'evaluation', 'figures'],
}

directories = [
    f'{DRIVE_PROJECT_PATH}/{area}/{leaf}'
    for area, leaves in _drive_layout.items()
    for leaf in leaves
]

# exist_ok=True makes this safe to run against an already-populated Drive.
for dir_path in directories:
    os.makedirs(dir_path, exist_ok=True)

print('‚úÖ Prerequisites complete for v2 project!')

## 1. Setup Environment and Configuration

In [None]:
import torch
import json
from pathlib import Path

# Function to get correct LoRA target modules for each model
def get_target_modules(model_name):
    """Pick the LoRA target module names for the architecture named in ``model_name``.

    The match is a case-insensitive substring test against a fixed list of
    model families, tried in order; the first family that matches wins.

    Args:
        model_name: Model identifier, e.g. ``"EleutherAI/pythia-1.4b"``.

    Returns:
        list[str]: Module names to adapt. When no family matches, a warning is
        printed and ``["q_proj", "v_proj"]`` is returned.
    """
    # (keywords, modules) pairs in priority order. Built per call so every
    # caller receives its own list object.
    known_families = (
        (("pythia",),
         ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]),
        (("mistral", "llama"),
         ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]),
        (("gpt2", "gpt-2"),
         ["c_attn", "c_proj"]),
        (("opt",),
         ["q_proj", "v_proj", "k_proj", "out_proj"]),
    )

    lowered = model_name.lower()
    for keywords, modules in known_families:
        if any(keyword in lowered for keyword in keywords):
            return modules

    # Default fallback
    print(f"‚ö†Ô∏è Unknown model architecture for {model_name}, using default target modules")
    return ["q_proj", "v_proj"]

# Detect GPU and create appropriate configuration
def create_config():
    """Build the training configuration matching the detected hardware.

    Prints what was detected, selects one preset (T4, A100, V100, any other
    CUDA GPU, or CPU when CUDA is unavailable), and attaches the LoRA target
    modules that ``get_target_modules`` returns for the preset's model.

    Returns:
        dict: ``model``, ``quantization``, ``batch_size``,
        ``gradient_accumulation``, ``max_length``, ``lora_r``, ``lora_alpha``,
        ``learning_rate``, ``fp16``, ``bf16``, ``gradient_checkpointing``,
        ``gpu_type`` and ``lora_target_modules``.
    """

    def preset(gpu_type, *, model, quantization, batch_size, gradient_accumulation,
               max_length, lora_r, lora_alpha, learning_rate, fp16, bf16,
               gradient_checkpointing):
        # Single place that fixes the key order shared by every preset.
        return {
            "model": model,
            "quantization": quantization,
            "batch_size": batch_size,
            "gradient_accumulation": gradient_accumulation,
            "max_length": max_length,
            "lora_r": lora_r,
            "lora_alpha": lora_alpha,
            "learning_rate": learning_rate,
            "fp16": fp16,
            "bf16": bf16,
            "gradient_checkpointing": gradient_checkpointing,
            "gpu_type": gpu_type,
        }

    # Settings used both for the T4 and for GPUs that match no known name.
    small_gpu_settings = dict(
        model="EleutherAI/pythia-1.4b",
        quantization="int8",
        batch_size=2,
        gradient_accumulation=8,
        max_length=512,
        lora_r=16,
        lora_alpha=32,
        learning_rate=2e-5,
        fp16=True,
        bf16=False,
        gradient_checkpointing=True,
    )

    if not torch.cuda.is_available():
        print("‚ùå No GPU detected! Using CPU configuration (very slow)")
        config = preset(
            "cpu",
            model="EleutherAI/pythia-410m",
            quantization=None,
            batch_size=1,
            gradient_accumulation=1,
            max_length=256,
            lora_r=8,
            lora_alpha=16,
            learning_rate=2e-5,
            fp16=False,
            bf16=False,
            gradient_checkpointing=True,
        )
    else:
        device_props = torch.cuda.get_device_properties(0)
        gpu_name = device_props.name
        gpu_memory = device_props.total_memory / 1024**3  # Convert to GB

        print(f"‚úÖ GPU Available: {gpu_name}")
        print(f"   Memory: {gpu_memory:.2f} GB")

        # Presets are chosen by substring match on the reported device name.
        if "T4" in gpu_name:
            print("üì± Using T4 configuration (Free Colab)")
            config = preset("t4", **small_gpu_settings)
        elif "A100" in gpu_name:
            print("üöÄ Using A100 configuration (Colab Pro/Pro+)")
            config = preset(
                "a100",
                model="mistralai/Mistral-7B-Instruct-v0.2",
                quantization=None,
                batch_size=4,
                gradient_accumulation=4,
                max_length=1024,
                lora_r=64,
                lora_alpha=128,
                learning_rate=1e-4,
                fp16=False,
                bf16=True,
                gradient_checkpointing=False,
            )
        elif "V100" in gpu_name:
            print("‚ö° Using V100 configuration (Colab Pro)")
            config = preset(
                "v100",
                model="mistralai/Mistral-7B-Instruct-v0.2",
                quantization="int4",
                batch_size=2,
                gradient_accumulation=8,
                max_length=768,
                lora_r=32,
                lora_alpha=64,
                learning_rate=1e-4,
                fp16=True,
                bf16=False,
                gradient_checkpointing=True,
            )
        else:
            print(f"üîß Using default configuration for {gpu_name}")
            config = preset("unknown", **small_gpu_settings)

    # Add target modules based on model
    config["lora_target_modules"] = get_target_modules(config["model"])

    return config

# Create and save configuration: build the hardware-specific settings, write
# them to CONFIG_PATH as indented JSON, then echo the key values.
CONFIG = create_config()
CONFIG_PATH = '/content/current_config.json'

serialized_config = json.dumps(CONFIG, indent=2)
with open(CONFIG_PATH, 'w') as f:
    f.write(serialized_config)

# Samples contributing to one optimizer step.
effective_batch_size = CONFIG['batch_size'] * CONFIG['gradient_accumulation']

print(f"\nüíæ Configuration saved to {CONFIG_PATH}")
for label, value in (
    ("Model", CONFIG['model']),
    ("LoRA target modules", CONFIG['lora_target_modules']),
    ("Batch size", CONFIG['batch_size']),
    ("Effective batch size", effective_batch_size),
):
    print(f"üìä {label}: {value}")

## 2. Download Anthropic HH-RLHF Data

In [None]:
import json
import os

# Check if data already exists
RED_TEAM_PATH = f"{DRIVE_PROJECT_PATH}/data/red_team/sample_red_team.json"
HELPFUL_PATH = f"{DRIVE_PROJECT_PATH}/data/helpfulness/sample_helpful.json"

if os.path.exists(RED_TEAM_PATH) and os.path.exists(HELPFUL_PATH):
    print("‚úÖ Data already exists in Drive")
else:
    print("üì• Downloading Anthropic HH-RLHF data...")
    
    # Install datasets library if not already installed
    !pip install -q datasets
    
    from datasets import load_dataset
    
    # Load Anthropic HH-RLHF dataset
    print("Loading dataset from Hugging Face...")
    dataset = load_dataset("Anthropic/hh-rlhf", split="train[:1000]")  # Load first 1000 samples
    
    # Process and create sample datasets
    red_team_samples = []
    helpful_samples = []
    
    harmful_keywords = ['hack', 'steal', 'kill', 'hurt', 'illegal', 'weapon', 'drug', 'violence']
    
    for item in dataset:
        if 'chosen' in item:
            # Extract prompt from chosen response
            text = item['chosen']
            if 'Human:' in text:
                prompt = text.split('Human:')[1].split('Assistant:')[0].strip()
                
                # Categorize as harmful or helpful
                is_harmful = any(keyword in prompt.lower() for keyword in harmful_keywords)
                
                if is_harmful and len(red_team_samples) < 100:
                    red_team_samples.append({
                        "prompt": prompt,
                        "category": "red_team",
                        "source": "hh-rlhf"
                    })
                elif not is_harmful and len(helpful_samples) < 100:
                    helpful_samples.append({
                        "prompt": prompt,
                        "category": "helpful",
                        "source": "hh-rlhf"
                    })
    
    # Save red team data
    red_team_data = {
        "metadata": {
            "source": "Anthropic HH-RLHF",
            "total_prompts": len(red_team_samples)
        },
        "prompts": red_team_samples
    }
    
    with open(RED_TEAM_PATH, 'w') as f:
        json.dump(red_team_data, f, indent=2)
    
    print(f"‚úÖ Saved {len(red_team_samples)} red team samples to {RED_TEAM_PATH}")
    
    # Save helpful data
    helpful_data = {
        "metadata": {
            "source": "Anthropic HH-RLHF",
            "total_prompts": len(helpful_samples)
        },
        "prompts": helpful_samples
    }
    
    with open(HELPFUL_PATH, 'w') as f:
        json.dump(helpful_data, f, indent=2)
    
    print(f"‚úÖ Saved {len(helpful_samples)} helpful samples to {HELPFUL_PATH}")

print("\nüìä Data ready for training!")

## 3. Test Model Loading

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def test_model_loading():
    """Smoke-test that ``CONFIG['model']`` loads and generates with the current settings.

    Loads the tokenizer and model (applying the quantization or precision that
    CONFIG selects), generates a short sampled completion for a fixed prompt
    and prints it. Any exception is caught and reported together with
    troubleshooting hints instead of being raised. References to the model
    and its tensors are dropped and the CUDA cache is emptied whether or not
    the test succeeded.

    Returns:
        None
    """
    import gc

    model_name = CONFIG['model']
    print(f"üß™ Testing model loading: {model_name}")

    # Pre-declared so the cleanup in `finally` works wherever a failure happens.
    model = None
    inputs = None
    outputs = None
    succeeded = False

    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # Fall back to EOS as the padding token when none is defined.
            tokenizer.pad_token = tokenizer.eos_token
        print("‚úÖ Tokenizer loaded")

        # Prepare model kwargs
        # NOTE(review): trust_remote_code=True allows code shipped with the
        # checkpoint repository to run at load time; kept as in the original —
        # confirm it is actually required for the configured models.
        model_kwargs = {
            "device_map": "auto",
            "trust_remote_code": True
        }

        # Add quantization if specified. bitsandbytes options go through
        # BitsAndBytesConfig rather than the deprecated load_in_8bit /
        # load_in_4bit keyword arguments of from_pretrained.
        quantization = CONFIG.get('quantization')
        if quantization in ('int8', 'int4'):
            from transformers import BitsAndBytesConfig

            if quantization == 'int8':
                model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
            else:
                model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                )
        elif CONFIG.get('bf16'):
            model_kwargs["torch_dtype"] = torch.bfloat16
        elif CONFIG.get('fp16'):
            model_kwargs["torch_dtype"] = torch.float16
        else:
            model_kwargs["torch_dtype"] = torch.float32

        # Load model
        model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
        print("‚úÖ Model loaded successfully")

        # Test generation. Inputs follow the model's own device, so the CPU
        # configuration works as well as the GPU ones.
        test_prompt = "Hello, how are you today?"
        inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"\nüìù Test generation successful:")
        print(f"Input: {test_prompt}")
        print(f"Output: {response}")
        succeeded = True

    except Exception as e:
        print(f"‚ùå Error loading model: {e}")
        print("\nTroubleshooting:")
        print("1. Check GPU memory: !nvidia-smi")
        print("2. Try restarting runtime: Runtime -> Restart runtime")
        print("3. Use smaller model or more aggressive quantization")

    finally:
        # Clean up on success and on failure alike: drop references first so
        # the cached blocks can actually be returned by empty_cache().
        del model, inputs, outputs
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        if succeeded:
            print("\n‚úÖ Model test complete. Memory cleared.")

# Test the model
test_model_loading()

## 4. Utility Functions

In [None]:
# Utility functions for training

def get_gpu_memory():
    """Print current GPU memory usage and return the free amount.

    Returns:
        Free device memory in GB (bytes / 1024**3) when CUDA is available,
        otherwise ``0``. Nothing is printed when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return 0

    bytes_per_gb = 1024**3

    # Query the driver once so that free and total belong to the same snapshot.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    free = free_bytes / bytes_per_gb
    total = total_bytes / bytes_per_gb

    print("GPU Memory Status:")
    print(f"  Allocated: {allocated:.2f} GB")
    print(f"  Reserved: {reserved:.2f} GB")
    print(f"  Free: {free:.2f} GB")
    print(f"  Total: {total:.2f} GB")
    return free

def clear_gpu_memory():
    """Run the garbage collector and, when CUDA is available, empty its cache.

    The confirmation message is printed in either case.
    """
    import gc

    gc.collect()

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        # Empty the cache first, then synchronize.
        for cuda_step in (torch.cuda.empty_cache, torch.cuda.synchronize):
            cuda_step()

    print("‚úÖ GPU memory cleared")

def save_checkpoint_to_drive(model, tokenizer, path):
    """Save ``model`` and ``tokenizer`` under the project's Drive ``models`` folder.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        path: Sub-path appended to ``{DRIVE_PROJECT_PATH}/models/``; the
            directory is created when missing.
    """
    save_path = f"{DRIVE_PROJECT_PATH}/models/{path}"
    os.makedirs(save_path, exist_ok=True)

    # Model first, then tokenizer, both into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_path)

    print(f"‚úÖ Checkpoint saved to {save_path}")

# Check GPU memory: prints the usage report when CUDA is available. As the
# last expression of the cell, the returned free-memory value (0 without CUDA)
# is also shown as the cell output.
get_gpu_memory()

## 5. Setup Complete!

In [None]:
print("üéâ Setup Complete!\n")
print("="*50)
print("Summary:")
print(f"  GPU: {CONFIG.get('gpu_type', 'unknown').upper()}")
print(f"  Model: {CONFIG['model']}")
print(f"  Quantization: {CONFIG.get('quantization', 'None')}")
print(f"  Project Dir: {PROJECT_DIR}")
print(f"  Drive Dir: {DRIVE_PROJECT_PATH}")
print(f"  Config saved: {CONFIG_PATH}")
print("="*50)
print("\nüìö Next Steps:")
print("  1. Run 01_sl_training_colab.ipynb for SL-CAI training")
print("  2. Run 02_rl_training_colab.ipynb for RL-CAI training")
print("  3. Run 03_evaluation_colab.ipynb to evaluate models")
print("\nüí° Tips:")
print("  - Monitor GPU: !nvidia-smi")
print("  - Clear memory: clear_gpu_memory()")
print("  - Check memory: get_gpu_memory()")