# DPO Training for Psychologist Agent

This notebook performs DPO (Direct Preference Optimization) fine-tuning on Llama-3.1-8B-Instruct.

**Environment**: Google Colab with T4 GPU (15GB VRAM)

## Workflow
1. Install dependencies
2. Load configuration
3. Load DPO dataset
4. Log in to HuggingFace (required for the gated Llama model)
5. Load base model with QLoRA 4-bit
6. Initialize DPOTrainer
7. Train for 3 epochs
8. Save LoRA adapter weights
9. Merge weights with base model
10. Convert to GGUF format
11. Verify the quantized model
12. Download model

## 1. Setup and Installation

In [None]:
# Install required packages
!pip install -q transformers>=4.40.0
!pip install -q trl>=0.8.0
!pip install -q peft>=0.10.0
!pip install -q bitsandbytes>=0.43.0
!pip install -q accelerate>=0.28.0
!pip install -q datasets
!pip install -q pyyaml
!pip install -q torch --upgrade

In [None]:
# Report the PyTorch build and, when a CUDA device is visible, its name and total memory
import torch

cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display
    print(f"Memory: {gpu_props.total_memory / 1024**3:.1f} GB")

In [None]:
import json
import os
import yaml
import gc
from pathlib import Path
from datetime import datetime

import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)
from trl import DPOTrainer, DPOConfig

## 2. Configuration

In [None]:
# Configuration (mirrors configs/dpo_config.yaml)
# NOTE(review): if configs/dpo_config.yaml specifies a bfloat16 compute dtype,
# update it to stay in sync with the float16 setting below.
CONFIG = {
    # Model
    "base_model": "meta-llama/Llama-3.1-8B-Instruct",

    # Quantization
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    # float16 (not bfloat16): the notebook targets a T4 and trains with fp16=True,
    # so the 4-bit compute dtype is kept consistent with the trainer's precision.
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": True,

    # LoRA
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],

    # Training
    "num_epochs": 3,
    "batch_size": 2,
    "gradient_accumulation_steps": 8,
    "learning_rate": 5e-5,
    "beta": 0.1,
    "max_length": 1024,
    "max_prompt_length": 512,
    "warmup_ratio": 0.1,

    # Paths
    "train_file": "data/dpo/train.jsonl",
    "eval_file": "data/dpo/eval.jsonl",
    "output_dir": "outputs/dpo-training",
    "adapter_dir": "outputs/dpo-adapter",
    "merged_dir": "outputs/merged-model",
    "gguf_output": "models/psychologist-8b-q4_k_m.gguf",

    # GGUF
    "quantization_type": "Q4_K_M"
}

# Create every directory the later cells write into; exist_ok keeps re-runs harmless
for dir_key in ["output_dir", "adapter_dir", "merged_dir"]:
    os.makedirs(CONFIG[dir_key], exist_ok=True)
os.makedirs("models", exist_ok=True)
os.makedirs("data/dpo", exist_ok=True)

print("Configuration loaded!")

## 3. Load Dataset

**Option A**: Clone from GitHub  
**Option B**: Upload file directly  
**Option C**: Mount Google Drive

In [None]:
# Option A: Clone from GitHub (uncomment if needed)
# !git clone https://github.com/yuchangyuan1/6895_project_Agent.git temp_repo
# !cp temp_repo/data/dpo/*.jsonl data/dpo/
# !rm -rf temp_repo

# Option B: Upload files directly
# from google.colab import files
# uploaded = files.upload()  # Upload train.jsonl and eval.jsonl
# !mv train.jsonl data/dpo/
# !mv eval.jsonl data/dpo/

# Option C: Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
# !cp /content/drive/MyDrive/dpo_data/*.jsonl data/dpo/

In [None]:
def load_dpo_dataset(filepath: str) -> Dataset:
    """Load a DPO preference dataset from a JSONL file.

    Each non-blank line must be a JSON object with "prompt", "chosen" and
    "rejected" keys. Blank lines (e.g. a trailing newline at end of file) are
    skipped instead of crashing the JSON parser.

    Args:
        filepath: Path to the UTF-8 encoded JSONL file.

    Returns:
        A `Dataset` with "prompt", "chosen" and "rejected" columns, one row per record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the three required keys.
    """
    records = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines; json.loads("") would raise
                continue
            records.append(json.loads(line))

    # Convert to HuggingFace Dataset format (column-oriented dict of lists)
    dataset = Dataset.from_dict({
        "prompt": [r["prompt"] for r in records],
        "chosen": [r["chosen"] for r in records],
        "rejected": [r["rejected"] for r in records]
    })

    print(f"Loaded {len(dataset)} records from {filepath}")
    return dataset

# Build the train and eval splits from the JSONL paths given in CONFIG
train_dataset = load_dpo_dataset(CONFIG["train_file"])
eval_dataset = load_dpo_dataset(CONFIG["eval_file"])

# Report split sizes, then preview the first training prompt (first 100 characters)
n_train, n_eval = len(train_dataset), len(eval_dataset)
print(f"\nTrain samples: {n_train}")
print(f"Eval samples: {n_eval}")
print("\nSample record:")
first_prompt = train_dataset[0]["prompt"]
print(f"Prompt: {first_prompt[:100]}...")

## 4. HuggingFace Login

In [None]:
import os

from huggingface_hub import login

# A HuggingFace token with Llama model access is required.
# Get token from: https://huggingface.co/settings/tokens
# Request access at: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
#
# The token is read from the HF_TOKEN environment variable so it never has to be
# written into the notebook (and cannot leak when the notebook is shared or committed).
HF_TOKEN = os.environ.get("HF_TOKEN", "")

if HF_TOKEN:
    login(token=HF_TOKEN)
    print("Logged in to HuggingFace")
else:
    # No interactive prompt is started automatically, so say what the user must do
    print(
        "WARNING: No HuggingFace token found. Set the HF_TOKEN environment variable "
        "or uncomment login() below for an interactive login."
    )
    # login()  # Uncomment for interactive login

## 5. Load Base Model with QLoRA

In [None]:
def load_model_and_tokenizer(config: dict):
    """Load the 4-bit quantized base model, its tokenizer, and a LoRA configuration.

    Args:
        config: Settings dict. Keys read here: "base_model", "load_in_4bit",
            "bnb_4bit_quant_type", "bnb_4bit_compute_dtype",
            "bnb_4bit_use_double_quant", "lora_r", "lora_alpha", "lora_dropout"
            and "target_modules".

    Returns:
        A `(model, tokenizer, lora_config)` tuple. The LoRA config is only
        constructed and returned; this function does not attach an adapter to the model.
    """
    # Single source of truth for precision: the model's non-quantized weights are
    # loaded in the same dtype that the 4-bit kernels compute in.
    compute_dtype = config["bnb_4bit_compute_dtype"]

    # BitsAndBytes configuration
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=config["load_in_4bit"],
        bnb_4bit_quant_type=config["bnb_4bit_quant_type"],
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=config["bnb_4bit_use_double_quant"]
    )

    print(f"Loading tokenizer: {config['base_model']}")
    tokenizer = AutoTokenizer.from_pretrained(
        config["base_model"],
        trust_remote_code=True
    )

    # Fall back to EOS as the padding token when the tokenizer defines none
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"  # Required for batch generation

    print(f"Loading model with 4-bit quantization: {config['base_model']}")
    model = AutoModelForCausalLM.from_pretrained(
        config["base_model"],
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=compute_dtype
    )

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)

    # Enable gradient checkpointing for memory efficiency
    model.gradient_checkpointing_enable()

    # LoRA configuration
    lora_config = LoraConfig(
        r=config["lora_r"],
        lora_alpha=config["lora_alpha"],
        lora_dropout=config["lora_dropout"],
        target_modules=config["target_modules"],
        bias="none",
        task_type="CAUSAL_LM"
    )

    print(f"\nLoRA config:")
    print(f"  r={config['lora_r']}, alpha={config['lora_alpha']}")
    print(f"  targets={config['target_modules']}")

    # Print memory usage (bytes allocated by tensors, shown in GiB)
    if torch.cuda.is_available():
        memory_used = torch.cuda.memory_allocated() / 1024**3
        print(f"\nGPU memory used: {memory_used:.2f} GB")

    return model, tokenizer, lora_config

# Load model
model, tokenizer, lora_config = load_model_and_tokenizer(CONFIG)

## 6. Initialize DPO Trainer

In [None]:
# DPO training configuration, grouped by concern and assembled in one call

# Schedule and optimizer settings
optimization_args = dict(
    num_train_epochs=CONFIG["num_epochs"],
    per_device_train_batch_size=CONFIG["batch_size"],
    per_device_eval_batch_size=CONFIG["batch_size"],
    gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
    learning_rate=CONFIG["learning_rate"],
    lr_scheduler_type="cosine",
    warmup_ratio=CONFIG["warmup_ratio"],
    optim="paged_adamw_32bit",
)

# DPO-specific settings
preference_args = dict(
    beta=CONFIG["beta"],  # DPO temperature
    max_length=CONFIG["max_length"],
    max_prompt_length=CONFIG["max_prompt_length"],
)

# Precision, checkpointing, logging and bookkeeping
runtime_args = dict(
    output_dir=CONFIG["output_dir"],
    fp16=True,
    gradient_checkpointing=True,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    save_total_limit=2,
    remove_unused_columns=False,
    seed=42,
    report_to="none",  # Set to "wandb" if using W&B
)

dpo_config = DPOConfig(**runtime_args, **optimization_args, **preference_args)

# Echo the key hyperparameters as resolved by the config object
effective_batch_size = (
    dpo_config.per_device_train_batch_size * dpo_config.gradient_accumulation_steps
)
config_summary = [
    ("Epochs", dpo_config.num_train_epochs),
    ("Batch size", dpo_config.per_device_train_batch_size),
    ("Gradient accumulation", dpo_config.gradient_accumulation_steps),
    ("Effective batch size", effective_batch_size),
    ("Learning rate", dpo_config.learning_rate),
    ("Beta (DPO temp)", dpo_config.beta),
]
print("DPO Config:")
for label, setting in config_summary:
    print(f"  {label}: {setting}")

In [None]:
# Initialize DPO Trainer with the quantized base model, the preference datasets,
# the tokenizer, and the LoRA config passed as `peft_config`
trainer = DPOTrainer(
    model=model,
    args=dpo_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    peft_config=lora_config
)

print("\nDPO Trainer initialized!")
# print_trainable_parameters() prints its own summary and returns None, so it is
# called directly instead of being interpolated into an f-string (which showed "None").
print("Trainable parameters:")
trainer.model.print_trainable_parameters()

## 7. Train Model

In [None]:
# Run the DPO training loop, bracketing it with wall-clock timestamps
timestamp_format = '%Y-%m-%d %H:%M:%S'
divider = "=" * 50

print("Starting DPO training...")
started_at = datetime.now().strftime(timestamp_format)
print(f"Start time: {started_at}")
print(divider)

# Keep the returned result object; a later cell reads its step count and loss
train_result = trainer.train()

print(divider)
finished_at = datetime.now().strftime(timestamp_format)
print(f"End time: {finished_at}")
print("Training complete!")

In [None]:
# Summarize the finished training run
print("\nTraining Results:")
print(f"  Total steps: {train_result.global_step}")
print(f"  Training loss: {train_result.training_loss:.4f}")

# Run evaluation and print every returned metric
print("\nRunning evaluation...")
eval_results = trainer.evaluate()
print("\nEvaluation Results:")
for metric_name, metric_value in eval_results.items():
    if isinstance(metric_value, float):
        # Floats are rounded to four decimals; everything else is shown verbatim
        print(f"  {metric_name}: {metric_value:.4f}")
    else:
        print(f"  {metric_name}: {metric_value}")

## 8. Save LoRA Adapter

In [None]:
# Save the LoRA adapter weights
print(f"Saving LoRA adapter to {CONFIG['adapter_dir']}...")
trainer.save_model(CONFIG["adapter_dir"])
tokenizer.save_pretrained(CONFIG["adapter_dir"])

print(f"Adapter saved!")
!ls -la {CONFIG["adapter_dir"]}

## 9. Merge LoRA Weights with Base Model

In [None]:
# Clear GPU memory before merging.
# Dropping the names via globals().pop (instead of `del`) keeps the cell re-runnable:
# a second execution no longer raises NameError.
for _name in ("model", "trainer"):
    globals().pop(_name, None)

# Collect garbage first so objects kept alive only by reference cycles are freed,
# then release the now-unused cached blocks back to the CUDA driver.
gc.collect()
torch.cuda.empty_cache()

print("Cleared GPU memory")
if torch.cuda.is_available():
    print(f"Memory used: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

In [None]:
def merge_lora_weights(base_model_name: str, adapter_path: str, output_path: str):
    """Fold a saved LoRA adapter into its base model and write the result to disk.

    Args:
        base_model_name: Model identifier passed to `AutoModelForCausalLM.from_pretrained`.
        adapter_path: Directory holding the saved adapter and tokenizer files.
        output_path: Directory that receives the merged model and the tokenizer.

    Returns:
        The merged model produced by `merge_and_unload()`.
    """
    print(f"Loading base model: {base_model_name}")

    # The base weights are loaded unquantized in fp16 for the merge
    fp16_base = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    print(f"Loading LoRA adapter: {adapter_path}")
    adapted = PeftModel.from_pretrained(fp16_base, adapter_path)

    print("Merging weights...")
    merged = adapted.merge_and_unload()

    print(f"Saving merged model to {output_path}")
    merged.save_pretrained(output_path)

    # Copy the tokenizer from the adapter directory alongside the merged weights
    AutoTokenizer.from_pretrained(adapter_path).save_pretrained(output_path)

    print("Merge complete!")
    return merged

# Merge the trained adapter into the base model using the configured paths
merged_model = merge_lora_weights(
    base_model_name=CONFIG["base_model"],
    adapter_path=CONFIG["adapter_dir"],
    output_path=CONFIG["merged_dir"]
)

In [None]:
# Check merged model files: list the merged-model directory with human-readable
# sizes as a quick sanity check that the save in the previous cell produced output
!ls -lh {CONFIG["merged_dir"]}

## 10. Convert to GGUF Format

In [None]:
# Clear memory before GGUF conversion.
# globals().pop (instead of `del`) keeps the cell re-runnable without a NameError.
globals().pop("merged_model", None)

# Collect garbage first so cyclically-referenced tensors are actually freed,
# then hand the unused cached blocks back to the CUDA driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Clone llama.cpp for GGUF conversion
!git clone https://github.com/ggerganov/llama.cpp.git
!pip install -q -r llama.cpp/requirements.txt

In [None]:
# Convert to GGUF format (FP16 first).
# Runs llama.cpp's convert_hf_to_gguf.py on the merged-model directory and writes an
# f16 GGUF file to outputs/model-fp16.gguf; a later cell quantizes that file.
print("Converting to GGUF format...")
!python llama.cpp/convert_hf_to_gguf.py {CONFIG["merged_dir"]} --outfile outputs/model-fp16.gguf --outtype f16

In [None]:
# Build llama.cpp quantization tool
!cd llama.cpp && make -j4 llama-quantize

In [None]:
# Quantize to Q4_K_M.
# Feeds the f16 GGUF file to the llama-quantize binary; both the output path and the
# quantization type come from CONFIG ("gguf_output" and "quantization_type").
print(f"Quantizing to {CONFIG['quantization_type']}...")
!./llama.cpp/llama-quantize outputs/model-fp16.gguf {CONFIG["gguf_output"]} {CONFIG["quantization_type"]}

print("\nQuantization complete!")
# Show the size of the quantized file
!ls -lh {CONFIG["gguf_output"]}

## 11. Verify Model

In [None]:
# Quick test with llama.cpp
!cd llama.cpp && make -j4 llama-cli

In [None]:
# Test inference: run a single generation with the quantized GGUF model.
# The prompt is written out by hand with header and end-of-turn special tokens
# around one user message, ending where the assistant's reply should begin.
# NOTE(review): llama-cli may add its own begin-of-text token, which would duplicate
# the one in the prompt — confirm against the llama.cpp version that gets cloned.
# NOTE(review): some llama-cli versions may start in interactive conversation mode
# and wait for input; verify whether an extra flag is needed for a one-shot run.
test_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nI've been feeling anxious lately and can't sleep well.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

# -n 200 caps the number of generated tokens; --temp 0.7 sets the sampling temperature
!./llama.cpp/llama-cli -m {CONFIG["gguf_output"]} -p "{test_prompt}" -n 200 --temp 0.7

## 12. Download Model

In [None]:
# Option A: Download directly (may be slow for large files)
# Uses the Colab-only `files` helper to download the quantized GGUF file at
# CONFIG["gguf_output"]; this cell only works inside Google Colab.
from google.colab import files
files.download(CONFIG["gguf_output"])

In [None]:
# Option B: Save to Google Drive (recommended)
# from google.colab import drive
# drive.mount('/content/drive')
# !cp {CONFIG["gguf_output"]} /content/drive/MyDrive/
# print(f"Model saved to Google Drive!")

## 13. Training Summary

In [None]:
# Print a recap of the run: key hyperparameters, output locations and next steps
rule = "=" * 60
summary_lines = [
    rule,
    "DPO TRAINING COMPLETE",
    rule,
    f"\nBase Model: {CONFIG['base_model']}",
    f"Training Epochs: {CONFIG['num_epochs']}",
    f"DPO Beta: {CONFIG['beta']}",
    f"LoRA r: {CONFIG['lora_r']}, alpha: {CONFIG['lora_alpha']}",
    "\nOutput Files:",
    f"  LoRA Adapter: {CONFIG['adapter_dir']}",
    f"  Merged Model: {CONFIG['merged_dir']}",
    f"  GGUF Model: {CONFIG['gguf_output']}",
    rule,
    "\nNext Steps:",
    "1. Download the GGUF model to your local machine",
    "2. Set LOCAL_MODEL_PATH environment variable",
    "3. Run with LLM_TYPE=LOCAL to test",
    rule,
]
# One print per original line, joined with newlines, yields identical output
print("\n".join(summary_lines))

## Cleanup

In [None]:
# Clean up intermediate files (optional)
# Uncomment any of the lines below to delete the f16 GGUF file, the llama.cpp
# checkout, or the merged-model directory once the quantized model has been saved.
# !rm -rf outputs/model-fp16.gguf
# !rm -rf llama.cpp
# !rm -rf {CONFIG["merged_dir"]}

# NOTE(review): this message is printed even when every removal above is still
# commented out, i.e. when nothing was actually deleted.
print("Cleanup complete!")