#  Exercise 3: Optimizing RPO Training with α Parameter Tuning 🎛️
## 📘 Prerequisites
* Exercise 1: Smart dataset sampling for quality
* Exercise 2: Resource-efficient training strategies
## 🎯 The Challenge
The RPO (Regularized Preference Optimization) α parameter critically affects:

* Training stability
* The balance between preference learning and instruction fine-tuning (the NLL term)
* Base model knowledge preservation and task alignment

### Install dependencies

In [None]:
# Install the workshop's Python dependencies listed in requirements.txt.
! pip install -r requirements.txt
# flash-attn is pinned to 2.7.3; --no-build-isolation makes pip build it against
# the packages already installed in this environment rather than an isolated one.
# NOTE(review): FlashAttention-2 officially targets Ampere-or-newer GPUs — confirm
# it is usable on the T4 accelerators this notebook recommends.
! pip install flash-attn==2.7.3 --no-build-isolation

### Testing GPU
Please check that Python recognizes the GPU allocated to you. If it does not, go to `Settings` > `Accelerator` > `GPU T4 x 2`.

In [7]:
import os
import sys

# Optional GPU sanity check through TensorFlow (left disabled):
#   from tensorflow.python.client import device_lib
#   device_lib.list_local_devices()

# Make the workshop's `src` directory importable. Take the part of the current
# working directory that precedes the repository folder name, drop the final
# character of that prefix (the path separator when the notebook runs from
# inside the repository) and re-attach "<repository>/src".
_path_before_repo = os.getcwd().partition('labs_AMLD25_Workshop')[0]
repo_folder = _path_before_repo[:-1] + "/labs_AMLD25_Workshop/src"
sys.path.append(repo_folder)

If you have two GPUs, you can assign them manually using environment variables. This step is optional, since PyTorch should recognize them automatically.

In [8]:
# Process-wide switches, set through environment variables before the heavy
# libraries are imported in the following cell.
os.environ.update(
    {
        "WANDB_DISABLED": "true",  # turn off Weights & Biases logging
        "CUDA_VISIBLE_DEVICES": "0,1",  # expose GPU 0 and GPU 1 to CUDA
    }
)

In [9]:
import torch

from typing import Optional, List, Dict
import datasets
from datasets import (
    load_dataset, 
    load_from_disk, 
    DatasetDict,
    concatenate_datasets
)

from accelerate import Accelerator, PartialState
from transformers import AutoModelForCausalLM, AutoTokenizer

from trl import (
    ModelConfig,
    DPOTrainer,
    DPOConfig,
    TrlParser,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
)

from trlabs.rl.data import (
    get_datasets, 
    DataArguments
)

from trlabs.utils import *

from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE


### Model Config

In [10]:
# LoRA adapter hyper-parameters, grouped so they can be tuned in one place.
# NOTE(review): assuming these keys reach a PEFT LoraConfig unchanged, the
# adapter update is scaled by lora_alpha / lora_r (32 / 64 = 0.5 here) — confirm.
_lora_settings = {
    "lora_r": 64,  # adapter rank
    "lora_alpha": 32,  # scaling numerator (see note above)
    "lora_dropout": 0.1,  # dropout on the adapter path, to limit overfitting
}

# Base checkpoint and fine-tuning mode handed to the training call.
model_config = {
    "model_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
    "torch_dtype": "bfloat16",
    "use_peft": True,
    **_lora_settings,
}

### Data Config

In [11]:
# Dataset selection for this exercise.
data_params = {
    "dataset_name": "Mix 1",
    # Dataset (Hub id or local path) -> weight.
    # NOTE(review): presumably the weight is the fraction of that dataset to
    # use — confirm in trlabs.rl.data.
    "dataset_mixer": {
        # To keep the run short, only the workshop's own preference collection
        # is used, which isolates the effect of the RPO objective and its
        # hyper-parameter. Uncomment to blend in a slice of UltraFeedback:
        # "trl-lib/ultrafeedback_binarized": 0.02,
        "./data/AMLD25_reuters_gentitle_1k": 1.0,
    },
    "dataset_splits": ["train", "test"],
    "num_eval_samples": 100,
    "seed": 42,
}

### Training Config

In [12]:
# Hyper-parameters for the preference-optimisation run.
# NOTE(review): presumably mapped onto trl's DPOConfig — confirm in trlabs.rl.train.
training_params = {
    ## RPO
    # Setting rpo_alpha activates the RPO loss; alpha multiplies the NLL term.
    "rpo_alpha": 0.5,
    ## General
    # Output folder is named after the part of the model id before "/",
    # lower-cased (e.g. "qwen_ex3_output" for "Qwen/...").
    "output_dir": f"{model_config['model_name_or_path'].split('/')[0].lower()}_ex3_output",
    "num_train_epochs": 1,
    "beta": 0.1,
    "eval_strategy": "steps",
    "eval_steps": 8,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 8,
    ## Sequence lengths
    # max_length covers prompt + completion, so the completion budget is
    # max_length - max_prompt_length tokens.
    "max_length": 768,
    "max_prompt_length": 512,
    ## Optimizer
    "optim": "adamw_torch",
    "learning_rate": 2e-4,
    "weight_decay": 0.001,
    "adam_epsilon": 1e-8,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "max_grad_norm": 1.0,
    ## Scheduler
    "warmup_steps": 10,
    "lr_scheduler_type": "cosine",
    ## Logging
    "log_level": "info",
    "logging_first_step": True,
    "logging_steps": 10,
}

### RPO Training Loop

In [18]:
# Training entry point provided by the workshop package.
from trlabs.rl.train import dpo

# Launch the run with the data, training and model config dicts defined in the
# cells above. With "rpo_alpha" set in training_params the RPO loss is active.
# NOTE(review): presumably this wraps trl's DPOTrainer — confirm in trlabs.rl.train.
dpo(data_params, training_params, model_config)

## Your turn!
Play with `rpo_alpha` to find the best balance between the two loss terms (the preference loss and the NLL loss).

## Take a Look at the Model Generation

In [15]:
from trlabs.utils import dataset_creation, not_relevant_data

# Prompt pieces passed to `dataset_creation` for every example.
SYSTEM_PROMPT = (
    "You are an advanced AI system specialised in providing Reuters News "
    "title given a body text of the news."
)
INSTRUCTION = (
    "The title should be in capital letters and between 6 and 8 words in "
    "length. Please provide only the title as output and no other text or "
    "explanation."
)

# Build the evaluation data step by step: load the ModApte configuration of
# Reuters-21578, keep only the rows accepted by `not_relevant_data`, shuffle
# with a fixed seed, then map every row through `dataset_creation`.
dataset = load_dataset("ucirvine/reuters21578", 'ModApte', trust_remote_code=True)
dataset = dataset.filter(not_relevant_data)
dataset = dataset.shuffle(seed=42)
dataset = dataset.map(
    dataset_creation,
    fn_kwargs={"system_prompt": SYSTEM_PROMPT, "instruction": INSTRUCTION},
)

In [19]:
from trlabs.rl.eval import generate, setup_model_and_lora

# Choose one test example and build its prompt from the "system" and
# "messages" fields.
index = 10
test_example = dataset["test"][index]
prompt = test_example["system"] + test_example["messages"]

# Load the base model together with the LoRA weights saved by the training run.
model, tokenizer = setup_model_and_lora(
    base_model_name=model_config["model_name_or_path"],
    lora_path=training_params["output_dir"],
)

# Generate a title for the chosen article and display it.
response = generate(prompt, model, tokenizer)
print(response)

#### Note:
If you do not provide `lora_path`, you can inspect the base model's output instead.