In [1]:
import warnings

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser
from peft import LoraConfig, TaskType
from trl import ModelConfig, RewardConfig, RewardTrainer, get_kbit_device_map, get_peft_config, get_quantization_config


# Register tqdm's pandas integration (adds the `progress_apply` family of methods).
# NOTE(review): no pandas call is visible in this notebook — confirm this is still needed.
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# python reward_modeling.py \
#     --model_name_or_path=facebook/opt-350m \
#     --output_dir="reward_modeling_anthropic_hh" \
#     --per_device_train_batch_size=64 \
#     --num_train_epochs=1 \
#     --gradient_accumulation_steps=16 \
#     --gradient_checkpointing=True \
#     --learning_rate=1.41e-5 \
#     --report_to="wandb" \
#     --remove_unused_columns=False \
#     --optim="adamw_torch" \
#     --logging_steps=10 \
#     --evaluation_strategy="steps" \
#     --max_length=512

In [2]:
# Base model and PEFT settings (notebook equivalent of the CLI flags of reward_modeling.py).
model_config = ModelConfig(
    model_name_or_path="facebook/opt-350m",
    use_peft=True,
    lora_task_type=TaskType.SEQ_CLS,
    attn_implementation=None,  # alternatively "flash_attention_2"
)

# Training hyper-parameters for the reward model.
reward_config = RewardConfig(
    # Where checkpoints and the final model are written.
    output_dir="reward_modeling_anthropic_hh",
    # Optimisation.
    optim="adamw_torch",
    learning_rate=1.41e-5,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    # Data handling: the tokenized chosen/rejected columns must not be dropped,
    # and max_length is also the cut-off used when filtering the dataset below.
    remove_unused_columns=False,
    max_length=512,
    # Logging and evaluation.
    logging_steps=10,
    evaluation_strategy="steps",
    report_to="wandb",
)

# peft_config = LoraConfig(
#     task_type=TaskType.SEQ_CLS,
#     inference_mode=False,
#     r=8,
#     lora_alpha=32,
#     lora_dropout=0.1,
# )

In [3]:
# if __name__ == "__main__":
# parser = HfArgumentParser((reward_config, model_config))
# reward_config, model_config = parser.parse_args_into_dataclasses()
# Select the non-reentrant implementation for gradient checkpointing
# (gradient_checkpointing=True is set on reward_config).
reward_config.gradient_checkpointing_kwargs = {"use_reentrant": False}

In [4]:
################
# Model & Tokenizer
################
# Resolve the configured dtype: "auto" and None are forwarded unchanged, any other
# value is looked up as an attribute of the torch module (e.g. "bfloat16" -> torch.bfloat16).
torch_dtype = (
    model_config.torch_dtype
    if model_config.torch_dtype in ["auto", None]
    else getattr(torch, model_config.torch_dtype)
)
quantization_config = get_quantization_config(model_config)
model_kwargs = dict(
    revision=model_config.model_revision,
    trust_remote_code=model_config.trust_remote_code,
    # Forward the resolved dtype; previously it was computed but never passed to
    # `from_pretrained`, so a configured `torch_dtype` was silently ignored.
    torch_dtype=torch_dtype,
    # A k-bit device map is only requested when a quantization config is in use.
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path, use_fast=True)
# num_labels=1: the classification head produces a single output per sequence.
model = AutoModelForSequenceClassification.from_pretrained(
    model_config.model_name_or_path, num_labels=1, **model_kwargs
)

# Warn when the LoRA task type is anything other than SEQ_CLS.
if model_config.lora_task_type != "SEQ_CLS":
    warnings.warn(
        "You are using a `task_type` that is different than `SEQ_CLS` for PEFT. This will lead to silent bugs."
        " Make sure to pass --lora_task_type SEQ_CLS when using this script."
    )

# optimizer1 = torch.optim.SGD(model.parameters(), lr=1.41e-5)
# lr_scheduler1 = torch.optim.lr_scheduler.ExponentialLR(optimizer1, gamma=0.9)

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
################
# Dataset
################
raw_datasets = load_dataset("Anthropic/hh-rlhf")


# Tokenize chosen/rejected pairs of inputs.
# Adapt this section to your needs for custom datasets.
def preprocess_function(examples):
    """Tokenize every chosen/rejected pair of a batch.

    Args:
        examples: A batch with parallel "chosen" and "rejected" sequences of text.

    Returns:
        A dict with the keys "input_ids_chosen", "attention_mask_chosen",
        "input_ids_rejected" and "attention_mask_rejected", each holding one
        entry per input pair, in input order.
    """
    encoded_pairs = [
        (tokenizer(chosen_text), tokenizer(rejected_text))
        for chosen_text, rejected_text in zip(examples["chosen"], examples["rejected"])
    ]
    return {
        "input_ids_chosen": [enc_chosen["input_ids"] for enc_chosen, _ in encoded_pairs],
        "attention_mask_chosen": [enc_chosen["attention_mask"] for enc_chosen, _ in encoded_pairs],
        "input_ids_rejected": [enc_rejected["input_ids"] for _, enc_rejected in encoded_pairs],
        "attention_mask_rejected": [enc_rejected["attention_mask"] for _, enc_rejected in encoded_pairs],
    }


# Tokenize all splits, then drop every pair in which either side is longer than
# reward_config.max_length tokens.
raw_datasets = raw_datasets.map(preprocess_function, batched=True, num_proc=4)
raw_datasets = raw_datasets.filter(
    lambda x: max(len(x["input_ids_chosen"]), len(x["input_ids_rejected"])) <= reward_config.max_length
)
train_dataset, eval_dataset = raw_datasets["train"], raw_datasets["test"]

Filter: 100%|██████████| 160800/160800 [00:59<00:00, 2718.79 examples/s]
Filter: 100%|██████████| 8552/8552 [00:03<00:00, 2621.38 examples/s]


In [1]:
# for k,v in train_dataset[0].items():
#     print(f'{k}:{v}')

In [2]:
# ################
# # Training
# ################
# trainer = RewardTrainer(model=model,
#                         tokenizer=tokenizer,
#                         args=reward_config,
#                         train_dataset=train_dataset,
#                         eval_dataset=eval_dataset,
#                         peft_config=get_peft_config(model_config)
#                         )
# trainer.train()
# trainer.save_model(reward_config.output_dir)

In [20]:
# # model = AutoModelForSequenceClassification.from_pretrained("gpt2")
# peft_config = LoraConfig(
#     task_type=TaskType.SEQ_CLS,
#     inference_mode=False,
#     r=8,
#     lora_alpha=32,
#     lora_dropout=0.1,
# )

In [6]:
# Build the reward trainer from the objects prepared above. The PEFT configuration
# is derived from model_config via get_peft_config.
# Keyword arguments tried during experimentation and currently left unset:
# model_init, compute_metrics, optimizers=(optimizer1, lr_scheduler1), max_length.
trainer = RewardTrainer(
    model=model,
    tokenizer=tokenizer,
    args=reward_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=get_peft_config(model_config),
)

In [9]:
# type(optimizer1), type(lr_scheduler1)

(torch.optim.sgd.SGD, torch.optim.lr_scheduler.ExponentialLR)

In [33]:
# from transformers import AdamW 

# optimizer = AdamW(model.parameters(), lr=1e-5)  # Example with AdamW

# from transformers import get_linear_schedule_with_warmup

# num_training_steps = len(train_dataset) * 1  # Calculate based on your data
# lr_scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=0.1 * num_training_steps,  # Example 10% warmup
#     num_training_steps=num_training_steps
# )



In [4]:
# RewardTrainer??